unplugged-system/external/cronet/base/i18n/break_iterator_unittest.cc

634 lines
22 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Copyright 2011 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/i18n/break_iterator.h"
#include <stddef.h>
#include <vector>
#include "base/ranges/algorithm.h"
#include "base/strings/string_piece.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "build/build_config.h"
#include "testing/gtest/include/gtest/gtest.h"
namespace base {
namespace i18n {
TEST(BreakIteratorTest, BreakWordEmpty) {
std::u16string empty;
BreakIterator iter(empty, BreakIterator::BREAK_WORD);
ASSERT_TRUE(iter.Init());
EXPECT_FALSE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
EXPECT_FALSE(iter.IsWord());
}
TEST(BreakIteratorTest, BreakWord) {
std::u16string space(u" ");
std::u16string str(u" foo bar! \npouet boom");
BreakIterator iter(str, BreakIterator::BREAK_WORD);
ASSERT_TRUE(iter.Init());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(space, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_TRUE(iter.IsWord());
EXPECT_EQ(u"foo", iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(space, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_TRUE(iter.IsWord());
EXPECT_EQ(u"bar", iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(u"!", iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(space, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(u"\n", iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_TRUE(iter.IsWord());
EXPECT_EQ(u"pouet", iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(space, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_TRUE(iter.IsWord());
EXPECT_EQ(u"boom", iter.GetString());
EXPECT_FALSE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
EXPECT_FALSE(iter.IsWord());
}
TEST(BreakIteratorTest, BreakWordWide16) {
// Two greek words separated by space.
const std::u16string str(
u"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
u"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2");
const std::u16string word1(str.substr(0, 10));
const std::u16string word2(str.substr(11, 5));
BreakIterator iter(str, BreakIterator::BREAK_WORD);
ASSERT_TRUE(iter.Init());
EXPECT_TRUE(iter.Advance());
EXPECT_TRUE(iter.IsWord());
EXPECT_EQ(word1, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(u" ", iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_TRUE(iter.IsWord());
EXPECT_EQ(word2, iter.GetString());
EXPECT_FALSE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
EXPECT_FALSE(iter.IsWord());
}
TEST(BreakIteratorTest, BreakWordWide32) {
const std::u16string str = u"\U0001d49c a";
const std::u16string very_wide_word(str.substr(0, 2));
BreakIterator iter(str, BreakIterator::BREAK_WORD);
ASSERT_TRUE(iter.Init());
EXPECT_TRUE(iter.Advance());
EXPECT_TRUE(iter.IsWord());
EXPECT_EQ(very_wide_word, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(u" ", iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_TRUE(iter.IsWord());
EXPECT_EQ(u"a", iter.GetString());
EXPECT_FALSE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
EXPECT_FALSE(iter.IsWord());
}
TEST(BreakIteratorTest, BreakWordThai) {
// Terms in Thai, without spaces in between.
const char16_t term1[] = u"พิมพ์";
const char16_t term2[] = u"น้อย";
const char16_t term3[] = u"ลง";
const std::u16string str(base::JoinString({term1, term2, term3}, u""));
BreakIterator iter(str, BreakIterator::BREAK_WORD);
ASSERT_TRUE(iter.Init());
EXPECT_TRUE(iter.Advance());
EXPECT_TRUE(iter.IsWord());
EXPECT_EQ(term1, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_TRUE(iter.IsWord());
EXPECT_EQ(term2, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_TRUE(iter.IsWord());
EXPECT_EQ(term3, iter.GetString());
EXPECT_FALSE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
}
// In some languages, the words are not broken by spaces. ICU provides a huge
// dictionary to detect word boundaries in Thai, Chinese, Japanese, Burmese,
// and Khmer. Due to the size of such a table, the part for Chinese and
// Japanese is not shipped on mobile.
#if !(BUILDFLAG(IS_IOS) || BUILDFLAG(IS_ANDROID))
TEST(BreakIteratorTest, BreakWordChinese) {
// Terms in Traditional Chinese, without spaces in between.
const char16_t term1[] = u"瀏覽";
const char16_t term2[] = u"速度";
const char16_t term3[] = u"飛快";
const std::u16string str(base::JoinString({term1, term2, term3}, u""));
BreakIterator iter(str, BreakIterator::BREAK_WORD);
ASSERT_TRUE(iter.Init());
EXPECT_TRUE(iter.Advance());
EXPECT_TRUE(iter.IsWord());
EXPECT_EQ(term1, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_TRUE(iter.IsWord());
EXPECT_EQ(term2, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_TRUE(iter.IsWord());
EXPECT_EQ(term3, iter.GetString());
EXPECT_FALSE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
}
TEST(BreakIteratorTest, BreakWordJapanese) {
// Terms in Japanese, without spaces in between.
const char16_t term1[] = u"モバイル";
const char16_t term2[] = u"でも";
const std::u16string str(base::JoinString({term1, term2}, u""));
BreakIterator iter(str, BreakIterator::BREAK_WORD);
ASSERT_TRUE(iter.Init());
EXPECT_TRUE(iter.Advance());
EXPECT_TRUE(iter.IsWord());
EXPECT_EQ(term1, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_TRUE(iter.IsWord());
EXPECT_EQ(term2, iter.GetString());
EXPECT_FALSE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
}
TEST(BreakIteratorTest, BreakWordChineseEnglish) {
// Terms in Simplified Chinese mixed with English and wide punctuations.
std::u16string space(u" ");
const char16_t token1[] = u"下载";
const char16_t token2[] = u"Chrome";
const char16_t token3[] = u"";
const char16_t token4[] = u"Mac";
const char16_t token5[] = u"";
const char16_t token6[] = u"";
const std::u16string str(base::JoinString(
{token1, u" ", token2, token3, token4, u" ", token5, token6}, u""));
BreakIterator iter(str, BreakIterator::BREAK_WORD);
ASSERT_TRUE(iter.Init());
EXPECT_TRUE(iter.Advance());
EXPECT_TRUE(iter.IsWord());
EXPECT_EQ(token1, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(space, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_TRUE(iter.IsWord());
EXPECT_EQ(token2, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(token3, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_TRUE(iter.IsWord());
EXPECT_EQ(token4, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(space, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_TRUE(iter.IsWord());
EXPECT_EQ(token5, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(token6, iter.GetString());
EXPECT_FALSE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
}
#endif // !(BUILDFLAG(IS_IOS) || BUILDFLAG(IS_ANDROID))
TEST(BreakIteratorTest, BreakSpaceEmpty) {
std::u16string empty;
BreakIterator iter(empty, BreakIterator::BREAK_SPACE);
ASSERT_TRUE(iter.Init());
EXPECT_FALSE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
EXPECT_FALSE(iter.IsWord());
}
TEST(BreakIteratorTest, BreakSpace) {
std::u16string str(u" foo bar! \npouet boom");
BreakIterator iter(str, BreakIterator::BREAK_SPACE);
ASSERT_TRUE(iter.Init());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(u" ", iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(u"foo ", iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(u"bar! \n", iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(u"pouet ", iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(u"boom", iter.GetString());
EXPECT_FALSE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
EXPECT_FALSE(iter.IsWord());
}
TEST(BreakIteratorTest, BreakSpaceSP) {
std::u16string str(u" foo bar! \npouet boom ");
BreakIterator iter(str, BreakIterator::BREAK_SPACE);
ASSERT_TRUE(iter.Init());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(u" ", iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(u"foo ", iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(u"bar! \n", iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(u"pouet ", iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(u"boom ", iter.GetString());
EXPECT_FALSE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
EXPECT_FALSE(iter.IsWord());
}
TEST(BreakIteratorTest, BreakSpacekWide16) {
// Two Greek words.
const std::u16string str(
u"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
u"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2");
const std::u16string word1(str.substr(0, 11));
const std::u16string word2(str.substr(11, 5));
BreakIterator iter(str, BreakIterator::BREAK_SPACE);
ASSERT_TRUE(iter.Init());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(word1, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(word2, iter.GetString());
EXPECT_FALSE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
EXPECT_FALSE(iter.IsWord());
}
TEST(BreakIteratorTest, BreakSpaceWide32) {
const std::u16string str = u"\U0001d49c a";
const std::u16string very_wide_word(str.substr(0, 3));
BreakIterator iter(str, BreakIterator::BREAK_SPACE);
ASSERT_TRUE(iter.Init());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(very_wide_word, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(u"a", iter.GetString());
EXPECT_FALSE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
EXPECT_FALSE(iter.IsWord());
}
TEST(BreakIteratorTest, BreakLineEmpty) {
std::u16string empty;
BreakIterator iter(empty, BreakIterator::BREAK_NEWLINE);
ASSERT_TRUE(iter.Init());
EXPECT_FALSE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
EXPECT_FALSE(iter.IsWord());
}
TEST(BreakIteratorTest, BreakLine) {
std::u16string nl(u"\n");
std::u16string str(u"\nfoo bar!\n\npouet boom");
BreakIterator iter(str, BreakIterator::BREAK_NEWLINE);
ASSERT_TRUE(iter.Init());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(nl, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(u"foo bar!\n", iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(nl, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(u"pouet boom", iter.GetString());
EXPECT_FALSE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
EXPECT_FALSE(iter.IsWord());
}
TEST(BreakIteratorTest, BreakSentence) {
std::u16string nl(u"\n");
std::u16string str(
u"\nFoo bar!\nOne sentence.\n\n\tAnother sentence?One more thing");
BreakIterator iter(str, BreakIterator::BREAK_SENTENCE);
ASSERT_TRUE(iter.Init());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(nl, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(u"Foo bar!\n", iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(u"One sentence.\n", iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(nl, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(u"\tAnother sentence?", iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(u"One more thing", iter.GetString());
EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
EXPECT_FALSE(iter.IsWord());
}
TEST(BreakIteratorTest, IsSentenceBoundary) {
std::u16string str(
u"Foo bar!\nOne sentence.\n\n\tAnother sentence?One more thing");
BreakIterator iter(str, BreakIterator::BREAK_SENTENCE);
ASSERT_TRUE(iter.Init());
std::vector<size_t> sentence_breaks;
sentence_breaks.push_back(0);
sentence_breaks.push_back(9);
sentence_breaks.push_back(23);
sentence_breaks.push_back(24);
sentence_breaks.push_back(42);
for (size_t i = 0; i < str.size(); i++) {
if (ranges::find(sentence_breaks, i) != sentence_breaks.end()) {
EXPECT_TRUE(iter.IsSentenceBoundary(i)) << " at index=" << i;
} else {
EXPECT_FALSE(iter.IsSentenceBoundary(i)) << " at index=" << i;
}
}
}
TEST(BreakIteratorTest, BreakLineNL) {
std::u16string nl(u"\n");
std::u16string str(u"\nfoo bar!\n\npouet boom\n");
BreakIterator iter(str, BreakIterator::BREAK_NEWLINE);
ASSERT_TRUE(iter.Init());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(nl, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(u"foo bar!\n", iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(nl, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(u"pouet boom\n", iter.GetString());
EXPECT_FALSE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
EXPECT_FALSE(iter.IsWord());
}
TEST(BreakIteratorTest, BreakLineWide16) {
// Two Greek words separated by newline.
const std::u16string str(
u"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
u"\x03bf\x03c2\x000a\x0399\x03c3\x03c4\x03cc\x03c2");
const std::u16string line1(str.substr(0, 11));
const std::u16string line2(str.substr(11, 5));
BreakIterator iter(str, BreakIterator::BREAK_NEWLINE);
ASSERT_TRUE(iter.Init());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(line1, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(line2, iter.GetString());
EXPECT_FALSE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
EXPECT_FALSE(iter.IsWord());
}
TEST(BreakIteratorTest, BreakLineWide32) {
const std::u16string str = u"\U0001d49c\na";
const std::u16string very_wide_line(str.substr(0, 3));
BreakIterator iter(str, BreakIterator::BREAK_NEWLINE);
ASSERT_TRUE(iter.Init());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(very_wide_line, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(u"a", iter.GetString());
EXPECT_FALSE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
EXPECT_FALSE(iter.IsWord());
}
TEST(BreakIteratorTest, BreakCharacter) {
static const char16_t* const kCharacters[] = {
// An English word consisting of four ASCII characters.
u"w",
u"o",
u"r",
u"d",
u" ",
// A Hindi word (which means "Hindi") consisting of two Devanagari
// grapheme clusters.
u"हि",
u"न्दी",
u" ",
// A Thai word (which means "feel") consisting of three Thai grapheme
// clusters.
u"รู้",
u"สึ",
u"",
u" ",
};
std::vector<std::u16string> characters;
std::u16string text;
for (const auto* i : kCharacters) {
characters.push_back(i);
text.append(characters.back());
}
BreakIterator iter(text, BreakIterator::BREAK_CHARACTER);
ASSERT_TRUE(iter.Init());
for (size_t i = 0; i < std::size(kCharacters); ++i) {
EXPECT_TRUE(iter.Advance());
EXPECT_EQ(characters[i], iter.GetString());
}
}
// Test for https://code.google.com/p/chromium/issues/detail?id=411213
// We should be able to get valid substrings with GetString() function
// after setting new content by calling SetText().
TEST(BreakIteratorTest, GetStringAfterSetText) {
const std::u16string initial_string(u"str");
BreakIterator iter(initial_string, BreakIterator::BREAK_WORD);
ASSERT_TRUE(iter.Init());
const std::u16string long_string(u"another,string");
EXPECT_TRUE(iter.SetText(long_string.c_str(), long_string.size()));
EXPECT_TRUE(iter.Advance());
EXPECT_TRUE(iter.Advance()); // Advance to ',' in |long_string|
// Check that the current position is out of bounds of the |initial_string|.
EXPECT_LT(initial_string.size(), iter.pos());
// Check that we can get a valid substring of |long_string|.
EXPECT_EQ(u",", iter.GetString());
}
TEST(BreakIteratorTest, GetStringPiece) {
const std::u16string initial_string(u"some string");
BreakIterator iter(initial_string, BreakIterator::BREAK_WORD);
ASSERT_TRUE(iter.Init());
EXPECT_TRUE(iter.Advance());
EXPECT_EQ(iter.GetString(), iter.GetStringPiece());
EXPECT_EQ(StringPiece16(u"some"), iter.GetStringPiece());
EXPECT_TRUE(iter.Advance());
EXPECT_TRUE(iter.Advance());
EXPECT_EQ(iter.GetString(), iter.GetStringPiece());
EXPECT_EQ(StringPiece16(u"string"), iter.GetStringPiece());
}
// Make sure that when not in RULE_BASED or BREAK_WORD mode we're getting
// IS_LINE_OR_CHAR_BREAK.
TEST(BreakIteratorTest, GetWordBreakStatusBreakLine) {
// A string containing the English word "foo", followed by two Khmer
// characters, the English word "Can", and then two Russian characters and
// punctuation.
std::u16string text(u"foo \x1791\x17C1 \nCan \x041C\x0438...");
BreakIterator iter(text, BreakIterator::BREAK_LINE);
ASSERT_TRUE(iter.Init());
EXPECT_TRUE(iter.Advance());
// Finds "foo" and the space.
EXPECT_EQ(u"foo ", iter.GetString());
EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_LINE_OR_CHAR_BREAK);
EXPECT_TRUE(iter.Advance());
// Finds the Khmer characters, the next space, and the newline.
EXPECT_EQ(u"\x1791\x17C1 \n", iter.GetString());
EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_LINE_OR_CHAR_BREAK);
EXPECT_TRUE(iter.Advance());
// Finds "Can" and the space.
EXPECT_EQ(u"Can ", iter.GetString());
EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_LINE_OR_CHAR_BREAK);
EXPECT_TRUE(iter.Advance());
// Finds the Russian characters and periods.
EXPECT_EQ(u"\x041C\x0438...", iter.GetString());
EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_LINE_OR_CHAR_BREAK);
EXPECT_FALSE(iter.Advance());
}
// Make sure that in BREAK_WORD mode we're getting IS_WORD_BREAK and
// IS_SKIPPABLE_WORD when we should be. IS_WORD_BREAK should be returned when we
// finish going over non-punctuation characters while IS_SKIPPABLE_WORD should
// be returned on punctuation and spaces.
TEST(BreakIteratorTest, GetWordBreakStatusBreakWord) {
// A string containing the English word "foo", followed by two Khmer
// characters, the English word "Can", and then two Russian characters and
// punctuation.
std::u16string text(u"foo \x1791\x17C1 \nCan \x041C\x0438...");
BreakIterator iter(text, BreakIterator::BREAK_WORD);
ASSERT_TRUE(iter.Init());
EXPECT_TRUE(iter.Advance());
// Finds "foo".
EXPECT_EQ(u"foo", iter.GetString());
EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
EXPECT_TRUE(iter.Advance());
// Finds the space, and the Khmer characters.
EXPECT_EQ(u" ", iter.GetString());
EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
EXPECT_TRUE(iter.Advance());
EXPECT_EQ(u"\x1791\x17C1", iter.GetString());
EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
EXPECT_TRUE(iter.Advance());
// Finds the space and the newline.
EXPECT_EQ(u" ", iter.GetString());
EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
EXPECT_TRUE(iter.Advance());
EXPECT_EQ(u"\n", iter.GetString());
EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
EXPECT_TRUE(iter.Advance());
// Finds "Can".
EXPECT_EQ(u"Can", iter.GetString());
EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
EXPECT_TRUE(iter.Advance());
// Finds the space and the Russian characters.
EXPECT_EQ(u" ", iter.GetString());
EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
EXPECT_TRUE(iter.Advance());
EXPECT_EQ(u"\x041C\x0438", iter.GetString());
EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
EXPECT_TRUE(iter.Advance());
// Finds the trailing periods.
EXPECT_EQ(u".", iter.GetString());
EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
EXPECT_TRUE(iter.Advance());
EXPECT_EQ(u".", iter.GetString());
EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
EXPECT_TRUE(iter.Advance());
EXPECT_EQ(u".", iter.GetString());
EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
EXPECT_FALSE(iter.Advance());
}
} // namespace i18n
} // namespace base