// Copyright 2011 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
|
|
|
|
#include "base/i18n/string_search.h"
|
|
|
|
#include <stddef.h>
|
|
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
#include "base/i18n/rtl.h"
|
|
#include "base/strings/utf_string_conversions.h"
|
|
#include "testing/gtest/include/gtest/gtest.h"
|
|
#include "third_party/icu/source/i18n/unicode/usearch.h"
|
|
|
|
namespace base {
|
|
namespace i18n {
|
|
|
|
// Expects |find_this| to be found in |in_this| at offset |ex_start| with match
// length |ex_len|. The expectation is checked twice: once through
// StringSearchIgnoringCaseAndAccents() and once through
// StringSearch(..., false, true), the argument pair used by the
// case-insensitive forward expectations in this file. The out-values are reset
// to 0 between the two searches so the second search is verified on its own.
//
// Every macro argument is expanded twice, so pass side-effect-free
// expressions. The body is wrapped in do/while(0) so that the expansion plus
// the caller's trailing ';' forms exactly one statement, which keeps the macro
// safe inside an unbraced if/else.
#define EXPECT_MATCH_IGNORE_CASE(find_this, in_this, ex_start, ex_len) \
  do { \
    size_t index = 0; \
    size_t length = 0; \
    EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(find_this, in_this, &index, \
                                                   &length)); \
    EXPECT_EQ(ex_start, index); \
    EXPECT_EQ(ex_len, length); \
    index = 0; \
    length = 0; \
    EXPECT_TRUE( \
        StringSearch(find_this, in_this, &index, &length, false, true)); \
    EXPECT_EQ(ex_start, index); \
    EXPECT_EQ(ex_len, length); \
  } while (0)
|
|
|
|
// Expects StringSearch(..., true, true) -- the argument pair used by the
// case-sensitive forward expectations in this file -- to find |find_this| in
// |in_this| at offset |ex_start| with match length |ex_len|.
//
// Wrapped in do/while(0) so that the expansion plus the caller's trailing ';'
// forms exactly one statement, which keeps the macro safe inside an unbraced
// if/else.
#define EXPECT_MATCH_SENSITIVE(find_this, in_this, ex_start, ex_len) \
  do { \
    size_t index = 0; \
    size_t length = 0; \
    EXPECT_TRUE( \
        StringSearch(find_this, in_this, &index, &length, true, true)); \
    EXPECT_EQ(ex_start, index); \
    EXPECT_EQ(ex_len, length); \
  } while (0)
|
|
|
|
// Expects StringSearch(..., false, false) -- the argument pair used by the
// case-insensitive backward expectations in this file -- to find |find_this|
// in |in_this| at offset |ex_start| with match length |ex_len|.
//
// Wrapped in do/while(0) so that the expansion plus the caller's trailing ';'
// forms exactly one statement, which keeps the macro safe inside an unbraced
// if/else.
#define EXPECT_MATCH_IGNORE_CASE_BACKWARDS(find_this, in_this, ex_start, \
                                           ex_len) \
  do { \
    size_t index = 0; \
    size_t length = 0; \
    EXPECT_TRUE( \
        StringSearch(find_this, in_this, &index, &length, false, false)); \
    EXPECT_EQ(ex_start, index); \
    EXPECT_EQ(ex_len, length); \
  } while (0)
|
|
|
|
// Expects StringSearch(..., true, false) -- the argument pair used by the
// case-sensitive backward expectations in this file -- to find |find_this| in
// |in_this| at offset |ex_start| with match length |ex_len|.
//
// Wrapped in do/while(0) so that the expansion plus the caller's trailing ';'
// forms exactly one statement, which keeps the macro safe inside an unbraced
// if/else.
#define EXPECT_MATCH_SENSITIVE_BACKWARDS(find_this, in_this, ex_start, ex_len) \
  do { \
    size_t index = 0; \
    size_t length = 0; \
    EXPECT_TRUE( \
        StringSearch(find_this, in_this, &index, &length, true, false)); \
    EXPECT_EQ(ex_start, index); \
    EXPECT_EQ(ex_len, length); \
  } while (0)
|
|
|
|
// Expects |find_this| NOT to be found in |in_this|. The expectation is checked
// twice: once through StringSearchIgnoringCaseAndAccents() and once through
// StringSearch(..., false, true), the argument pair used by the
// case-insensitive forward expectations in this file.
//
// Both macro arguments are expanded twice, so pass side-effect-free
// expressions. The body is wrapped in do/while(0) so that the expansion plus
// the caller's trailing ';' forms exactly one statement, which keeps the macro
// safe inside an unbraced if/else.
#define EXPECT_MISS_IGNORE_CASE(find_this, in_this) \
  do { \
    size_t index = 0; \
    size_t length = 0; \
    EXPECT_FALSE(StringSearchIgnoringCaseAndAccents(find_this, in_this, \
                                                    &index, &length)); \
    index = 0; \
    length = 0; \
    EXPECT_FALSE( \
        StringSearch(find_this, in_this, &index, &length, false, true)); \
  } while (0)
|
|
|
|
// Expects StringSearch(..., true, true) -- the argument pair used by the
// case-sensitive forward expectations in this file -- NOT to find |find_this|
// in |in_this|.
//
// Wrapped in do/while(0) so that the expansion plus the caller's trailing ';'
// forms exactly one statement, which keeps the macro safe inside an unbraced
// if/else.
#define EXPECT_MISS_SENSITIVE(find_this, in_this) \
  do { \
    size_t index = 0; \
    size_t length = 0; \
    EXPECT_FALSE( \
        StringSearch(find_this, in_this, &index, &length, true, true)); \
  } while (0)
|
|
|
|
// Expects StringSearch(..., false, false) -- the argument pair used by the
// case-insensitive backward expectations in this file -- NOT to find
// |find_this| in |in_this|.
//
// Wrapped in do/while(0) so that the expansion plus the caller's trailing ';'
// forms exactly one statement, which keeps the macro safe inside an unbraced
// if/else.
#define EXPECT_MISS_IGNORE_CASE_BACKWARDS(find_this, in_this) \
  do { \
    size_t index = 0; \
    size_t length = 0; \
    EXPECT_FALSE( \
        StringSearch(find_this, in_this, &index, &length, false, false)); \
  } while (0)
|
|
|
|
// Expects StringSearch(..., true, false) -- the argument pair used by the
// case-sensitive backward expectations in this file -- NOT to find
// |find_this| in |in_this|.
//
// Wrapped in do/while(0) so that the expansion plus the caller's trailing ';'
// forms exactly one statement, which keeps the macro safe inside an unbraced
// if/else.
#define EXPECT_MISS_SENSITIVE_BACKWARDS(find_this, in_this) \
  do { \
    size_t index = 0; \
    size_t length = 0; \
    EXPECT_FALSE( \
        StringSearch(find_this, in_this, &index, &length, true, false)); \
  } while (0)
|
|
|
|
// Note on setting default locale for testing: The current default locale on
// the Mac trybot is en_US_POSIX, with which primary-level collation strength
// string search is case-sensitive, when normally it should be
// case-insensitive. In other locales (including en_US which English speakers
// in the U.S. use), this search would be case-insensitive as expected.
|
|
|
|
// Covers forward searches over plain ASCII input in both the case-insensitive
// and the case-sensitive mode, including empty patterns and empty texts.
TEST(StringSearchTest, ASCII) {
  // An en_US_POSIX default locale is swapped for en_US while the expectations
  // run and is put back at the end of the test.
  std::string default_locale(uloc_getDefault());
  bool locale_is_posix = (default_locale == "en_US_POSIX");
  if (locale_is_posix)
    SetICUDefaultLocale("en_US");

  EXPECT_MATCH_IGNORE_CASE(u"hello", u"hello world", 0U, 5U);

  // The pattern and the text differ only in the number of spaces after 'h'
  // (four vs. three). With byte-identical strings this miss expectation would
  // contradict itself, because a string is always found within itself.
  EXPECT_MISS_IGNORE_CASE(u"h    e l l o", u"h   e l l o");

  EXPECT_MATCH_IGNORE_CASE(u"aabaaa", u"aaabaabaaa", 4U, 6U);

  EXPECT_MISS_IGNORE_CASE(u"searching within empty string", std::u16string());

  // An empty pattern is expected to match at offset 0 with length 0.
  EXPECT_MATCH_IGNORE_CASE(std::u16string(), u"searching for empty string", 0U,
                           0U);

  EXPECT_MATCH_IGNORE_CASE(u"case insensitivity", u"CaSe InSeNsItIvItY", 0U,
                           18U);

  EXPECT_MATCH_SENSITIVE(u"aabaaa", u"aaabaabaaa", 4U, 6U);

  EXPECT_MISS_SENSITIVE(u"searching within empty string", std::u16string());

  EXPECT_MATCH_SENSITIVE(std::u16string(), u"searching for empty string", 0U,
                         0U);

  EXPECT_MISS_SENSITIVE(u"case insensitivity", u"CaSe InSeNsItIvItY");

  if (locale_is_posix)
    SetICUDefaultLocale(default_locale.data());
}
|
|
|
|
// Checks how accented Latin letters compare in their precomposed and their
// base-letter-plus-combining-mark spellings, first for the case-insensitive
// expectations and then for the case-sensitive ones.
TEST(StringSearchTest, UnicodeLocaleIndependent) {
  // Unaccented letters.
  const std::u16string e_base(u"e");
  const std::u16string E_base(u"E");
  const std::u16string a_base(u"a");

  // Single-code-point accented letters.
  const std::u16string e_with_acute_accent(u"\u00e9");
  const std::u16string E_with_acute_accent(u"\u00c9");
  const std::u16string e_with_grave_accent(u"\u00e8");
  const std::u16string E_with_grave_accent(u"\u00c8");
  const std::u16string a_with_acute_accent(u"\u00e1");

  // Base letter followed by a combining mark.
  const std::u16string e_with_acute_combining_mark(u"e\u0301");
  const std::u16string E_with_acute_combining_mark(u"E\u0301");
  const std::u16string e_with_grave_combining_mark(u"e\u0300");
  const std::u16string E_with_grave_combining_mark(u"E\u0300");
  const std::u16string a_with_acute_combining_mark(u"a\u0301");

  // An en_US_POSIX default locale is replaced by en_US for the duration of
  // the expectations and reinstated at the end of the test.
  const std::string original_locale(uloc_getDefault());
  const bool swap_out_posix_locale = original_locale == "en_US_POSIX";
  if (swap_out_posix_locale) {
    SetICUDefaultLocale("en_US");
  }

  // --- Case-insensitive expectations: every spelling of 'e' finds every
  // other one, and the reported length is the length of the searched text.
  EXPECT_MATCH_IGNORE_CASE(e_base, e_with_acute_accent, 0U,
                           e_with_acute_accent.size());
  EXPECT_MATCH_IGNORE_CASE(e_with_acute_accent, e_base, 0U, e_base.size());
  EXPECT_MATCH_IGNORE_CASE(e_base, e_with_acute_combining_mark, 0U,
                           e_with_acute_combining_mark.size());
  EXPECT_MATCH_IGNORE_CASE(e_with_acute_combining_mark, e_base, 0U,
                           e_base.size());
  EXPECT_MATCH_IGNORE_CASE(e_with_acute_combining_mark, e_with_acute_accent, 0U,
                           e_with_acute_accent.size());
  EXPECT_MATCH_IGNORE_CASE(e_with_acute_accent, e_with_acute_combining_mark, 0U,
                           e_with_acute_combining_mark.size());
  EXPECT_MATCH_IGNORE_CASE(e_with_acute_combining_mark,
                           e_with_grave_combining_mark, 0U,
                           e_with_grave_combining_mark.size());
  EXPECT_MATCH_IGNORE_CASE(e_with_grave_combining_mark,
                           e_with_acute_combining_mark, 0U,
                           e_with_acute_combining_mark.size());
  EXPECT_MATCH_IGNORE_CASE(e_with_acute_combining_mark, e_with_grave_accent, 0U,
                           e_with_grave_accent.size());
  EXPECT_MATCH_IGNORE_CASE(e_with_grave_accent, e_with_acute_combining_mark, 0U,
                           e_with_acute_combining_mark.size());
  EXPECT_MATCH_IGNORE_CASE(E_with_acute_accent, e_with_acute_accent, 0U,
                           e_with_acute_accent.size());
  EXPECT_MATCH_IGNORE_CASE(E_with_grave_accent, e_with_acute_accent, 0U,
                           e_with_acute_accent.size());
  EXPECT_MATCH_IGNORE_CASE(E_with_acute_combining_mark, e_with_grave_accent, 0U,
                           e_with_grave_accent.size());
  EXPECT_MATCH_IGNORE_CASE(E_with_grave_combining_mark, e_with_acute_accent, 0U,
                           e_with_acute_accent.size());
  EXPECT_MATCH_IGNORE_CASE(E_base, e_with_grave_accent, 0U,
                           e_with_grave_accent.size());

  // A different base letter is a miss even when the accent is the same.
  EXPECT_MISS_IGNORE_CASE(a_with_acute_accent, e_with_acute_accent);
  EXPECT_MISS_IGNORE_CASE(a_with_acute_combining_mark,
                          e_with_acute_combining_mark);

  // --- Case-sensitive expectations: accent and case differences are misses.
  EXPECT_MISS_SENSITIVE(e_base, e_with_acute_accent);
  EXPECT_MISS_SENSITIVE(e_with_acute_accent, e_base);
  EXPECT_MISS_SENSITIVE(e_base, e_with_acute_combining_mark);
  EXPECT_MISS_SENSITIVE(e_with_acute_combining_mark, e_base);

  // The two spellings of the same accented letter still find each other; the
  // expected length is that of the searched text (1 or 2 code units).
  EXPECT_MATCH_SENSITIVE(e_with_acute_combining_mark, e_with_acute_accent, 0U,
                         1U);
  EXPECT_MATCH_SENSITIVE(e_with_acute_accent, e_with_acute_combining_mark, 0U,
                         2U);

  EXPECT_MISS_SENSITIVE(e_with_acute_combining_mark,
                        e_with_grave_combining_mark);
  EXPECT_MISS_SENSITIVE(e_with_grave_combining_mark,
                        e_with_acute_combining_mark);
  EXPECT_MISS_SENSITIVE(e_with_acute_combining_mark, e_with_grave_accent);
  EXPECT_MISS_SENSITIVE(e_with_grave_accent, e_with_acute_combining_mark);
  EXPECT_MISS_SENSITIVE(E_with_acute_accent, e_with_acute_accent);
  EXPECT_MISS_SENSITIVE(E_with_grave_accent, e_with_acute_accent);
  EXPECT_MISS_SENSITIVE(E_with_acute_combining_mark, e_with_grave_accent);
  EXPECT_MISS_SENSITIVE(E_with_grave_combining_mark, e_with_acute_accent);
  EXPECT_MISS_SENSITIVE(E_base, e_with_grave_accent);
  EXPECT_MISS_SENSITIVE(a_with_acute_accent, e_with_acute_accent);
  EXPECT_MISS_SENSITIVE(a_with_acute_combining_mark,
                        e_with_acute_combining_mark);

  EXPECT_MATCH_SENSITIVE(a_with_acute_combining_mark,
                         a_with_acute_combining_mark, 0U, 2U);

  if (swap_out_posix_locale) {
    SetICUDefaultLocale(original_locale.data());
  }
}
|
|
|
|
// Checks that the outcome of a case-insensitive search for "a" in
// "a with ring above" (U+00E5) depends on the ICU default locale: it is
// expected to match under the locale the test starts with and to miss once
// the default locale has been switched to "da".
TEST(StringSearchTest, UnicodeLocaleDependent) {
  // Base characters
  const std::u16string a_base = u"a";

  // Composed characters
  const std::u16string a_with_ring = u"\u00e5";

  EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(a_base, a_with_ring, nullptr,
                                                 nullptr));
  EXPECT_TRUE(StringSearch(a_base, a_with_ring, nullptr, nullptr, false, true));

  // Keep an owned copy of the locale name before changing the default, the
  // same way every other test in this file saves it.
  const std::string default_locale(uloc_getDefault());
  SetICUDefaultLocale("da");

  EXPECT_FALSE(StringSearchIgnoringCaseAndAccents(a_base, a_with_ring, nullptr,
                                                  nullptr));
  EXPECT_FALSE(
      StringSearch(a_base, a_with_ring, nullptr, nullptr, false, true));

  SetICUDefaultLocale(default_locale.data());
}
|
|
|
|
// Checks that a backward search reports the last occurrence of the pattern
// (offset 2 of a four-character text holding it twice), and that the
// case-sensitive backward search misses when only the case differs.
TEST(StringSearchTest, SearchBackwards) {
  // An en_US_POSIX default locale is replaced by en_US for the duration of
  // the expectations and reinstated at the end of the test.
  const std::string original_locale(uloc_getDefault());
  const bool swap_out_posix_locale = original_locale == "en_US_POSIX";
  if (swap_out_posix_locale) {
    SetICUDefaultLocale("en_US");
  }

  EXPECT_MATCH_IGNORE_CASE_BACKWARDS(u"ab", u"ABAB", 2U, 2U);
  EXPECT_MATCH_SENSITIVE_BACKWARDS(u"ab", u"abab", 2U, 2U);
  EXPECT_MISS_SENSITIVE_BACKWARDS(u"ab", u"ABAB");

  if (swap_out_posix_locale) {
    SetICUDefaultLocale(original_locale.data());
  }
}
|
|
|
|
// Checks that one fixed-pattern searcher object can be reused across several
// texts, for both FixedPatternStringSearch and
// FixedPatternStringSearchIgnoringCaseAndAccents.
TEST(StringSearchTest, FixedPatternMultipleSearch) {
  // An en_US_POSIX default locale is replaced by en_US for the duration of
  // the expectations and reinstated at the end of the test.
  const std::string original_locale(uloc_getDefault());
  const bool swap_out_posix_locale = original_locale == "en_US_POSIX";
  if (swap_out_posix_locale) {
    SetICUDefaultLocale("en_US");
  }

  // Out-values shared by all searches below; each successful search
  // overwrites them before they are checked.
  size_t match_start = 0;
  size_t match_size = 0;

  // One "foo" searcher, constructed with `true` as its second argument, run
  // over several texts.
  FixedPatternStringSearch query1(u"foo", true);
  EXPECT_TRUE(query1.Search(u"12foo34", &match_start, &match_size, true));
  EXPECT_EQ(2U, match_start);
  EXPECT_EQ(3U, match_size);
  EXPECT_FALSE(query1.Search(u"bye", &match_start, &match_size, true));
  EXPECT_FALSE(query1.Search(u"FOO", &match_start, &match_size, true));
  // Same text twice: the final `true` yields the first occurrence, the final
  // `false` yields the last one.
  EXPECT_TRUE(query1.Search(u"foobarfoo", &match_start, &match_size, true));
  EXPECT_EQ(0U, match_start);
  EXPECT_EQ(3U, match_size);
  EXPECT_TRUE(query1.Search(u"foobarfoo", &match_start, &match_size, false));
  EXPECT_EQ(6U, match_start);
  EXPECT_EQ(3U, match_size);

  // One "hello" searcher that ignores case and accents, run over several
  // texts.
  FixedPatternStringSearchIgnoringCaseAndAccents query2(u"hello");
  EXPECT_TRUE(query2.Search(u"12hello34", &match_start, &match_size));
  EXPECT_EQ(2U, match_start);
  EXPECT_EQ(5U, match_size);
  EXPECT_FALSE(query2.Search(u"bye", &match_start, &match_size));
  EXPECT_TRUE(query2.Search(u"hELLo", &match_start, &match_size));
  EXPECT_EQ(0U, match_start);
  EXPECT_EQ(5U, match_size);

  if (swap_out_posix_locale) {
    SetICUDefaultLocale(original_locale.data());
  }
}
|
|
|
|
// Checks that RepeatingStringSearch reports every occurrence of "fox" in the
// target sentence, in order: only the lowercase one when constructed as
// case-sensitive, and both when constructed as case-insensitive.
TEST(StringSearchTest, RepeatingStringSearch) {
  struct MatchResult {
    int match_index;
    int match_length;
  };

  // Puts the saved ICU default locale back when the test body is left. The
  // ASSERT_EQ checks below return from the test body on failure, so a restore
  // placed as the last statement would be skipped and the temporary en_US
  // default would stay in place for whatever test runs next.
  struct LocaleRestorer {
    std::string saved_locale;
    bool needs_restore;
    ~LocaleRestorer() {
      if (needs_restore) {
        SetICUDefaultLocale(saved_locale.data());
      }
    }
  };

  // An en_US_POSIX default locale is swapped for en_US while the test runs.
  const std::string default_locale(uloc_getDefault());
  const bool locale_is_posix = (default_locale == "en_US_POSIX");
  const LocaleRestorer locale_restorer{default_locale, locale_is_posix};
  if (locale_is_posix) {
    SetICUDefaultLocale("en_US");
  }

  const char16_t kPattern[] = u"fox";
  const char16_t kTarget[] = u"The quick brown fox jumped over the lazy Fox";

  // Runs a fresh searcher over kTarget and gathers every match it reports, in
  // the order NextMatchResult() produces them.
  const auto collect_matches = [&](bool case_sensitive) {
    RepeatingStringSearch searcher(kPattern, kTarget, case_sensitive);
    std::vector<MatchResult> results;
    int match_index = 0;
    int match_length = 0;
    while (searcher.NextMatchResult(match_index, match_length)) {
      results.push_back(
          {.match_index = match_index, .match_length = match_length});
    }
    return results;
  };

  // Case sensitive.
  {
    const MatchResult kExpectation[] = {{16, 3}};

    const std::vector<MatchResult> results =
        collect_matches(/*case_sensitive=*/true);

    // Fatal on mismatch so the loop below never indexes past kExpectation.
    ASSERT_EQ(std::size(kExpectation), results.size());
    for (size_t i = 0; i < results.size(); ++i) {
      EXPECT_EQ(results[i].match_index, kExpectation[i].match_index);
      EXPECT_EQ(results[i].match_length, kExpectation[i].match_length);
    }
  }

  // Case insensitive.
  {
    const MatchResult kExpectation[] = {{16, 3}, {41, 3}};

    const std::vector<MatchResult> results =
        collect_matches(/*case_sensitive=*/false);

    // Fatal on mismatch so the loop below never indexes past kExpectation.
    ASSERT_EQ(std::size(kExpectation), results.size());
    for (size_t i = 0; i < results.size(); ++i) {
      EXPECT_EQ(results[i].match_index, kExpectation[i].match_index);
      EXPECT_EQ(results[i].match_length, kExpectation[i].match_length);
    }
  }
}
|
|
|
|
} // namespace i18n
|
|
} // namespace base
|