123 lines
6.7 KiB
Diff
123 lines
6.7 KiB
Diff
|
|
diff --git a/source/data/brkitr/rules/word.txt b/source/data/brkitr/rules/word.txt
|
||
|
|
index 0f0e734d27c0..b46038237859 100644
|
||
|
|
--- a/source/data/brkitr/rules/word.txt
|
||
|
|
+++ b/source/data/brkitr/rules/word.txt
|
||
|
|
@@ -38,12 +38,34 @@ $Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
|
||
|
|
$Format = [\p{Word_Break = Format}];
|
||
|
|
$Katakana = [\p{Word_Break = Katakana}];
|
||
|
|
$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
|
||
|
|
-$ALetter = [\p{Word_Break = ALetter} @];
|
||
|
|
+
|
||
|
|
+# Exclude '@' (commercial at, \u0040) from ALetter to maintain breaking at '@'.
|
||
|
|
+# ICU commit 49d192fefe09 (ICU 72) stopped breaking at '@' so as not to break up
|
||
|
|
+# e-mail addresses (https://unicode-org.atlassian.net/browse/CLDR-15767). In
|
||
|
|
+# light of the Chromium-specific change below that breaks on full-stop (period,
|
||
|
|
+# dot, \u002e; see MidNumLet below), e-mail addresses are broken in any case.
|
||
|
|
+# Thus, although the upstream intent was to not break "user.name@example.com" at
|
||
|
|
+# all, it actually would break down into {"user", ".", "name@example", ".",
|
||
|
|
+# "com"}, which is undesirable. See https://crbug.com/1410331. Maintain the
|
||
|
|
+# previous Chromium behavior of breaking at both '@' and '.'.
|
||
|
|
+#
|
||
|
|
+# TODO: Determine whether it's feasible to drop the Chromium-specific behaviors
|
||
|
|
+# (and thus this patch) for '.' and now '@'.
|
||
|
|
+$ALetter = [\p{Word_Break = ALetter}];
|
||
|
|
+
|
||
|
|
$Single_Quote = [\p{Word_Break = Single_Quote}];
|
||
|
|
$Double_Quote = [\p{Word_Break = Double_Quote}];
|
||
|
|
-$MidNumLet = [\p{Word_Break = MidNumLet}];
|
||
|
|
+
|
||
|
|
+# Remove two full stop characters from $MidNumLet and add them to $MidNum
|
||
|
|
+# to break a hostname into its components at the cost of breaking
|
||
|
|
+# 'e.g.' and 'i.e.' as well.
|
||
|
|
+# $MidNumLet is used in rules 6/7 (the rules of interest here) and rules 11/12.
|
||
|
|
+# Because it's OR'd with $MidNum in rules 11/12, rules 11/12 are not affected
|
||
|
|
+# while rules 6/7 are reverted to the old behavior we want.
|
||
|
|
+$MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]];
|
||
|
|
$MidLetter = [\p{Word_Break = MidLetter} - [\: \uFE55 \uFF1A]];
|
||
|
|
-$MidNum = [\p{Word_Break = MidNum}];
|
||
|
|
+$MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]];
|
||
|
|
+
|
||
|
|
$Numeric = [\p{Word_Break = Numeric}];
|
||
|
|
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||
|
|
$WSegSpace = [\p{Word_Break = WSegSpace}];
|
||
|
|
diff --git a/source/data/brkitr/rules/word_POSIX.txt b/source/data/brkitr/rules/word_POSIX.txt
|
||
|
|
index e62fd7fac3b1..1115b464d835 100644
|
||
|
|
--- a/source/data/brkitr/rules/word_POSIX.txt
|
||
|
|
+++ b/source/data/brkitr/rules/word_POSIX.txt
|
||
|
|
@@ -38,12 +38,30 @@ $Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
|
||
|
|
$Format = [\p{Word_Break = Format}];
|
||
|
|
$Katakana = [\p{Word_Break = Katakana}];
|
||
|
|
$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
|
||
|
|
-$ALetter = [\p{Word_Break = ALetter} @];
|
||
|
|
+
|
||
|
|
+# Exclude '@' (commercial at, \u0040) from ALetter to maintain breaking at '@'.
|
||
|
|
+# ICU commit 49d192fefe09 (ICU 72) stopped breaking at '@' so as not to break up
|
||
|
|
+# e-mail addresses (https://unicode-org.atlassian.net/browse/CLDR-15767). In
|
||
|
|
+# light of the Chromium-specific change below that breaks on full-stop (period,
|
||
|
|
+# dot, \u002e; see MidNumLet below), e-mail addresses are broken in any case.
|
||
|
|
+# Thus, although the upstream intent was to not break "user.name@example.com" at
|
||
|
|
+# all, it actually would break down into {"user", ".", "name@example", ".",
|
||
|
|
+# "com"}, which is undesirable. See https://crbug.com/1410331. Maintain the
|
||
|
|
+# previous Chromium behavior of breaking at both '@' and '.'.
|
||
|
|
+#
|
||
|
|
+# TODO: Determine whether it's feasible to drop the Chromium-specific behaviors
|
||
|
|
+# (and thus this patch) for '.' and now '@'.
|
||
|
|
+$ALetter = [\p{Word_Break = ALetter}];
|
||
|
|
+
|
||
|
|
$Single_Quote = [\p{Word_Break = Single_Quote}];
|
||
|
|
$Double_Quote = [\p{Word_Break = Double_Quote}];
|
||
|
|
-$MidNumLet = [\p{Word_Break = MidNumLet} - [.]];
|
||
|
|
+
|
||
|
|
+# Remove full-width full stop (\uff0e) from $MidNumLet and add it to $MidNum, in
|
||
|
|
+# addition to the ordinary full stop (dot, period, '.', \u002e).
|
||
|
|
+$MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]];
|
||
|
|
$MidLetter = [\p{Word_Break = MidLetter} - [\: \uFE55 \uFF1A]];
|
||
|
|
-$MidNum = [\p{Word_Break = MidNum} [.]];
|
||
|
|
+$MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]];
|
||
|
|
+
|
||
|
|
$Numeric = [\p{Word_Break = Numeric}];
|
||
|
|
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||
|
|
$WSegSpace = [\p{Word_Break = WSegSpace}];
|
||
|
|
diff --git a/source/data/brkitr/rules/word_fi_sv.txt b/source/data/brkitr/rules/word_fi_sv.txt
|
||
|
|
index 544558f91adf..da79385318d0 100644
|
||
|
|
--- a/source/data/brkitr/rules/word_fi_sv.txt
|
||
|
|
+++ b/source/data/brkitr/rules/word_fi_sv.txt
|
||
|
|
@@ -38,12 +38,34 @@ $Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
|
||
|
|
$Format = [\p{Word_Break = Format}];
|
||
|
|
$Katakana = [\p{Word_Break = Katakana}];
|
||
|
|
$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
|
||
|
|
-$ALetter = [\p{Word_Break = ALetter} @];
|
||
|
|
+
|
||
|
|
+# Exclude '@' (commercial at, \u0040) from ALetter to maintain breaking at '@'.
|
||
|
|
+# ICU commit 49d192fefe09 (ICU 72) stopped breaking at '@' so as not to break up
|
||
|
|
+# e-mail addresses (https://unicode-org.atlassian.net/browse/CLDR-15767). In
|
||
|
|
+# light of the Chromium-specific change below that breaks on full-stop (period,
|
||
|
|
+# dot, \u002e; see MidNumLet below), e-mail addresses are broken in any case.
|
||
|
|
+# Thus, although the upstream intent was to not break "user.name@example.com" at
|
||
|
|
+# all, it actually would break down into {"user", ".", "name@example", ".",
|
||
|
|
+# "com"}, which is undesirable. See https://crbug.com/1410331. Maintain the
|
||
|
|
+# previous Chromium behavior of breaking at both '@' and '.'.
|
||
|
|
+#
|
||
|
|
+# TODO: Determine whether it's feasible to drop the Chromium-specific behaviors
|
||
|
|
+# (and thus this patch) for '.' and now '@'.
|
||
|
|
+$ALetter = [\p{Word_Break = ALetter}];
|
||
|
|
+
|
||
|
|
$Single_Quote = [\p{Word_Break = Single_Quote}];
|
||
|
|
$Double_Quote = [\p{Word_Break = Double_Quote}];
|
||
|
|
-$MidNumLet = [\p{Word_Break = MidNumLet}];
|
||
|
|
+
|
||
|
|
+# Remove two full stop characters from $MidNumLet and add them to $MidNum
|
||
|
|
+# to break a hostname into its components at the cost of breaking
|
||
|
|
+# 'e.g.' and 'i.e.' as well.
|
||
|
|
+# $MidNumLet is used in rules 6/7 (the rules of interest here) and rules 11/12.
|
||
|
|
+# Because it's OR'd with $MidNum in rules 11/12, rules 11/12 are not affected
|
||
|
|
+# while rules 6/7 are reverted to the old behavior we want.
|
||
|
|
+$MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]];
|
||
|
|
$MidLetter = [\p{Word_Break = MidLetter}];
|
||
|
|
-$MidNum = [\p{Word_Break = MidNum}];
|
||
|
|
+$MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]];
|
||
|
|
+
|
||
|
|
$Numeric = [\p{Word_Break = Numeric}];
|
||
|
|
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||
|
|
$WSegSpace = [\p{Word_Break = WSegSpace}];
|