Proposed: StringUTF16 bug fix with optimization - Part 2 of 2, Unit Tests
Chris Johnson
chriswjohnson.jdk at gmail.com
Tue Mar 30 23:45:11 UTC 2021
This is a patch for test class "CompareIC", providing 100% unit test
coverage of the fixed "java.lang.StringUTF16" methods "compareToCI" and
"regionMatchesCI" in part 1 of this proposed contribution.
The tests also provide 100% coverage of the current implementations of
those methods, and, if run against them, will reveal the pair of small
bugs detailed in part 1.
These tests fill a gap in JDK test coverage. That gap (apparently)
allowed the lack of support for case-insensitive comparison and equality
testing of strings containing case-sensitive Supplementary Multilingual
Plane code-points to go officially unnoticed until last year.
As such, these tests are of value to the JDK even if my proposed
revisions of "compareToCI" and "regionMatchesCI" are unacceptable.
Thanks again for your consideration,
----Chris
Chris W. Johnson
chriswjohnson.jdk at gmail.com
http://www.panojohnson.com/
Index: test/jdk/java/lang/String/CompareIC.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>US-ASCII
===================================================================
diff --git a/test/jdk/java/lang/String/CompareIC.java
b/test/jdk/java/lang/String/CompareIC.java
--- a/test/jdk/java/lang/String/CompareIC.java (revision
60819:ee1d592a9f5389725a0338a4b5dfcf4fc3fcf20c)
+++ b/test/jdk/java/lang/String/CompareIC.java (revision
60819+:ee1d592a9f53+)
@@ -24,41 +24,1544 @@
/*
* @test
* @bug 4124769 8160312
- * @summary Test ignore-case comparison
- *
+ * @summary Test case-insensitive comparison and equality
+ * @run testng/othervm -XX:+CompactStrings CompareIC
+ * @run testng/othervm -XX:-CompactStrings CompareIC
*/
-import java.net.*;
-import java.io.InputStream;
+
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.fail;
+
+import org.testng.annotations.Test;
+
import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.util.Arrays;
+import java.util.Formatter;
+import java.util.function.BiFunction;
+import java.util.function.IntFunction;
+import java.util.function.Predicate;
+
+/**
+ * <p>{@code CompareIC} provides generalized unit tests of {@link String}
methods {@link String#equalsIgnoreCase
+ * equalsIgnoreCase} and {@link String#compareToIgnoreCase
compareToIgnoreCase} with an emphasis on exercising
+ * underlying methods {@code java.lang.StringUTF16#compareToCI} and {@code
java.lang.StringUTF16#regionMatchesCI}.
+ * It also:
+ * </p>
+ * <ul>
+ * <li>Tests every case-sensitive Unicode code-point for equality to
each of its case variants, using
+ * both {@link #equalsIgnoreCase} and {@link #compareToIgnoreCase}.
+ * </li>
+ * <li>Provides 100% test coverage of the CWJ implementations of
{@code StringUTF16} methods {@code
+ * compareToCI}, {@code regionMatchesCI} and {@code
compareCodePointsIgnoringCase}.
+ * </li>
+ * <li>Tests all premises underlying the CWJ implementations of {@code
StringUTF16} methods {@code
+ * compareToCI}, {@code regionMatchesCI} and {@code
compareCodePointsIgnoringCase}.
+ * </li>
+ * <li>Converts to separate TestNG unit tests the legacy test code
from the 2016-06-27 commit. (Those
+ * tests provide no additional coverage, so they are preserved only
for completeness.)
+ * </li>
+ * </ul>
+ * <p>All tests operate by invoking local instance methods {@link
#equalsIgnoreCase} and {@link #compareToIgnoreCase},
+ * which may be overridden to apply these tests to any class supplying
equivalent functionality. This has been
+ * useful, for example, while developing and benchmarking optimized
implementations of {@code StringUTF16} methods
+ * {@code regionMatchesCI} and {@code compareToCI} outside the JDK code
base. If doing so, be aware of other
+ * methods intended to be overridden:
+ * </p>
+ * <p><b>General</b></p>
+ * <ul>
+ * <li>{@link #getTestedClassFQN()}</li>
+ * </ul>
+ * <p><b>EqualsIgnoreCase</b></p>
+ * <ul>
+ * <li>{@link #getEqualsIgnoreCaseMethodName()}</li>
+ * <li>{@link #getEqualsIgnoreCaseFormatterString()}</li>
+ * </ul>
+ * <p><b>CompareToIgnoreCase</b></p>
+ * <ul>
+ * <li>{@link #getCompareToIgnoreCaseMethodName()}</li>
+ * <li>{@link #getCompareToIgnoreCaseFormatterString()}</li>
+ * </ul>
+ * <p>The tests will function whether or not those methods are overridden,
+ * but overriding them improves error message readability.
+ * </p>
+ */
+@SuppressWarnings({ "UseOfSystemOutOrSystemErr",
"DuplicateStringLiteralInspection" })
public class CompareIC {
-
- public static void main(String[] args) throws Exception {
- String test1 = "Tess";
- String test2 = "Test";
- String test3 = "Tesu";
- CompareIC comparer = new CompareIC();
-
- comparer.testTriplet(test1, test2, test3);
+
+ /**
+ * <p>Gets tested class's fully qualified name (FQN) for use in
messages generated by these tests.
+ * </p>
+ * <p>By default, returns "java.lang.String". Intended to be
overridden when this class is used
+ * outside the JDK-proper while, for example, developing and
benchmarking optimized implementations
+ * of methods "{@code regionMatchesCI}" and "{@code compareToCI}" from
"java.lang.StringUTF16".
+ * </p>
+ *
+ * @return fully qualified name of tested class
+ *
+ * @see #getEqualsIgnoreCaseMethodName()
+ * @see #getCompareToIgnoreCaseMethodName()
+ */
+ public String getTestedClassFQN() {
+ return String.class.getName();
+ }
+
+ /**
+ * Gets name of case-insensitive equality method being tested.
+ *
+ * @return case-insensitive equality method's name, for example {@code
"equalsIgnoreCase"}
+ *
+ * @see #getEqualsIgnoreCaseFormatterString()
+ */
+ public String getEqualsIgnoreCaseMethodName() {
+ return "equalsIgnoreCase";
+ }
+
+ /**
+ * <p>Gets format string suitable for use by {@link
Formatter#format(String, Object...)} to create a text
+ * representation of an "equalsIgnoreCase" (or equivalent) invocation.
When this format is used, the {@code
+ * format} method is passed the following parameters:
+ * </p>
+ * <ol>
+ * <li>{@code String} - First compared {@code String}, converted
to string literal format.</li>
+ * <li>{@code int} - {@code char} offset, into first compared
{@code String}, at which comparison was to begin.</li>
+ * <li>{@code int} - Number of {@code char}s used, starting at
offset above, from first compared {@code String}.</li>
+ * <li>{@code String} - Second compared {@code String}, converted
to string literal format.</li>
+ * <li>{@code int} - {@code char} offset, into second compared
{@code String}, at which comparison was to begin.</li>
+ * <li>{@code int} - Number of {@code char}s used, starting at
offset above, from second compared {@code String}.</li>
+ * </ol>
+ * <p>For an invocation of {@link String#equalsIgnoreCase(String)},
the format string would be {@code "%1$s.equalsIgnoreCase(%4$s)"}.
+ * Most of the {@code format} parameters are irrelevant in most cases
(and parameter 3 makes parameter 6 redundant in almost
+ * any case), but these parameters should allow representation of
almost any method invocation.
+ * </p>
+ *
+ * @return {@link Formatter} format string used to create a text
representation of an "equalsIgnoreCase" (or equivalent)
+ * invocation, including all of its parameters
+ *
+ * @see #getEqualsIgnoreCaseMethodName()
+ */
+ public String getEqualsIgnoreCaseFormatterString() {
+ return "%1$s." + getEqualsIgnoreCaseMethodName() + "(%4$s)";
+ }
+
+ /**
+ * Gets name of case-insensitive comparison method being tested.
+ *
+ * @return case-insensitive comparison method's name, for example
{@code "compareToIgnoreCase"}
+ *
+ * @see #getCompareToIgnoreCaseFormatterString()
+ */
+ public String getCompareToIgnoreCaseMethodName() {
+ return "compareToIgnoreCase";
+ }
+
+ /**
+ * <p>Gets format string suitable for use by {@link
Formatter#format(String, Object...)} to create a text
+ * representation of a "compareToIgnoreCase" (or equivalent)
invocation. When this format is used, the {@code
+ * format} method is passed the following parameters:
+ * </p>
+ * <ol>
+ * <li>{@code String} - First compared {@code String}, converted
to string literal format.</li>
+ * <li>{@code int} - {@code char} offset, into first compared
{@code String}, at which comparison was to begin.</li>
+ * <li>{@code int} - Number of {@code char}s used, starting at
offset above, from first compared {@code String}.</li>
+ * <li>{@code String} - Second compared {@code String}, converted
to string literal format.</li>
+ * <li>{@code int} - {@code char} offset, into second compared
{@code String}, at which comparison was to begin.</li>
+ * <li>{@code int} - Number of {@code char}s used, starting at
offset above, from second compared {@code String}.</li>
+ * </ol>
+ * <p>For an invocation of {@link String#compareToIgnoreCase(String)},
the format string would be {@code "%1$s.compareToIgnoreCase(%4$s)"}.
+ * Most of the {@code format} parameters are irrelevant in most cases
(and parameter 3 makes parameter 6 redundant in almost
+ * any case), but these parameters should allow representation of
almost any method invocation.
+ * </p>
+ *
+ * @return {@link Formatter} format string used to create a text
representation of a "compareToIgnoreCase" (or equivalent)
+ * invocation, including all of its parameters
+ *
+ * @see #getCompareToIgnoreCaseMethodName()
+ */
+ public String getCompareToIgnoreCaseFormatterString() {
+ return "%1$s." + getCompareToIgnoreCaseMethodName() + "(%4$s)";
+ }
+
+ /**
+ * <p>Evaluates case-insensitive equality of two strings.
+ * </p>
+ * <p>By default, uses parameters "{@code a}" and "{@code b}" in
invocation "{@code a.equalsIgnoreCase(b)}".
+ * Intended to be overridden when this class is used outside the
JDK-proper while, for example, developing
+ * and benchmarking optimized implementations of "{@code
java.lang.StringUTF16.regionMatchesCI}".
+ * </p>
+ *
+ * @param a one string to test
+ * @param b the other string to test
+ *
+ * @return {@code true} if the strings were equal (exactly or
case-insensitively), {@code false} otherwise
+ *
+ * @see #getEqualsIgnoreCaseMethodName()
+ * @see #getTestedClassFQN()
+ */
+ public boolean equalsIgnoreCase
+ (
+ final String a,
+ final String b
+ ){
+ return a.equalsIgnoreCase(b);
+ }
+
+ /**
+ * <p>Compares two strings case-insensitively.
+ * </p>
+ * <p>By default, uses parameters "{@code a}" and "{@code b}" in
invocation "{@code a.compareToIgnoreCase(b)}".
+ * Intended to be overridden when this class is used outside the
JDK-proper while, for example, developing and
+ * benchmarking optimized implementations of "{@code
java.lang.StringUTF16.compareToCI}".
+ * </p>
+ *
+ * @param a basis of the comparison, as if invoking {@code
a.compareToIgnoreCase(b)}
+ * @param b {@code String} to which {@code a} is compared
+ *
+ * @return negative value when {@code a < b}, zero when {@code a}
equals {@code b},
+ * or positive value when {@code a > b}
+ *
+ * @see #getCompareToIgnoreCaseMethodName()
+ * @see #getTestedClassFQN()
+ */
+ public int compareToIgnoreCase
+ (
+ final String a,
+ final String b
+ ){
+ return a.compareToIgnoreCase(b);
+ }
+
+ /**
+ * Legacy test for <a href="
https://bugs.java.com/bugdatabase/view_bug.do?bug_id=8160312">bug
8160312</a>.
+ */
+ @Test
+ public void compareToIgnoreCase_MicroSignGreaterThanX() {
+
+ // Code-point U+00B5 is the "MICRO SIGN" character from Unicode's
ISO-8859-1 range.
+ // Code-point U+0058 is the "LATIN CAPITAL LETTER X" character from
Unicode's US-ASCII range.
+
+ if (compareToIgnoreCase("\u00b5", "X") <= 0)
+ fail("Comparison failure 1 for bug 8160312.");
+ }
+
+ /**
+ * Legacy test for <a href="
https://bugs.java.com/bugdatabase/view_bug.do?bug_id=4124769">bug
4124769</a>.
+ */
+ @SuppressWarnings("StringToUpperCaseOrToLowerCaseWithoutLocale")
+ @Test
+ public void compareToIgnoreCase_Triplets() {
+ final String test1 = "Tess";
+ String test2 = "Test";
+ final String test3 = "Tesu";
+
+ testTriplet(test1, test2, test3);
test2 = test2.toUpperCase();
- comparer.testTriplet(test1, test2, test3);
+ testTriplet(test1, test2, test3);
test2 = test2.toLowerCase();
- comparer.testTriplet(test1, test2, test3);
-
- // toLowerCase -> non-latin1
- if ("\u00b5".compareToIgnoreCase("X") < 0)
- throw new RuntimeException("Comparison failure1");
- }
-
- private void testTriplet(String one, String two, String three)
- throws Exception {
- if (one.compareToIgnoreCase(two) > 0)
- throw new RuntimeException("Comparison failure1");
- if (two.compareToIgnoreCase(three) > 0)
- throw new RuntimeException("Comparison failure2");
- if (three.compareToIgnoreCase(one) < 0)
- throw new RuntimeException("Comparison failure3");
+ testTriplet(test1, test2, test3);
+ }
+
+ /**
+ * Compares a trio of strings for legacy test {@link
#compareToIgnoreCase_Triplets}.
+ *
+ * @param one string used by test comparisons
+ * @param two string used by test comparisons
+ * @param three string used by test comparisons
+ */
+ @SuppressWarnings("SameParameterValue")
+ private void testTriplet(final String one, final String two, final
String three) {
+
+ if (compareToIgnoreCase(one, two) > 0)
+ fail("Comparison failure 1 for bug 4124769.");
+ if (compareToIgnoreCase(two, three) > 0)
+ fail("Comparison failure 2 for bug 4124769.");
+ if (compareToIgnoreCase(three, one) < 0)
+ fail("Comparison failure 3 for bug 4124769.");
+ }
+
+ @Test
+ public void testStringsOfEqualLength() {
+
+ // Note: Full-width Latin letters (uppercase U+FF21 to U+FF3A, and
lowercase
+ // U+FF41 to U+FF5A) are case-insensitively equal to each
other, but
+ // not to their ASCII counterparts.
+
+ // Test equal-length strings exhibiting case-insensitive equality.
+
+ testEqualStringsIC("\uFF21", "\uFF41"
); // "A" vs. "a".
+ testEqualStringsIC("\uFF21\uFF22", "\uFF41\uFF42"
); // "AB" vs. "ab".
+ testEqualStringsIC("\uFF21\uFF22\uFF23",
"\uFF41\uFF42\uFF43" ); // "ABC" vs. "abc".
+ testEqualStringsIC("\uFF21\uFF22\uFF23\uFF24",
"\uFF41\uFF42\uFF43\uFF44"); // "ABCD" vs. "abcd".
+ testEqualStringsIC("\uFF21\uFF22\uFF23\uFF24", new
String("\uFF21\uFF22\uFF23\uFF24".toCharArray())); // Test identical
strings ("ABCD") without "String" object reference equality interfering.
+
+ // Test equal-length strings exhibiting case-insensitive inequality.
+
+ testUnequalStringsIC("\uFF21\uFF22cd",
"\uFF41\uFF42\uFF43\uFF44"); // "ABcd" vs. "abcd".
+ }
+
+ @Test
+ public void testStringsOfUnequalLength() {
+
+ // Note: Full-width Latin letters (uppercase U+FF21 to U+FF3A, and
lowercase
+ // U+FF41 to U+FF5A) are case-insensitively equal to each
other, but
+ // not to their ASCII counterparts.
+
+ // Test strings whose overlapping portions are equal, so
+ // inequality emerges only from their length difference.
+
+ testUnequalStringsIC("\uFF21\uFF22\uFF23\uFF24",
"\uFF41\uFF42\uFF43\uFF44\uFF45"); // "ABCD" vs. "abcde".
+ testUnequalStringsIC("\uFF41\uFF42\uFF43\uFF44",
"\uFF21\uFF22\uFF23\uFF24\uFF25"); // "abcd" vs. "ABCDE".
+
+
+ // Test strings whose overlapping portions are unequal, so
+ // inequality originates in the comparison of their overlap,
+ // rather than their length difference.
+
+ testUnequalStringsIC("\uFF41\uFF42\uFF43\uFF44\uFF45",
"\uFF21\uFF22\uFF23\uFF38"); // "abcde" vs. "ABCX".
+
+ testUnequalStringsIC("\uFF21\uFF22\uFF23\uFF24",
"\uFF41\uFF42\uFF43\uFF58\uFF59\uFF5A"); // "ABCD" vs. "abcxyz".
+ testUnequalStringsIC("\uFF41\uFF42\uFF43\uFF44",
"\uFF21\uFF22\uFF23\uFF38\uFF39\uFF3A"); // "abcd" vs. "ABCXYZ".
+ }
+
+ @Test
+ public void testUnpairedSurrogates_HighVsHigh() {
+ testUnpairedSurrogates("\uD800", "\uDBFF");
+ }
+
+ @Test
+ public void testUnpairedSurrogates_LowVsLow() {
+ testUnpairedSurrogates("\uDC00", "\uDFFF");
+ }
+
+ @Test
+ public void testUnpairedSurrogates_HighVsLow() {
+ testUnpairedSurrogates("\uD800", "\uDC00");
+ }
+
+ private void testUnpairedSurrogates
+ (
+ final String unpairedSurrogateLesser,
+ final String unpairedSurrogateGreater
+ ){
+ testUnequalStringsIC( unpairedSurrogateLesser ,
unpairedSurrogateGreater );
+ testUnequalStringsIC("0" + unpairedSurrogateLesser , "0" +
unpairedSurrogateGreater );
+ testUnequalStringsIC( unpairedSurrogateLesser + "1",
unpairedSurrogateGreater + "1");
+ testUnequalStringsIC("0" + unpairedSurrogateLesser + "2", "0" +
unpairedSurrogateGreater + "2");
+
+ // Compare unpaired surrogate with greater and lesser 16-bit BMP
code-points.
+
+ testUnequalStringsIC("0" + unpairedSurrogateLesser + "2",
"0\uFF112");
+ testUnequalStringsIC("0\u20812", "0" + unpairedSurrogateLesser +
"2");
+ }
+
+ @Test
+ public void testSurrogatePairs_UnequalHighVsEqualLow() {
+ testUnequalStringsIC("\uD800\uDC00", "\uD801\uDC00");
+ }
+
+ @Test
+ public void testSurrogatePairs_EqualHighVsUnequalLow() {
+ testUnequalStringsIC("\uD800\uDC00", "\uD800\uDC01");
+ }
+
+ @Test
+ public void testSurrogatePairs_EqualHighVsInvalidLow() {
+
+ // One invalid low-surrogate.
+
+ testUnequalStringsIC("\uD800\uBAD1", "\uD800\uDC00");
+
+ // Two invalid low-surrogates.
+
+ testUnequalStringsIC("\uD800\uBAD1", "\uD800\uBAD2");
+ }
+
+ private void testEqualStringsIC
+ (
+ @SuppressWarnings("SameParameterValue")
+ final String stringA, // String case-insensitively
equal to "stringB".
+ final String stringB // String case-insensitively
equal to "stringA".
+ ){
+ try (
+ final Formatter error = new Formatter(new
StringBuilder(0)) // If the test succeeds, this "StringBuilder" remains
zero-length.
+ ){
+
+ // Test comparison, as used in sorting.
+ //
+ // These tests fail if "stringA" is not identified by
+ // "compareToIgnoreCase" as equal to "stringB".
+
+ {
+ int signum;
+
+ signum = compareToIgnoreCase(stringA, stringB);
+ if (signum != 0) {
+
+ error.format(
+ "%n * Method %s.",
+ getTestedClassFQN()
+ ).format(
+ getCompareToIgnoreCaseFormatterString(),
+ /* 1 */ StringLiteral.from(stringA),
+ /* 2 */ 0,
+ /* 3 */ stringA.length(),
+ /* 4 */ StringLiteral.from(stringB),
+ /* 5 */ 0,
+ /* 6 */ stringB.length()
+ ).format(
+ " returned %d instead of zero.",
+ signum
+ );
+ }
+
+ signum = compareToIgnoreCase(stringB, stringA);
+ if (signum != 0) {
+
+ error.format(
+ "%n * Method %s.",
+ getTestedClassFQN()
+ ).format(
+ getCompareToIgnoreCaseFormatterString(),
+ /* 1 */ StringLiteral.from(stringB),
+ /* 2 */ 0,
+ /* 3 */ stringB.length(),
+ /* 4 */ StringLiteral.from(stringA),
+ /* 5 */ 0,
+ /* 6 */ stringA.length()
+ ).format(
+ " returned %d instead of zero.",
+ signum
+ );
+ }
+ }
+
+ // Test equality testing.
+ //
+ // These tests fail if "equalsIgnoreCase" returns "false".
+
+ if (!equalsIgnoreCase(stringA, stringB)) {
+
+ error.format(
+ "%n * Method %s.",
+ getTestedClassFQN()
+ ).format(
+ getEqualsIgnoreCaseFormatterString(),
+ /* 1 */ StringLiteral.from(stringA),
+ /* 2 */ 0,
+ /* 3 */ stringA.length(),
+ /* 4 */ StringLiteral.from(stringB),
+ /* 5 */ 0,
+ /* 6 */ stringB.length()
+ ).format(
+ " returned \"false\" instead of \"true\"."
+ );
+ }
+
+ if (!equalsIgnoreCase(stringB, stringA)) {
+
+ error.format(
+ "%n * Method %s.",
+ getTestedClassFQN()
+ ).format(
+ getEqualsIgnoreCaseFormatterString(),
+ /* 1 */ StringLiteral.from(stringB),
+ /* 2 */ 0,
+ /* 3 */ stringB.length(),
+ /* 4 */ StringLiteral.from(stringA),
+ /* 5 */ 0,
+ /* 6 */ stringA.length()
+ ).format(
+ " returned \"false\" instead of \"true\"."
+ );
+ }
+
+ // If the "StringBuilder" used by "error" is not empty, one or
more tests failed, so invoke "fail".
+
+ error.flush();
+
+ if (!((CharSequence) error.out()).isEmpty()) {
+
+ fail(
+ "testEqualStringsIC(" + StringLiteral.from(stringA) +
+ ", " + StringLiteral.from(stringB) + ") found test
failure(s):" +
+ error
+ );
+ }
+ }
+ }
+
+ private void testUnequalStringsIC
+ (
+ @SuppressWarnings("SameParameterValue")
+ final String stringLesser, // String
case-insensitively less than "stringGreater".
+ final String stringGreater // String
case-insensitively greater than "stringLesser".
+ ){
+ try (
+ final Formatter error = new Formatter(new
StringBuilder(0)) // If the test succeeds, this "StringBuilder" remains
zero-length.
+ ){
+
+ // Test comparison, as used in sorting.
+ //
+ // These tests fail if "stringLesser" is not identified by
+ // "compareToIgnoreCase" as less than "stringGreater".
+
+ {
+ int signum;
+
+ signum = compareToIgnoreCase(stringLesser, stringGreater);
+ if (signum >= 0) {
+
+ error.format(
+ "%n * Method %s.",
+ getTestedClassFQN()
+ ).format(
+ getCompareToIgnoreCaseFormatterString(),
+ /* 1 */ StringLiteral.from(stringLesser),
+ /* 2 */ 0,
+ /* 3 */ stringLesser.length(),
+ /* 4 */ StringLiteral.from(stringGreater),
+ /* 5 */ 0,
+ /* 6 */ stringGreater.length()
+ ).format(
+ " returned %d instead of a negative value.",
+ signum
+ );
+ }
+
+ signum = compareToIgnoreCase(stringGreater, stringLesser);
+ if (signum <= 0) {
+
+ error.format(
+ "%n * Method %s.",
+ getTestedClassFQN()
+ ).format(
+ getCompareToIgnoreCaseFormatterString(),
+ /* 1 */ StringLiteral.from(stringGreater),
+ /* 2 */ 0,
+ /* 3 */ stringGreater.length(),
+ /* 4 */ StringLiteral.from(stringLesser),
+ /* 5 */ 0,
+ /* 6 */ stringLesser.length()
+ ).format(
+ " returned %d instead of a positive value.",
+ signum
+ );
+ }
+ }
+
+ // Test equality testing.
+ //
+ // (As this method's name makes clear, its inputs must be
case-insensitively *unequal*,
+ // so these tests fail if "equalsIgnoreCase" returns "true".)
+
+ if (equalsIgnoreCase(stringLesser, stringGreater)) {
+
+ error.format(
+ "%n * Method %s.",
+ getTestedClassFQN()
+ ).format(
+ getEqualsIgnoreCaseFormatterString(),
+ /* 1 */ StringLiteral.from(stringLesser),
+ /* 2 */ 0,
+ /* 3 */ stringLesser.length(),
+ /* 4 */ StringLiteral.from(stringGreater),
+ /* 5 */ 0,
+ /* 6 */ stringGreater.length()
+ ).format(
+ " returned \"true\" instead of \"false\"."
+ );
+ }
+
+ if (equalsIgnoreCase(stringGreater, stringLesser)) {
+
+ error.format(
+ "%n * Method %s.",
+ getTestedClassFQN()
+ ).format(
+ getEqualsIgnoreCaseFormatterString(),
+ /* 1 */ StringLiteral.from(stringGreater),
+ /* 2 */ 0,
+ /* 3 */ stringGreater.length(),
+ /* 4 */ StringLiteral.from(stringLesser),
+ /* 5 */ 0,
+ /* 6 */ stringLesser.length()
+ ).format(
+ " returned \"true\" instead of \"false\"."
+ );
+ }
+
+ // If the "StringBuilder" used by "error" is not empty, one or
more tests failed, so invoke "fail".
+
+ error.flush();
+
+ if (!((CharSequence) error.out()).isEmpty()) {
+
+ fail(
+ "testUnequalStringsIC(" +
StringLiteral.from(stringLesser) +
+ ", " + StringLiteral.from(stringGreater) + ") found
test failure(s):" +
+ error
+ );
+ }
+ }
+ }
+
+ @Test
+ public void
allCodePointsDifferingOnlyInCaseAreEqual_equalsIgnoreCase() {
+
+ // If the test
"validatePremise_AllSMPCodePointCaseVariantsAreSMPCodePoints",
+ // or
"validatePremise_AllSMPCodePointCaseVariantsUseTheSameHighSurrogate",
+ // reports, for example:
+ //
+ // "Of 1,048,576 SMP code-points, 450 are assigned, public and
affected
+ // by case-conversion."
+ //
+ // ...then expect a total failure of this test to report 225 errors
(450/2),
+ // because errors unaffected by parameter order are reported only on
their
+ // first occurrence (assuming the test's code-point loop operates in
ascending
+ // order).
+ //
+ // That expectation would not apply if a case conversion error
occurred among
+ // the Basic Multilingual Plane code-points, but that has not been an
issue,
+ // historically.
+
+ testAllAssignedPublicCodePointsForCaseInsensitiveEquality(
+ getTestedClassFQN(),
+ getEqualsIgnoreCaseMethodName(),
+ getEqualsIgnoreCaseFormatterString() + " returned \"%10$s\",
not \"true\"",
+ this::equalsIgnoreCase,
+ // No result conversion needed. If "equalsIgnoreCase" works as
expected,
+ // "true" is returned. If "false" is returned, something went
wrong.
+ r -> r
+ );
+ }
+
+ @Test
+ public void
allCodePointsDifferingOnlyInCaseAreEqual_compareToIgnoreCase() {
+
+ // If the test
"validatePremise_AllSMPCodePointCaseVariantsAreSMPCodePoints",
+ // or
"validatePremise_AllSMPCodePointCaseVariantsUseTheSameHighSurrogate",
+ // reports, for example:
+ //
+ // "Of 1,048,576 SMP code-points, 450 are assigned, public and
affected
+ // by case-conversion."
+ //
+ // ...then expect a total failure of this test to report 225 errors
(450/2),
+ // because errors unaffected by parameter order are reported only on
their
+ // first occurrence (assuming the test's code-point loop operates in
ascending
+ // order).
+ //
+ // That expectation would not apply if a case conversion error
occurred among
+ // the Basic Multilingual Plane code-points, but that has not been an
issue,
+ // historically.
+
+ testAllAssignedPublicCodePointsForCaseInsensitiveEquality(
+ getTestedClassFQN(),
+ getCompareToIgnoreCaseMethodName(),
+ getCompareToIgnoreCaseFormatterString() + " returned %10$s,
not 0",
+ this::compareToIgnoreCase,
+ // Convert the result to a boolean. If "compareToIgnoreCase"
works as expected,
+ // zero is returned, and this function converts it to "true". If
a non-zero value
+ // is returned, something went wrong, and this function converts
it to "false".
+ r -> r == 0
+ );
+ }
+
+ private static <R> void
testAllAssignedPublicCodePointsForCaseInsensitiveEquality
+ (
+ final String testedClassName,
+ final String testedMethodName,
+ final String
testMethodInvocationFailureFormat,
+ final BiFunction<String,String,R> testFunction,
+ final Predicate<R> testResultValidator
+ ){
+ final String toUcFormat = "%n U+%7$04X -
Character.toUpperCase(0x%8$04X) returned code-point U+%9$04X, but " +
testMethodInvocationFailureFormat + '.';
+ final String toLcFormat = "%n U+%7$04X -
Character.toLowerCase(0x%8$04X) returned code-point U+%9$04X, but " +
testMethodInvocationFailureFormat + '.';
+ final String toLcToUcFormat = "%n U+%7$04X -
Character.toLowerCase(Character.toUpperCase(0x%8$04X)) returned code-point
U+%9$04X, but " + testMethodInvocationFailureFormat + '.';
+
+ try (
+ final Formatter error = new Formatter(new
StringBuilder(0)) // If the test succeeds, this "StringBuilder" remains
zero-length.
+ ){
+ int failures = 0;
+
+ // Examine *every* Unicode code-point.
+
+ for (int codePoint = Character.MIN_CODE_POINT; codePoint <=
Character.MAX_CODE_POINT; codePoint++) {
+ final int codePointType =
Character.getType(codePoint);
+
+ // Skip this code-point/-unit, if case is irrelevant to its
type (Unicode category).
+
+ if (codePointType == Character.UNASSIGNED || codePointType
== Character.PRIVATE_USE || codePointType == Character.SURROGATE)
+ continue;
+
+ // Replicate both conversions potentially performed by
case-insensitive comparison:
+ // 1. Convert the code-point to uppercase.
+ // 2. Convert the uppercase code-point to lowercase.
+
+ final int codePointUc =
Character.toUpperCase(codePoint );
+ final int codePointLc =
Character.toLowerCase(codePointUc);
+
+ // Skip this code-point, if neither case-conversion affected
it.
+
+ if (codePoint == codePointUc && codePointUc == codePointLc)
+ continue;
+
+ // KNOWN: "codePoint" is not equal to one or both of
"codePointUc" and "codePointLc".
+ // Therefore, "codePoint" has upper- and/or lower-case
counterparts, and is
+ // suitable for the following case-insensitive
"String" equality validation tests.
+
+ if (codePoint != codePointUc) {
+
+ // KNOWN: "codePoint" is lowercase, because
"toUpperCase(codePoint)" returned a different code-point.
+ // Therefore test "codePoint" vs. "codePointUc".
+
+ if
(!testMutualEqualityWithoutRedundantErrors(codePoint, testFunction,
testResultValidator, codePoint, codePointUc, error, toUcFormat))
+ failures++;
+
+ if (codePoint != codePointLc) {
+
+ // Triple-case code-point: There must be three
distinct forms of "codePoint" (including itself).
+ // This is very rare, and all examples are Basic
Multilingual Plane code-points, as of JDK 15.0.2
+ // and Unicode 13.0.0.
+
+ if
(!testMutualEqualityWithoutRedundantErrors(codePoint, testFunction,
testResultValidator, codePointUc, codePointLc, error, toLcToUcFormat))
+ failures++;
+ }
+
+ } else { // codePoint == codePointUc && codePointUc !=
codePointLc
+
+ // KNOWN: "codePoint" is uppercase, because
"toUpperCase(codePoint)" returned it.
+ // Therefore, test "codePoint" vs. "codePointLc".
+
+ if
(!testMutualEqualityWithoutRedundantErrors(codePoint, testFunction,
testResultValidator, codePoint, codePointLc, error, toLcFormat))
+ failures++;
+ }
+ }
+
+ if (failures != 0) {
+
+ error.format("%nTotal erroneous case-insensitive
code-point comparisons: %,d.", failures).flush();
+ fail("Method \"" + testedClassName + '.' +
testedMethodName + "\" erroneously reported differences between
case-insensitively equal code-points:" + error.out());
+ }
+ }
+ }
+
+    /**
+     * Compares two code-points, as single-code-point strings, with the supplied case-insensitive
+     * comparison and writes a report when they are judged unequal, unless the report would repeat an
+     * earlier one.
+     *
+     * <p>A failed comparison is reported when at least one of the following holds:</p>
+     * <ol>
+     *   <li>{@code testCodePoint} is not greater than either compared code-point, so a caller looping
+     *       in ascending order has not reached either of them before;</li>
+     *   <li>{@code testCodePoint} is neither {@code codePointA} nor {@code codePointB}, which the
+     *       callers use for triple-case code-points;</li>
+     *   <li>the comparison with its arguments reversed indicates equality, i.e. the two directions
+     *       disagree.</li>
+     * </ol>
+     * <p>The reversed comparison is only evaluated when neither of the first two conditions holds.</p>
+     *
+     * <p>Arguments passed to {@code failureFormat}: (1) string literal of the reported code-point,
+     * which is {@code codePointA} when {@code testCodePoint} is one of the compared code-points and
+     * {@code testCodePoint} otherwise; (2) always 0; (3) {@code codePointLength} of argument 1's
+     * code-point; (4) string literal of {@code codePointB}; (5) always 0; (6) {@code codePointLength}
+     * of {@code codePointB}; (7) {@code testCodePoint}; (8) argument 1's code-point as {@code int};
+     * (9) {@code codePointB}; (10) the result of comparing {@code codePointA} with {@code codePointB}.
+     * </p>
+     *
+     * @param <R>                result type of the comparison
+     * @param testCodePoint      code-point currently examined by the caller's loop
+     * @param comparisonFunction case-insensitive comparison to invoke
+     * @param resultIsEquality   returns {@code true} when a comparison result means "equal"
+     * @param codePointA         first code-point of the equality test
+     * @param codePointB         second code-point of the equality test
+     * @param out                receives the failure report, if any
+     * @param failureFormat      {@link Formatter} format string for the report (arguments listed above)
+     * @return {@code false} if a failure was reported; {@code true} if the code-points are identical,
+     *         compared as equal, or the failure was judged redundant
+     */
+    private static <R> boolean testMutualEqualityWithoutRedundantErrors(
+            final int                         testCodePoint,
+            final BiFunction<String,String,R> comparisonFunction,
+            final Predicate<R>                resultIsEquality,
+            final int                         codePointA,
+            final int                         codePointB,
+            final Formatter                   out,
+            final String                      failureFormat
+    ){
+        if (codePointA == codePointB)
+            return true; // Nothing to compare.
+
+        final String textA         = Character.toString(codePointA);
+        final String textB         = Character.toString(codePointB);
+        final R      forwardResult = comparisonFunction.apply(textA, textB);
+
+        if (resultIsEquality.test(forwardResult))
+            return true; // Success.
+
+        // The comparison failed. Decide whether this failure deserves a report.
+
+        final boolean firstOccurrence       = testCodePoint <= codePointA && testCodePoint <= codePointB;
+        final boolean involvesTestCodePoint = testCodePoint == codePointA || testCodePoint == codePointB;
+
+        if (!firstOccurrence
+                && involvesTestCodePoint
+                && !resultIsEquality.test(comparisonFunction.apply(textB, textA))) {
+
+            // Already reported when the loop visited the other code-point, and the reversed
+            // comparison fails consistently: a redundant failure, to be ignored.
+            return true;
+        }
+
+        // When "testCodePoint" is not one of the compared code-points, the report names
+        // "testCodePoint" in place of "codePointA".
+        // NOTE(review): in that case argument 10 is still the result of comparing "codePointA" with
+        // "codePointB" - confirm that every caller's format reads correctly with this substitution.
+        final int reported = involvesTestCodePoint ? codePointA : testCodePoint;
+
+        out.format(
+            failureFormat,
+            /*  1 */ StringLiteral.from(reported),
+            /*  2 */ 0,
+            /*  3 */ codePointLength(reported),
+            /*  4 */ StringLiteral.from(codePointB),
+            /*  5 */ 0,
+            /*  6 */ codePointLength(codePointB),
+            /*  7 */ testCodePoint,
+            /*  8 */ reported,
+            /*  9 */ codePointB,
+            /* 10 */ forwardResult
+        );
+
+        return false;
+    }
+
+    /**
+     * Runs the triple-case code-point equality test against {@code equalsIgnoreCase}, whose
+     * correct answer for every tested pair is {@code true}.
+     */
+    @Test
+    public void allTripleCaseCodePointsAreEqual_equalsIgnoreCase() {
+        final String testedClass      = getTestedClassFQN();
+        final String invocationFormat = getEqualsIgnoreCaseFormatterString() + " returned \"%10$s\", not \"true\"";
+
+        testTripleCaseCodePointEquality(
+            testedClass,
+            "equalsIgnoreCase",
+            invocationFormat,
+            this::equalsIgnoreCase,
+            isEqual -> isEqual // The comparison result is itself the verdict: "true" means equality.
+        );
+    }
+
+    /**
+     * Runs the triple-case code-point equality test against {@code compareToIgnoreCase}, whose
+     * correct answer for every tested pair is zero.
+     */
+    @Test
+    public void allTripleCaseCodePointsAreEqual_compareToIgnoreCase() {
+        final String testedClass      = getTestedClassFQN();
+        final String invocationFormat = getCompareToIgnoreCaseFormatterString() + " returned %10$s, not 0";
+
+        testTripleCaseCodePointEquality(
+            testedClass,
+            "compareToIgnoreCase",
+            invocationFormat,
+            this::compareToIgnoreCase,
+            difference -> difference == 0 // Zero means equality; anything else is a failure.
+        );
+    }
+
+ private static <R> void testTripleCaseCodePointEquality
+ (
+ final String testedClassName,
+ final String testedMethodName,
+ final String testedMethodFormatterString,
+ final BiFunction<String,String,R> testFunction,
+ final Predicate<R> testResultValidator
+ ){
+ try (
+ final Formatter error = new Formatter(new
StringBuilder(0)) // If the test succeeds, this "StringBuilder" remains
zero-length.
+ ){
+ // Test each of the triple-case code-points. (Other test methods
in this class test
+ // these code-points, but they do not single-out these
code-points, or supply a bug's
+ // probable fix.)
+
+ final String toUcFormat = " U+%7$04X -
Character.toUpperCase(0x%8$04X) returned code-point U+%9$04X, but " +
testedMethodFormatterString + ".%n";
+ final String toLcFormat = " U+%7$04X -
Character.toLowerCase(0x%8$04X) returned code-point U+%9$04X, but " +
testedMethodFormatterString + ".%n";
+ final String toLcToUcFormatCpLc = " U+%7$04X -
Character.toLowerCase(Character.toUpperCase(0x%<04X)) returned code-point
U+%9$04X, but " + testedMethodFormatterString + ".%n";
+ final String toLcToUcFormatLcCp = " U+%7$04X -
Character.toLowerCase(Character.toUpperCase(0x%<04X)) returned code-point
U+%8$04X, but " + testedMethodFormatterString + ".%n";
+ final int[] tripleCaseCodePoints =
getTripleCaseCodePoints();
+ int failures = 0;
+
+ for (final int codePoint : tripleCaseCodePoints) {
+ final int codePointUc =
Character.toUpperCase(codePoint );
+ final int codePointLc =
Character.toLowerCase(codePointUc);
+
+ if (!testMutualEqualityWithoutRedundantErrors(codePoint,
testFunction, testResultValidator, codePoint, codePointUc, error,
toUcFormat))
+ failures++;
+ if (!testMutualEqualityWithoutRedundantErrors(codePoint,
testFunction, testResultValidator, codePointUc, codePointLc, error,
toLcFormat))
+ failures++;
+ if (!testMutualEqualityWithoutRedundantErrors(codePoint,
testFunction, testResultValidator, codePoint, codePointLc, error,
toLcToUcFormatCpLc))
+ failures++;
+
+ // Repeat the preceding test with the code-points reversed.
The repetition is necessary to
+ // achieve full coverage of any implementation with a
"compareCodePointsIgnoringCase" method
+ // using a pair of "switch" expressions (which should contain
identical "case" clauses).
+
+ if (!testMutualEqualityWithoutRedundantErrors(codePoint,
testFunction, testResultValidator, codePointLc, codePoint, error,
toLcToUcFormatLcCp))
+ failures++;
+ }
+
+ if (failures != 0) {
+
+ error.format("%nReplacement for \"switch\" expression in
\"StringUTF16.getCIComparisonCodePoint\":%n%n");
+ generateTripleCaseCodePointsSwitchExpression(error,
"Code-Point Progression by Name", Character::getName, tripleCaseCodePoints);
+ // generateTripleCaseCodePointsSwitchExpression(error,
"Unicode Block Progression", Character.UnicodeBlock::of,
tripleCaseCodePoints);
+
+ error.flush();
+ fail(
+ """
+ Triple-case code-point case-insensitive equality test
failed by "%s.%s".%n\
+ TRY: Replace each "switch" expression in
"%1$s.getCIComparisonCodePoint"%n\
+ with the new expression at this message's end.%n\
+ %n\
+ Individual Failures:%n\
+ %s\
+ """.formatted(
+ testedClassName,
+ testedMethodName,
+ error.out()
+ )
+ );
+ }
+ }
+ }
+
+    /**
+     * Collects, in ascending order, every code-point {@code c} for which {@code c},
+     * {@code Character.toUpperCase(c)} and {@code Character.toLowerCase(Character.toUpperCase(c))}
+     * are three distinct code-points.
+     *
+     * @return a right-sized array of the triple-case code-points known to the running JDK
+     */
+    private static int[] getTripleCaseCodePoints() {
+        // Initial capacity 27: per the original author, the number of triple-case code-points as of
+        // JDK 15.0.2 / Unicode 13.0. The array doubles whenever it fills up.
+        @SuppressWarnings("MagicNumber")
+        int[] found = new int[27];
+        int   count = 0;
+
+        for (int cp = Character.MIN_CODE_POINT; cp <= Character.MAX_CODE_POINT; cp++) {
+            final int upper = Character.toUpperCase(cp);
+
+            if (cp != upper) {
+                final int lower = Character.toLowerCase(upper);
+
+                if (upper != lower && cp != lower) {
+
+                    // "cp" has, in effect, three cases. Make room if needed, then record it.
+                    if (count == found.length) {
+                        //noinspection ObjectAllocationInLoop
+                        found = Arrays.copyOf(found, found.length << 1);
+                    }
+
+                    found[count++] = cp;
+                }
+            }
+        }
+
+        // Trim unused capacity, if any.
+        return count == found.length ? found : Arrays.copyOf(found, count);
     }
-
+
+    /**
+     * Builds the source text of the triple-case code-point "switch" expression used in method
+     * "{@code java.lang.StringUTF16.getCIComparisonCodePoint}", so that the expression can be
+     * generated independently of these tests and their outcomes.
+     *
+     * @return source code for the "switch" expression in
+     *         "{@code StringUTF16.getCIComparisonCodePoint}", generated from the operative JDK's
+     *         Unicode data
+     */
+    public static String generateTripleCaseCodePointsSwitchExpression() {
+        final int initialCapacity = 5 * 1_024;
+
+        try (final Formatter out = new Formatter(new StringBuilder(initialCapacity))) {
+
+            // The symbolic column shows code-point names. "Character.UnicodeBlock::of" with the title
+            // "Unicode Block Progression" is the alternative the original author left disabled.
+            generateTripleCaseCodePointsSwitchExpression(
+                out,
+                "Code-Point Progression by Name",
+                Character::getName,
+                getTripleCaseCodePoints()
+            );
+
+            out.flush();
+            return out.out().toString();
+        }
+    }
+
+ /**
+ * Generates the triple-case code-point "switch" expression used in
method "{@code
+ * java.lang.StringUTF16.getCIComparisonCodePoint}", writing it to the
supplied {@link Formatter}.
+ *
+ * <p>Output order, as written by the statements below: a title comment line, an empty comment
+ * line, a column-title comment line, the "switch" header line followed by the column rules, one
+ * "case" line per entry of {@code tccpInfo.tccpData}, a closing rule line, and finally the
+ * {@code default} clause preceded by its explanatory comment.
+ * </p>
+ * <p>The last array element is read unconditionally to choose between the four-digit (BMP-only)
+ * and six-digit "case" formats, so {@code tripleCaseCodePoints} must not be empty.
+ * </p>
+ *
+ * @param out {@code Formatter} to which the
"switch" expression is written
+ * @param symbolicProgressionTitle title of column representing
code-point progression using names
+ * (instead of code-point numbers)
+ * @param symbolicNameAccessor function accepting a code-point and
returning an object whose
+ * {@code toString} method provides a
name relevant to the code-point
+ * @param tripleCaseCodePoints array containing every triple-case
code-point, sorted in ascending
+ * order. Normally supplied by {@link
#getTripleCaseCodePoints()}.
+ */
+ private static void generateTripleCaseCodePointsSwitchExpression
+ (
+ final Formatter out,
+ @SuppressWarnings("SameParameterValue")
+ final String symbolicProgressionTitle,
+ final IntFunction<Object> symbolicNameAccessor,
+ final int[] tripleCaseCodePoints // Sorted in
ascending order.
+ ){
+ final boolean bmpOnly =
tripleCaseCodePoints[tripleCaseCodePoints.length - 1] >>> Character.SIZE ==
0; // Is the greatest code-point within the Basic Multilingual Plane?
+ final String switchExpressionLine =
"switch (codePoint) {";
+ final String columnCPPTitle =
"Code-Point Progression";
+ final int columnCPPCodePoints;
// Width of "Code-Point Progression" column in code-points.
+ final String columnCPPLine;
// Column header/footer line, a sequence of HYPHEN-MINUS (U+002D)
characters.
+ final String columnGap = "
"; // Six spaces.
+ final int caseCodePoints = 30;
// Width of "case" clauses in code-points. There are two formats: "
case '\u0000' -> '\u0000';" and " case 0x000000 -> 0x000000;". Both
happen to be the same length.
+ final int commentAbsoluteIndentCodePoints =
Math.max(codePointCount(switchExpressionLine), caseCodePoints) + 4; //
Indent, from left margin, of the comments documenting code-point and
Unicode block progressions.
+ final String commentAbsoluteIndent = "
".repeat(commentAbsoluteIndentCodePoints);
+ final String caseCommentGap = "
".repeat(commentAbsoluteIndentCodePoints - caseCodePoints); // Gap between
"case" semicolon and line-comment start.
+ final String caseFormat;
+ final TripleCaseCodePoints tccpInfo = new
TripleCaseCodePoints(tripleCaseCodePoints, symbolicNameAccessor);
+ final int columnSNPCodePoints =
tccpInfo.name3cCodePointsMax + 4 + tccpInfo.nameUcCodePointsMax + 4 +
tccpInfo.nameLcCodePointsMax; // Width of symbolic name progression (SNP)
column in code-points.
+ final String columnSNPLine =
"-".repeat(columnSNPCodePoints); // Column header/footer line, a sequence
of HYPHEN-MINUS (U+002D) characters.
+
+ if (bmpOnly) { // Code-points from Basic Multilingual Plane (BMP)
only.
+
+ //noinspection StringConcatenationMissingWhitespace
+ caseFormat = " case '\\u%1$04X' ->
'\\u%3$04X';%4$s// U+%1$04X -> U+%2$04X -> U+%3$04X%5$s%6$-" +
tccpInfo.name3cCodePointsMax + "s -> %7$-" + tccpInfo.nameUcCodePointsMax +
"s -> %8$s%n";
+ columnCPPCodePoints = 2 + 4 + 4 + 2 + 4 + 4 + 2 + 4; // For
example, "U+00B5 -> U+039C -> U+03BC".
+
+ } else { // Code-points from both BMP and Supplementary
Multilingual Plane.
+
+ //noinspection StringConcatenationMissingWhitespace
+ caseFormat = " case 0x%1$06X -> 0x%3$06X;%4$s//
U+%1$06X -> U+%2$06X -> U+%3$06X%5$s%6$-" + tccpInfo.name3cCodePointsMax +
"s -> %7$-" + tccpInfo.nameUcCodePointsMax + "s -> %8$s%n";
+ columnCPPCodePoints = 2 + 6 + 4 + 2 + 6 + 4 + 2 + 6; // For
example, "U+0000B5 -> U+00039C -> U+0003BC".
+ }
+
+ columnCPPLine = "-".repeat(columnCPPCodePoints); // Rule exactly as wide as the code-point progression column.
+
+ out.format( // Comment line: title.
+ "%s// Triple-Case Code-Points, as of Java %s. (Written by
\"%s.generateTripleCaseCodePointsSwitchExpression\".)%n",
+ commentAbsoluteIndent,
+ System.getProperty("java.version"),
+ CompareIC.class.getName()
+ );
+ out.format("%s// %n", commentAbsoluteIndent); // Comment line: empty separator.
+ out.format( // Comment line: column titles.
+ "%s// %s%s%s%s%n",
+ commentAbsoluteIndent,
+ columnCPPTitle,
+ " ".repeat(columnCPPCodePoints -
codePointCount(columnCPPTitle)),
+ columnGap,
+ symbolicProgressionTitle
+ );
+ out.format( // Comment line: column horizontal lines.
+ "%s%s// %s%s%s%n",
+ switchExpressionLine,
+ " ".repeat(commentAbsoluteIndentCodePoints -
codePointCount(switchExpressionLine)),
+ columnCPPLine,
+ columnGap,
+ columnSNPLine
+ );
+
+ for (final TripleCaseCodePoint tccpDatum : tccpInfo.tccpData) {
+
+ out.format(
+ caseFormat,
+ /* 1 */ tccpDatum.codePoint3c,
+ /* 2 */ tccpDatum.codePointUc,
+ /* 3 */ tccpDatum.codePointLc,
+ /* 4 */ caseCommentGap,
+ /* 5 */ columnGap,
+ /* 6 */ tccpDatum.codePoint3cName,
+ /* 7 */ tccpDatum.codePointUcName,
+ /* 8 */ tccpDatum.codePointLcName
+ );
+ }
+
+ out.format("%s// %s%s%s%n", commentAbsoluteIndent, columnCPPLine,
columnGap, columnSNPLine); // Comment line: column horizontal lines.
+ out.format(
+ """
+ // All other case-sensitive code-points are either uppercase
(in which case they are changed%n\
+ // below), or lowercase already (in which case they are not).
Therefore, only "toLowerCase"%n\
+ // is necessary. Code-units and case-insensitive code-points
are unchanged by "toLowerCase".%n\
+ default -> Character.toLowerCase(codePoint);%n\
+ };%n\
+ """
+ );
+ }
+
+    /**
+     * <p>Test premise: "Case-insensitive equality exists only among Supplementary Multilingual Plane
+     * (SMP) code-points UTF-16 encoded using the same high-surrogate code-unit." According to the
+     * original author, {@code java.lang.StringUTF16} calls this premise "Unicode Empirical Property
+     * no. 1" (see the {@code regionMatchesCI} implementation notes) and it holds for Unicode versions
+     * up to (at least) 13.0.0. It permits an optimization of UTF-16 case-insensitive comparison:
+     * surrogate pairs with different high-surrogates prove inequality, so the comparison may stop
+     * without UTF-16 decoding or case-insensitive code-point comparison. (For
+     * {@code java.lang.StringUTF16.compareToCI}, returning the difference between the
+     * high-surrogates is correct.)
+     * </p>
+     * <p>If this premise ceases to be valid (in other words, if this test fails), methods
+     * {@code java.lang.StringUTF16.regionMatchesCI} and {@code java.lang.StringUTF16.compareToCI}
+     * will require de-optimization: they must test for case-insensitive equality between all SMP
+     * code-points.
+     * </p>
+     * <p>The optimization cannot be guaranteed safe for Java code in general, because such code could
+     * someday run on a Java version whose {@link Character} class represents a Unicode version that
+     * invalidates the premise. It is safe for the code of each Java version passing this test,
+     * because that code can only encounter the Unicode version built into its own {@code Character}
+     * class.
+     * </p>
+     * <p>The test prints its premise, every invalidating case and a summary to standard output, and
+     * fails if at least one invalidating case was found.
+     * </p>
+     *
+     * @see #validatePremise_AllSMPCodePointCaseVariantsAreSMPCodePoints()
+     * @see #validatePremise_AllBMPCodePointCaseVariantsAreBMPCodePoints()
+     */
+    @SuppressWarnings("JavadocReference")
+    @Test
+    public void validatePremise_AllSMPCodePointCaseVariantsUseTheSameHighSurrogate() {
+
+        System.out.println();
+        System.out.printf(
+            "PREMISE: Case-insensitive equality exists only among Supplementary Multilingual Plane (SMP) code-points UTF-16%n" +
+            "encoded using the same high-surrogate code-unit.%n"
+        );
+
+        int violations    = 0;
+        int caseSensitive = 0;
+
+        for (int cp = Character.MIN_SUPPLEMENTARY_CODE_POINT; cp <= Character.MAX_CODE_POINT; cp++) {
+
+            final int category = Character.getType(cp);
+            if (category == Character.UNASSIGNED || category == Character.PRIVATE_USE)
+                continue;
+
+            final int upper = Character.toUpperCase(cp);
+            final int lower = Character.toLowerCase(upper);
+
+            if (cp == upper && upper == lower)
+                continue; // Not affected by case conversion.
+
+            caseSensitive++;
+
+            // Two supplementary code-points share a high-surrogate exactly when they agree in every
+            // bit above the low ten. The surrogate values printed below are computed as
+            // "MIN_HIGH_SURROGATE | ((codePoint - MIN_SUPPLEMENTARY_CODE_POINT) >>> 10)".
+
+            if ((cp >>> 10) != (upper >>> 10)) {
+                final int cpHigh    = Character.MIN_HIGH_SURROGATE | ((cp    - Character.MIN_SUPPLEMENTARY_CODE_POINT) >>> 10);
+                final int upperHigh = Character.MIN_HIGH_SURROGATE | ((upper - Character.MIN_SUPPLEMENTARY_CODE_POINT) >>> 10);
+
+                System.out.printf(
+                    " > INVALIDATION: toUpperCase(U+%06X) == U+%06X. Their high-surrogates differ: U+%04X != U+%04X.%n",
+                    cp,
+                    upper,
+                    cpHigh,
+                    upperHigh
+                );
+                violations++;
+            }
+
+            if ((upper >>> 10) != (lower >>> 10)) {
+                final int upperHigh = Character.MIN_HIGH_SURROGATE | ((upper - Character.MIN_SUPPLEMENTARY_CODE_POINT) >>> 10);
+                final int lowerHigh = Character.MIN_HIGH_SURROGATE | ((lower - Character.MIN_SUPPLEMENTARY_CODE_POINT) >>> 10);
+
+                System.out.printf(
+                    " > INVALIDATION: toLowerCase(U+%06X) == U+%06X. Their high-surrogates differ: U+%04X != U+%04X.%n",
+                    upper,
+                    lower,
+                    upperHigh,
+                    lowerHigh
+                );
+                violations++;
+            }
+        }
+
+        final int    examinedRange = Character.MAX_CODE_POINT - Character.MIN_SUPPLEMENTARY_CODE_POINT + 1;
+        final String foundText;
+
+        if (violations == 0)
+            foundText = "none";
+        else
+            foundText = String.format("%,d", violations);
+
+        System.out.printf(
+            " * Of %,d SMP code-points, %,d are assigned, public and affected by case-conversion.%n" +
+            " * Tested case variants of those %<,d SMP code-points for unequal high-surrogates.%n" +
+            " * Found %s.%n",
+            examinedRange,
+            caseSensitive,
+            foundText
+        );
+
+        if (violations != 0)
+            fail(String.format("TEST RESULT: Premise is false, because %,d case variants use different high-surrogates.", violations));
+
+        System.out.printf(
+            "TEST RESULT: Premise is valid for operative Unicode version, because all SMP code-point case variants used%n" +
+            " common high-surrogates.%n%n"
+        );
+    }
+
+    /**
+     * <p>Test premise: "For each Supplementary Multilingual Plane (SMP) code-point,
+     * {@link Character#toUpperCase(int)} and {@code Character.toLowerCase(Character.toUpperCase(int))}
+     * return a SMP code-point." According to the original author, {@code java.lang.StringUTF16}
+     * refers to this premise as "Unicode empirical property no. 2".
+     * </p>
+     * <p>Someday, {@link Character} might provide a version of Unicode (greater than 13.0) that
+     * invalidates the premise. That would break every case-insensitive Unicode equality comparison
+     * requiring equal-length inputs (measured in "{@code char}" primitives, instead of code-points),
+     * for example {@link String#equalsIgnoreCase}.
+     * </p>
+     * <p>Even after resolving the length issue, methods {@code java.lang.StringUTF16.compareToCI}
+     * and {@code java.lang.StringUTF16.regionMatchesCI} would still be broken, because this premise
+     * and {@linkplain #validatePremise_AllBMPCodePointCaseVariantsAreBMPCodePoints() another} are the
+     * bases of an optimization: unequal code-points are tested for case-insensitive equality only
+     * when both belong to either the Basic Multilingual Plane, or the SMP. (In the SMP case, a
+     * further optimization relies on a
+     * {@linkplain #validatePremise_AllSMPCodePointCaseVariantsUseTheSameHighSurrogate() further premise}
+     * tested by this class.)
+     * </p>
+     * <p>The test prints its premise, every invalidating case and a summary to standard output, and
+     * fails if at least one invalidating case was found.
+     * </p>
+     *
+     * @see #validatePremise_AllBMPCodePointCaseVariantsAreBMPCodePoints
+     * @see #validatePremise_AllSMPCodePointCaseVariantsUseTheSameHighSurrogate
+     */
+    @Test
+    public void validatePremise_AllSMPCodePointCaseVariantsAreSMPCodePoints() {
+
+        System.out.println();
+        System.out.println("PREMISE: All Supplementary Multilingual Plane (SMP) code-points are case-converted to SMP code-points.");
+
+        int violations    = 0;
+        int caseSensitive = 0;
+
+        for (int cp = Character.MIN_SUPPLEMENTARY_CODE_POINT; cp <= Character.MAX_CODE_POINT; cp++) {
+
+            final int category = Character.getType(cp);
+            if (category == Character.UNASSIGNED || category == Character.PRIVATE_USE)
+                continue;
+
+            final int upper = Character.toUpperCase(cp);
+            final int lower = Character.toLowerCase(upper);
+
+            if (cp == upper && upper == lower)
+                continue; // Not affected by case conversion.
+
+            caseSensitive++;
+
+            // At most one invalidation is counted per code-point: the uppercase form is checked
+            // first, and the lowercase form only when the uppercase form is still supplementary.
+
+            final boolean upperIsBmp = upper < Character.MIN_SUPPLEMENTARY_CODE_POINT;
+
+            if (upperIsBmp) {
+                System.out.printf(" > INVALIDATION: toUpperCase(0x%06X) == 0x%04X.%n", cp, upper);
+                violations++;
+                continue;
+            }
+
+            if (lower < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
+                System.out.printf(" > INVALIDATION: toLowerCase(toUpperCase(0x%06X)) == 0x%04X.%n", cp, lower);
+                violations++;
+            }
+        }
+
+        final int    examinedRange = Character.MAX_CODE_POINT - Character.MIN_SUPPLEMENTARY_CODE_POINT + 1;
+        final String foundText;
+
+        if (violations == 0)
+            foundText = "none";
+        else
+            foundText = String.format("%,d", violations);
+
+        System.out.printf(
+            " * Of %,d SMP code-points, %,d are assigned, public and affected by case-conversion.%n" +
+            " * Tested case variants of those %<,d SMP code-points for Basic Multilingual Plane (BMP) code-points.%n" +
+            " * Found %s.%n",
+            examinedRange,
+            caseSensitive,
+            foundText
+        );
+
+        if (violations != 0)
+            fail(String.format("TEST RESULT: Premise is false, because %,d SMP code-points were case-converted to BMP code-points.", violations));
+
+        System.out.printf("TEST RESULT: Premise is valid for operative Unicode version, because all case variants were also SMP code-points.%n%n");
+    }
+
+ /**
+ * <p>Test premise: "For each Basic Multilingual Plane (BMP)
code-point, {@link Character#toUpperCase(int)}
+ * and {@code Character.toLowerCase(Character.toUpperCase(int))}
return a BMP code-point." In {@code
+ * java.lang.StringUTF16}, this premise is referred to as "Unicode
empirical property no. 3".
+ * </p>
+ * <p>Someday, {@link Character} might provide a version of Unicode
(greater than 13.0) invalidating that premise.
+ * If so, it will break all case-insensitive Unicode equality
comparisons requiring equal-length inputs (measured in
+ * "{@code char}" primitives, instead of code-points). For example,
{@link String#equalsIgnoreCase} will break.
+ * </p>
+ * <p>However, even after resolving all aspects of the length issue
everywhere it hides, methods {@code
+ * java.lang.StringUTF16#compareToCI} and {@code
java.lang.StringUTF16#regionMatchesCI} would still be broken,
+ * because this premise, and {@linkplain
#validatePremise_AllSMPCodePointCaseVariantsAreSMPCodePoints() another},
+ * are the bases of an optimization: unequal code-points are tested
for case-insensitive equality only when both
+ * belong to either the BMP, or the Supplementary Multilingual Plane
(SMP). (In the SMP case, a further optimization
+ * relies on a {@linkplain
#validatePremise_AllSMPCodePointCaseVariantsUseTheSameHighSurrogate()
further premise}
+ * tested by this class.)
+ * </p>
+ *
+ * @see #validatePremise_AllSMPCodePointCaseVariantsAreSMPCodePoints()
+ * @see
#validatePremise_AllSMPCodePointCaseVariantsUseTheSameHighSurrogate()
+ */
+ @Test
+ public void
validatePremise_AllBMPCodePointCaseVariantsAreBMPCodePoints() {
+
+ System.out.println();
+ System.out.println("PREMISE: All Basic Multilingual Plane (BMP)
code-points are case-converted to BMP code-points.");
+
+ int invalidatingCases = 0;
+ int caseSensitiveCodePoints = 0;
+
+ for (int codePoint = Character.MIN_CODE_POINT; codePoint <
Character.MIN_SUPPLEMENTARY_CODE_POINT; codePoint++) {
+ final int codePointType =
Character.getType(codePoint);
+
+ if (codePointType == Character.UNASSIGNED || codePointType ==
Character.PRIVATE_USE || codePointType == Character.SURROGATE)
+ continue;
+
+ final int codePointUc =
Character.toUpperCase(codePoint );
+ final int codePointLc =
Character.toLowerCase(codePointUc);
+
+ if (codePoint == codePointUc && codePointUc == codePointLc)
+ continue;
+
+ caseSensitiveCodePoints++;
+
+ if (codePointUc >= Character.MIN_SUPPLEMENTARY_CODE_POINT) {
+
+ System.out.printf(" > INVALIDATION: toUpperCase(U+%04X)
== U+%06X.%n", codePoint, codePointUc);
+ invalidatingCases++;
+
+ } else if (codePointLc >=
Character.MIN_SUPPLEMENTARY_CODE_POINT) {
+
+ System.out.printf(" > INVALIDATION:
toLowerCase(toUpperCase(0x%04X)) == 0x%06X.%n", codePoint, codePointLc);
+ invalidatingCases++;
+ }
+ }
+
+ System.out.printf(
+ " * Of %,d BMP code-points, %,d are assigned, public and
affected by case-conversion.%n" +
+ " * Tested case variants of those %<,d BMP code-points for
Supplementary Multilingual Plane (SMP) code-points.%n" +
+ " * Found %s.%n",
+ Character.MIN_SUPPLEMENTARY_CODE_POINT -
(Character.MAX_SURROGATE - Character.MIN_SURROGATE + 1),
+ caseSensitiveCodePoints,
+ invalidatingCases == 0 ? "none" :
"%,d".formatted(invalidatingCases)
+ );
+
+ if (invalidatingCases != 0)
+ fail(String.format("TEST RESULT: Premise is false, because %,d
BMP code-points were case-converted to SMP code-points.",
invalidatingCases));
+
+ System.out.printf("TEST RESULT: Premise is valid for operative
Unicode version, because all case variants were also BMP code-points.%n%n");
+ }
+
+    /**
+     * Test premise: "Every surrogate {@code char}, from {@link Character#MIN_SURROGATE} through
+     * {@link Character#MAX_SURROGATE}, is returned unchanged by both
+     * {@link Character#toUpperCase(char)} and {@link Character#toLowerCase(char)}."
+     */
+    @Test
+    public void validatePremise_SurrogatesAreUnchangedByCaseConversion() {
+
+        // MAX_SURROGATE is below Character.MAX_VALUE, so the "char" counter cannot wrap around.
+        for (char surrogate = Character.MIN_SURROGATE; surrogate <= Character.MAX_SURROGATE; surrogate++) {
+
+            // A raw surrogate is unreadable in a report, so name it in hexadecimal.
+            final String surrogateName = String.format("U+%04X", (int) surrogate);
+
+            // TestNG's argument order is (actual, expected, message).
+            assertEquals(Character.toUpperCase(surrogate), surrogate,
+                         "toUpperCase changed surrogate " + surrogateName);
+            assertEquals(Character.toLowerCase(surrogate), surrogate,
+                         "toLowerCase changed surrogate " + surrogateName);
+        }
+    }
+
+ private static int codePointLength
+ (
+ final int codePoint
+ ){
+ return codePoint >>> Character.SIZE == 0 ? 1 : 2;
+ }
+
+ private static int codePointCount
+ (
+ final String chars
+ ){
+ return chars.codePointCount(0, chars.length());
+ }
+
+
+    /**
+     * A table of {@link TripleCaseCodePoint} entries, one per input code-point, together with
+     * the greatest name length (measured in code-points) found in each of the three name
+     * columns.
+     */
+    private static class TripleCaseCodePoints {
+
+        /** One entry per element of the constructor's input array, in the same order. */
+        final TripleCaseCodePoint[] tccpData;
+        /** Greatest code-point count among the entries' {@code codePoint3cName} values. */
+        final int                   name3cCodePointsMax;
+        /** Greatest code-point count among the entries' {@code codePointUcName} values. */
+        final int                   nameUcCodePointsMax;
+        /** Greatest code-point count among the entries' {@code codePointLcName} values. */
+        final int                   nameLcCodePointsMax;
+
+
+        /**
+         * Builds one entry per input code-point and records the widest name in each column.
+         *
+         * @param codePoint3cArray      the code-points to describe
+         * @param codePointNameAccessor supplies an object whose {@code toString()} names a
+         *                              given code-point
+         */
+        @SuppressWarnings("ObjectAllocationInLoop")
+        TripleCaseCodePoints
+        (
+            final int[]               codePoint3cArray,
+            final IntFunction<Object> codePointNameAccessor
+        ){
+            final TripleCaseCodePoint[] entries  = new TripleCaseCodePoint[codePoint3cArray.length];
+            int                         widest3c = 0;
+            int                         widestUc = 0;
+            int                         widestLc = 0;
+
+            for (int i = 0; i < entries.length; i++) {
+                final TripleCaseCodePoint entry =
+                    new TripleCaseCodePoint(codePoint3cArray[i], codePointNameAccessor);
+
+                entries[i] = entry;
+
+                widest3c = Math.max(widest3c, codePointCount(entry.codePoint3cName));
+                widestUc = Math.max(widestUc, codePointCount(entry.codePointUcName));
+                widestLc = Math.max(widestLc, codePointCount(entry.codePointLcName));
+            }
+
+            this.tccpData            = entries;
+            this.name3cCodePointsMax = widest3c;
+            this.nameUcCodePointsMax = widestUc;
+            this.nameLcCodePointsMax = widestLc;
+        }
+    }
+
+
+    /**
+     * A code-point bundled with its upper-case variant, the lower-case variant of that
+     * upper-case variant, and a name for each of the three.
+     */
+    private static class TripleCaseCodePoint {
+
+        /** The code-point supplied to the constructor. */
+        final int    codePoint3c;
+        final String codePoint3cName;
+        /** {@code Character.toUpperCase(codePoint3c)}. */
+        final int    codePointUc;
+        final String codePointUcName;
+        /** {@code Character.toLowerCase(codePointUc)}. */
+        final int    codePointLc;
+        final String codePointLcName;
+
+
+        /**
+         * Derives the case variants of a code-point and looks up all three names.
+         *
+         * @param codePoint3c           the code-point to describe
+         * @param codePointNameAccessor supplies an object whose {@code toString()} names a
+         *                              given code-point; it is invoked for the original, the
+         *                              upper-case and the lower-case code-point, in that order
+         */
+        TripleCaseCodePoint
+        (
+            final int                 codePoint3c,
+            final IntFunction<Object> codePointNameAccessor
+        ){
+            final int upper        = Character.toUpperCase(codePoint3c);
+            final int lowerOfUpper = Character.toLowerCase(upper);
+
+            this.codePoint3c = codePoint3c;
+            this.codePointUc = upper;
+            this.codePointLc = lowerOfUpper;
+
+            // The accessor is called in the fixed order original, upper, lower.
+            final Object name3c = codePointNameAccessor.apply(codePoint3c);
+            this.codePoint3cName = name3c.toString();
+
+            final Object nameUc = codePointNameAccessor.apply(upper);
+            this.codePointUcName = nameUc.toString();
+
+            final Object nameLc = codePointNameAccessor.apply(lowerOfUpper);
+            this.codePointLcName = nameLc.toString();
+        }
+    }
+
+
+    /**
+     * Renders code-points and character sequences as double-quoted, Java-style string
+     * literals, using backslash escapes and 16-bit {@code \}{@code uXXXX} escapes so that the
+     * output stays readable wherever it is printed.
+     */
+    private static final class StringLiteral {
+
+        /** Upper-case hexadecimal digits, indexed by nibble value. */
+        private static final char[]   NIBBLES = "0123456789ABCDEF".toCharArray();
+
+        /** Replacement text indexed by ASCII {@code char}; {@code null} means "no dedicated escape". */
+        private static final String[] ESCAPES = new String[128];
+
+        static {
+            ESCAPES['\b'] = "\\b";
+            ESCAPES['\t'] = "\\t";
+            ESCAPES['\n'] = "\\n";
+            ESCAPES['\f'] = "\\f";
+            ESCAPES['\r'] = "\\r";
+            ESCAPES['"' ] = "\\\"";
+            ESCAPES['\\'] = "\\\\";
+            // SPACE belongs to category SPACE_SEPARATOR, which is otherwise Unicode-escaped.
+            // Mapping it to itself keeps it literal without an extra conditional test.
+            ESCAPES[' ' ] = " ";
+        }
+
+
+        /**
+         * Renders one code-point as a quoted literal made solely of Unicode escapes.
+         *
+         * @param codePoint the code-point to render
+         * @return one {@code \}{@code uXXXX} escape in double quotes when no bit above the low
+         *         16 is set, otherwise two escapes (high then low surrogate) in double quotes
+         */
+        static String from
+        (
+            final int codePoint
+        ){
+            try {
+                // 14 == two quotes + two six-character escapes, the longest possible output.
+                return append(new StringBuilder(14), codePoint).toString();
+            } catch (final IOException e) {
+                throw new UncheckedIOException(e); // Operating on a "StringBuilder", this case cannot arise.
+            }
+        }
+
+        /**
+         * Renders a string as a quoted, escaped literal.
+         *
+         * @param chars the text to render
+         * @return {@code chars} enclosed in double quotes, with escapes applied as described
+         *         by {@link #append(Appendable, CharSequence)}
+         */
+        static String from
+        (
+            final String chars
+        ){
+            try {
+                // The output always carries two quote characters, so it is never the same
+                // length as the input; size the builder for the best case.
+                return append(new StringBuilder(chars.length() + 2), chars).toString();
+            } catch (final IOException e) {
+                throw new UncheckedIOException(e); // Operating on a "StringBuilder", this case cannot arise.
+            }
+        }
+
+        /**
+         * Renders a character sequence as a quoted, escaped literal.
+         *
+         * @param chars the text to render
+         * @return {@code chars} enclosed in double quotes, with escapes applied as described
+         *         by {@link #append(Appendable, CharSequence)}
+         */
+        static CharSequence from
+        (
+            final CharSequence chars
+        ){
+            try {
+                // See from(String): the quotes make the output strictly longer than the input.
+                return append(new StringBuilder(chars.length() + 2), chars).toString();
+            } catch (final IOException e) {
+                throw new UncheckedIOException(e); // Operating on a "StringBuilder", this case cannot arise.
+            }
+        }
+
+        /**
+         * Appends one code-point to {@code out} as a quoted literal made solely of Unicode
+         * escapes.
+         *
+         * @param out       the destination
+         * @param codePoint the code-point to render
+         * @param <T>       the destination's type
+         * @return {@code out}
+         * @throws IOException if {@code out} fails to accept the output
+         */
+        static <T extends Appendable> T append
+        (
+            final T   out,
+            final int codePoint
+        )
+            throws IOException
+        {
+            out.append("\"\\u");
+
+            if (Character.isBmpCodePoint(codePoint)) {
+
+                appendHex(out, 4, 4, codePoint).append('"');
+
+            } else {
+
+                appendHex(out, 4, 4, Character.highSurrogate(codePoint)).append("\\u");
+                appendHex(out, 4, 4, Character.lowSurrogate(codePoint)).append('"');
+            }
+
+            return out;
+        }
+
+        /**
+         * Appends a character sequence to {@code out} as a quoted, escaped literal. Each
+         * {@code char} is handled independently:
+         * <ul>
+         * <li>ASCII chars with an entry in the escape table are replaced by that entry;</li>
+         * <li>chars whose {@link Character#getType(char) category} is
+         *     {@link Character#UNASSIGNED}, or lies in the range
+         *     {@link Character#SPACE_SEPARATOR} through {@link Character#SURROGATE}, are
+         *     written as a four-digit Unicode escape;</li>
+         * <li>all other chars are copied unchanged.</li>
+         * </ul>
+         *
+         * @param out   the destination
+         * @param chars the text to render
+         * @param <T>   the destination's type
+         * @return {@code out}
+         * @throws IOException if {@code out} fails to accept the output
+         */
+        static <T extends Appendable> T append
+        (
+            final T            out,
+            final CharSequence chars
+        )
+            throws IOException
+        {
+            out.append('"');
+
+            final int length = chars.length();
+
+            for (int charIdx = 0; charIdx < length; charIdx++) {
+                final char   c      = chars.charAt(charIdx);
+                final String escape = c < ESCAPES.length ? ESCAPES[c] : null;
+
+                if (escape != null) {
+                    out.append(escape);
+                    continue;
+                }
+
+                final int codePointCategory = Character.getType(c);
+
+                if (codePointCategory == Character.UNASSIGNED // Also includes "permanently reserved" code-points.
+                    || codePointCategory >= Character.SPACE_SEPARATOR
+                    && codePointCategory <= Character.SURROGATE
+                ){
+                    // Represent whitespace, invisibles, the surrogates of supplementary
+                    // code-points, and so forth, using 16-bit Unicode escapes. "Character"
+                    // class identifiers for those Unicode categories are:
+                    //      0 - UNASSIGNED
+                    //     12 - SPACE_SEPARATOR (the escape table prevents encoding of "SPACE", U+0020)
+                    //     13 - LINE_SEPARATOR
+                    //     14 - PARAGRAPH_SEPARATOR
+                    //     15 - CONTROL
+                    //     16 - FORMAT
+                    //     17 - (unused)
+                    //     18 - PRIVATE_USE
+                    //     19 - SURROGATE
+
+                    appendHex(out.append("\\u"), 4, 4, c);
+
+                } else {
+                    out.append(c);
+                }
+            }
+
+            out.append('"');
+
+            return out;
+        }
+
+        /**
+         * Appends the low-order hexadecimal digits of {@code value}, in upper case. Exactly
+         * the lowest {@code hexDigitsMax} nibbles are considered; any higher bits are ignored.
+         * Leading zero nibbles are dropped until only {@code hexDigitsMin} digits remain.
+         *
+         * @param out          the destination
+         * @param hexDigitsMin the least number of digits to write; at least 1
+         * @param hexDigitsMax the greatest number of digits to write; at least
+         *                     {@code hexDigitsMin} and at most 8
+         * @param value        the value whose digits are written
+         * @param <T>          the destination's type
+         * @return {@code out}
+         * @throws IllegalArgumentException if the digit-count constraints are violated
+         * @throws IOException              if {@code out} fails to accept the output
+         */
+        @SuppressWarnings("MagicNumber")
+        private static <T extends Appendable> T appendHex
+        (
+            final T   out,
+            @SuppressWarnings("SameParameterValue")
+            final int hexDigitsMin,
+            @SuppressWarnings("SameParameterValue")
+            final int hexDigitsMax,
+            final int value
+        )
+            throws IOException
+        {
+            // NOTE: If necessary, this method can be made to support 64-bit values by changing
+            // "value" from "int" to "long", and replacing "Integer" class references with "Long".
+
+            final int hexDigitsLimit = Integer.SIZE >>> 2;
+
+            // At least one digit is always written, so a minimum below 1 cannot be honoured.
+            // It would also let the shift below go negative, which Java masks into a read of
+            // the wrong nibble.
+            if (hexDigitsMin < 1 || hexDigitsMin > hexDigitsMax || hexDigitsMax > hexDigitsLimit)
+                throw new IllegalArgumentException(
+                    "Violation of parameter value constraints: 0 < \"hexDigitsMin\" (" + hexDigitsMin
+                    + ") <= \"hexDigitsMax\" (" + hexDigitsMax + ") <= " + hexDigitsLimit + '.');
+
+            int shift  = (hexDigitsMax - 1) << 2;   // Bit offset of the most significant nibble considered.
+            int nibble = value >>> shift & 0xF;
+
+            if (nibble == 0 && hexDigitsMin < hexDigitsMax) {
+                int optionalLeadingZeros = hexDigitsMax - hexDigitsMin;
+
+                do {
+
+                    shift -= 4;                      // Ignore the current zero nibble.
+                    nibble = value >>> shift & 0xF;  // Read the next nibble.
+
+                } while (nibble == 0 && --optionalLeadingZeros > 0);
+            }
+
+            out.append(NIBBLES[nibble]);
+
+            for (shift -= 4; shift >= 0; shift -= 4)
+                out.append(NIBBLES[value >>> shift & 0xF]);
+
+            return out;
+        }
+    }
}
More information about the core-libs-dev
mailing list