RFR: 8254073: Tokenizer improvements (revised)

Tue Oct 6 14:07:15 UTC 2020

This is a full revision of https://github.com/openjdk/jdk/pull/435, which contained two off-by-one bugs and was
reverted.

This revision contains the changes of that pull request plus:

diff --git a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java
index 39d9eadcf3a..b8425ad1ecb 100644
--- a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java
+++ b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java
@@ -306,8 +306,9 @@ public class JavadocTokenizer extends JavaTokenizer {
      *
      * Thus, to find the source position of any position, p, in the comment
      * string, find the index, i, of the pair whose string offset
-     * ({@code map[i + SB_OFFSET] }) is closest to but not greater than p. Then,
-     * {@code sourcePos(p) = map[i + POS_OFFSET] + (p - map[i + SB_OFFSET]) }.
+     * ({@code map[i * NOFFSETS + SB_OFFSET] }) is closest to but not greater
+     * than p. Then, {@code sourcePos(p) = map[i * NOFFSETS + POS_OFFSET] +
+     *                                (p - map[i * NOFFSETS + SB_OFFSET]) }.
      */
     static class OffsetMap {
         /**
@@ -426,7 +427,7 @@ public class JavadocTokenizer extends JavaTokenizer {
             int start = 0;
             int end = size / NOFFSETS;
 
-            while (start < end - NOFFSETS) {
+            while (start < end - 1) {
                 // find an index midway between start and end
                 int index = (start + end) / 2;
                 int indexScaled = index * NOFFSETS;
diff --git a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java
index 2472632dbcd..7584b79044b 100644
--- a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java
+++ b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java
@@ -221,48 +221,49 @@ public class UnicodeReader {
     private boolean unicodeEscape() {
         // Start of unicode escape (past backslash.)
         int start = position + width;
-        int index;
+
+        // Default to backslash result, unless proven otherwise.
+        character = '\\';
+        width = 1;
 
         // Skip multiple 'u'.
+        int index;
         for (index = start; index < length; index++) {
             if (buffer[index] != 'u') {
                 break;
             }
         }
 
-        // Needs to be at least backslash-u.
-        if (index != start) {
-            // If enough characters available.
-            if (index + 4 < length) {
-                // Convert four hex digits to codepoint. If any digit is invalid then the
-                // result is negative.
-                int code = (Character.digit(buffer[index++], 16) << 12) |
-                           (Character.digit(buffer[index++], 16) << 8) |
-                           (Character.digit(buffer[index++], 16) << 4) |
-                            Character.digit(buffer[index++], 16);
-
-                // If all digits are good.
-                if (code >= 0) {
-                    width = index - position;
-                    character = (char)code;
-
-                    return true;
-                }
-            }
+        // Needs to have been at least one u.
+        if (index == start) {
+            return false;
+        }
 
-            // Did not work out.
-            log.error(position, Errors.IllegalUnicodeEsc);
-            width = index - position;
+        int code = 0;
 
-            // Return true so that the invalid unicode escape is skipped.
-            return true;
+        for (int i = 0; i < 4; i++) {
+            int digit = Character.digit(buffer[index], 16);
+            code = code << 4 | digit;
+
+            if (code < 0) {
+                break;
+            }
+
+            index++;
         }
 
-        // Must be just a backslash.
-        character = '\\';
-        width = 1;
+        // Skip digits even if error.
+        width = index - position;
 
-        return false;
+        // If all digits are good.
+        if (code >= 0) {
+            character = (char)code;
+        } else {
+            log.error(position, Errors.IllegalUnicodeEsc);
+        }
+
+        // Return true even if error so that the invalid unicode escape is skipped.
+        return true;
     }
 
     /**
@@ -549,7 +550,7 @@ public class UnicodeReader {
         /**
          * Offset from the beginning of the original reader buffer.
          */
-        private int offset;
+        final private int offset;
 
         /**
          * Current column in the comment.

-------------

Commit messages:
 - Merge branch 'master' into 8254073
 - Merge branch 'master' into 8254073
 - 8254073: Tokenizer improvements (revised)

Changes: https://git.openjdk.java.net/jdk/pull/525/files
 Webrev: https://webrevs.openjdk.java.net/?repo=jdk&pr=525&range=00
  Issue: https://bugs.openjdk.java.net/browse/JDK-8254073
  Stats: 2349 lines in 18 files changed: 1115 ins; 597 del; 637 mod
  Patch: https://git.openjdk.java.net/jdk/pull/525.diff
  Fetch: git fetch https://git.openjdk.java.net/jdk pull/525/head:pull/525

PR: https://git.openjdk.java.net/jdk/pull/525