RFR: 8254073: Tokenizer improvements (revised)

Tue Oct 6 14:41:07 UTC 2020

On Tue, 6 Oct 2020 14:01:11 GMT, Jim Laskey <jlaskey at openjdk.org> wrote:

> This is a full revision of https://github.com/openjdk/jdk/pull/435, which contained two off-by-one bugs and was
> reverted.
> This revision contains the changes of that pull request plus:
> 
> diff --git a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java
> b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java index 39d9eadcf3a..b8425ad1ecb 100644
> --- a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java
> +++ b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java
> @@ -306,8 +306,9 @@ public class JavadocTokenizer extends JavaTokenizer {
>       *
>       * Thus, to find the source position of any position, p, in the comment
>       * string, find the index, i, of the pair whose string offset
> -     * ({@code map[i + SB_OFFSET] }) is closest to but not greater than p. Then,
> -     * {@code sourcePos(p) = map[i + POS_OFFSET] + (p - map[i + SB_OFFSET]) }.
> +     * ({@code map[i * NOFFSETS + SB_OFFSET] }) is closest to but not greater
> +     * than p. Then, {@code sourcePos(p) = map[i * NOFFSETS + POS_OFFSET] +
> +     *                                (p - map[i * NOFFSETS + SB_OFFSET]) }.
>       */
>      static class OffsetMap {
>          /**
> @@ -426,7 +427,7 @@ public class JavadocTokenizer extends JavaTokenizer {
>              int start = 0;
>              int end = size / NOFFSETS;
>  
> -            while (start < end - NOFFSETS) {
> +            while (start < end - 1) {
>                  // find an index midway between start and end
>                  int index = (start + end) / 2;
>                  int indexScaled = index * NOFFSETS;
> diff --git a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java
> b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java index 2472632dbcd..7584b79044b 100644
> --- a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java
> +++ b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java
> @@ -221,48 +221,49 @@ public class UnicodeReader {
>      private boolean unicodeEscape() {
>          // Start of unicode escape (past backslash.)
>          int start = position + width;
> -        int index;
> +
> +        // Default to backslash result, unless proven otherwise.
> +        character = '\\';
> +        width = 1;
>  
>          // Skip multiple 'u'.
> +        int index;
>          for (index = start; index < length; index++) {
>              if (buffer[index] != 'u') {
>                  break;
>              }
>          }
>  
> -        // Needs to be at least backslash-u.
> -        if (index != start) {
> -            // If enough characters available.
> -            if (index + 4 < length) {
> -                // Convert four hex digits to codepoint. If any digit is invalid then the
> -                // result is negative.
> -                int code = (Character.digit(buffer[index++], 16) << 12) |
> -                           (Character.digit(buffer[index++], 16) << 8) |
> -                           (Character.digit(buffer[index++], 16) << 4) |
> -                            Character.digit(buffer[index++], 16);
> -
> -                // If all digits are good.
> -                if (code >= 0) {
> -                    width = index - position;
> -                    character = (char)code;
> -
> -                    return true;
> -                }
> -            }
> +        // Needs to have been at least one u.
> +        if (index == start) {
> +            return false;
> +        }
>  
> -            // Did not work out.
> -            log.error(position, Errors.IllegalUnicodeEsc);
> -            width = index - position;
> +        int code = 0;
>  
> -            // Return true so that the invalid unicode escape is skipped.
> -            return true;
> +        for (int i = 0; i < 4; i++) {
> +            int digit = Character.digit(buffer[index], 16);
> +            code = code << 4 | digit;
> +
> +            if (code < 0) {
> +                break;
> +            }
> +
> +            index++;
>          }
>  
> -        // Must be just a backslash.
> -        character = '\\';
> -        width = 1;
> +        // Skip digits even if error.
> +        width = index - position;
>  
> -        return false;
> +        // If all digits are good.
> +        if (code >= 0) {
> +            character = (char)code;
> +        } else {
> +            log.error(position, Errors.IllegalUnicodeEsc);
> +        }
> +
> +        // Return true even if error so that the invalid unicode escape is skipped.
> +        return true;
>      }
>  
>      /**
> @@ -549,7 +550,7 @@ public class UnicodeReader {
>          /**
>           * Offset from the beginning of the original reader buffer.
>           */
> -        private int offset;
> +        final private int offset;
>  
>          /**
>           * Current column in the comment.

src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java line 245:

> 243:
> 244:         for (int i = 0; i < 4; i++) {
> 245:             int digit = Character.digit(buffer[index], 16);

This looks suspicious: what if `index` ends up being greater than or equal to `buffer.length`?
Maybe we need a test for incomplete Unicode escape sequences at the end of the tokenizer input, e.g. `\u123`.

-------------

PR: https://git.openjdk.java.net/jdk/pull/525