RFR: 8254073: Tokenizer improvements (revised)

Tue Oct 6 15:01:08 UTC 2020

On Tue, 6 Oct 2020 14:37:32 GMT, Maurizio Cimadamore <mcimadamore at openjdk.org> wrote:

>> This is a full revision of https://github.com/openjdk/jdk/pull/435 which contained two 'out by one' bugs and was
>> reverted.
>> This revision contains the changes of that pull request plus:
>> 
>> diff --git a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java
>> index 39d9eadcf3a..b8425ad1ecb 100644
>> --- a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java
>> +++ b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java
>> @@ -306,8 +306,9 @@ public class JavadocTokenizer extends JavaTokenizer {
>>       *
>>       * Thus, to find the source position of any position, p, in the comment
>>       * string, find the index, i, of the pair whose string offset
>> -     * ({@code map[i + SB_OFFSET] }) is closest to but not greater than p. Then,
>> -     * {@code sourcePos(p) = map[i + POS_OFFSET] + (p - map[i + SB_OFFSET]) }.
>> +     * ({@code map[i * NOFFSETS + SB_OFFSET] }) is closest to but not greater
>> +     * than p. Then, {@code sourcePos(p) = map[i * NOFFSETS + POS_OFFSET] +
>> +     *                                (p - map[i * NOFFSETS + SB_OFFSET]) }.
>>       */
>>      static class OffsetMap {
>>          /**
>> @@ -426,7 +427,7 @@ public class JavadocTokenizer extends JavaTokenizer {
>>              int start = 0;
>>              int end = size / NOFFSETS;
>>  
>> -            while (start < end - NOFFSETS) {
>> +            while (start < end - 1) {
>>                  // find an index midway between start and end
>>                  int index = (start + end) / 2;
>>                  int indexScaled = index * NOFFSETS;
>> diff --git a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java
>> index 2472632dbcd..7584b79044b 100644
>> --- a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java
>> +++ b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java
>> @@ -221,48 +221,49 @@ public class UnicodeReader {
>>      private boolean unicodeEscape() {
>>          // Start of unicode escape (past backslash.)
>>          int start = position + width;
>> -        int index;
>> +
>> +        // Default to backslash result, unless proven otherwise.
>> +        character = '\\';
>> +        width = 1;
>>  
>>          // Skip multiple 'u'.
>> +        int index;
>>          for (index = start; index < length; index++) {
>>              if (buffer[index] != 'u') {
>>                  break;
>>              }
>>          }
>>  
>> -        // Needs to be at least backslash-u.
>> -        if (index != start) {
>> -            // If enough characters available.
>> -            if (index + 4 < length) {
>> -                // Convert four hex digits to codepoint. If any digit is invalid then the
>> -                // result is negative.
>> -                int code = (Character.digit(buffer[index++], 16) << 12) |
>> -                           (Character.digit(buffer[index++], 16) << 8) |
>> -                           (Character.digit(buffer[index++], 16) << 4) |
>> -                            Character.digit(buffer[index++], 16);
>> -
>> -                // If all digits are good.
>> -                if (code >= 0) {
>> -                    width = index - position;
>> -                    character = (char)code;
>> -
>> -                    return true;
>> -                }
>> -            }
>> +        // Needs to have been at least one u.
>> +        if (index == start) {
>> +            return false;
>> +        }
>>  
>> -            // Did not work out.
>> -            log.error(position, Errors.IllegalUnicodeEsc);
>> -            width = index - position;
>> +        int code = 0;
>>  
>> -            // Return true so that the invalid unicode escape is skipped.
>> -            return true;
>> +        for (int i = 0; i < 4; i++) {
>> +            int digit = Character.digit(buffer[index], 16);
>> +            code = code << 4 | digit;
>> +
>> +            if (code < 0) {
>> +                break;
>> +            }
>> +
>> +            index++;
>>          }
>>  
>> -        // Must be just a backslash.
>> -        character = '\\';
>> -        width = 1;
>> +        // Skip digits even if error.
>> +        width = index - position;
>>  
>> -        return false;
>> +        // If all digits are good.
>> +        if (code >= 0) {
>> +            character = (char)code;
>> +        } else {
>> +            log.error(position, Errors.IllegalUnicodeEsc);
>> +        }
>> +
>> +        // Return true even if error so that the invalid unicode escape is skipped.
>> +        return true;
>>      }
>>  
>>      /**
>> @@ -549,7 +550,7 @@ public class UnicodeReader {
>>          /**
>>           * Offset from the beginning of the original reader buffer.
>>           */
>> -        private int offset;
>> +        final private int offset;
>>  
>>          /**
>>           * Current column in the comment.
>
> src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java line 245:
> 
>> 243:
>> 244:         for (int i = 0; i < 4; i++) {
>> 245:             int digit = Character.digit(buffer[index], 16);
> 
> This looks suspicious - what if index ends up being bigger than (or equal to) `buffer.length`?
> Maybe we need a test for incomplete unicode sequences at the end of the tokenizer input - e.g. `\u123`

You are correct. Will revise.

-------------

PR: https://git.openjdk.java.net/jdk/pull/525