RFR: 8254073: Tokenizer improvements (revised)
Jim Laskey
jlaskey at openjdk.java.net
Tue Oct 6 15:01:08 UTC 2020
On Tue, 6 Oct 2020 14:37:32 GMT, Maurizio Cimadamore <mcimadamore at openjdk.org> wrote:
>> This is a full revision of https://github.com/openjdk/jdk/pull/435 which contained two 'out by one' bugs and was
>> reverted.
>> This revision contains the changes of that pull request plus:
>>
>> diff --git a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java
>> b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java index 39d9eadcf3a..b8425ad1ecb 100644
>> --- a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java
>> +++ b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java
>> @@ -306,8 +306,9 @@ public class JavadocTokenizer extends JavaTokenizer {
>> *
>> * Thus, to find the source position of any position, p, in the comment
>> * string, find the index, i, of the pair whose string offset
>> - * ({@code map[i + SB_OFFSET] }) is closest to but not greater than p. Then,
>> - * {@code sourcePos(p) = map[i + POS_OFFSET] + (p - map[i + SB_OFFSET]) }.
>> + * ({@code map[i * NOFFSETS + SB_OFFSET] }) is closest to but not greater
>> + * than p. Then, {@code sourcePos(p) = map[i * NOFFSETS + POS_OFFSET] +
>> + * (p - map[i * NOFFSETS + SB_OFFSET]) }.
>> */
>> static class OffsetMap {
>> /**
>> @@ -426,7 +427,7 @@ public class JavadocTokenizer extends JavaTokenizer {
>> int start = 0;
>> int end = size / NOFFSETS;
>>
>> - while (start < end - NOFFSETS) {
>> + while (start < end - 1) {
>> // find an index midway between start and end
>> int index = (start + end) / 2;
>> int indexScaled = index * NOFFSETS;
>> diff --git a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java
>> b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java index 2472632dbcd..7584b79044b 100644
>> --- a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java
>> +++ b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java
>> @@ -221,48 +221,49 @@ public class UnicodeReader {
>> private boolean unicodeEscape() {
>> // Start of unicode escape (past backslash.)
>> int start = position + width;
>> - int index;
>> +
>> + // Default to backslash result, unless proven otherwise.
>> + character = '\';
>> + width = 1;
>>
>> // Skip multiple 'u'.
>> + int index;
>> for (index = start; index < length; index++) {
>> if (buffer[index] != 'u') {
>> break;
>> }
>> }
>>
>> - // Needs to be at least backslash-u.
>> - if (index != start) {
>> - // If enough characters available.
>> - if (index + 4 < length) {
>> - // Convert four hex digits to codepoint. If any digit is invalid then the
>> - // result is negative.
>> - int code = (Character.digit(buffer[index++], 16) << 12) |
>> - (Character.digit(buffer[index++], 16) << 8) |
>> - (Character.digit(buffer[index++], 16) << 4) |
>> - Character.digit(buffer[index++], 16);
>> -
>> - // If all digits are good.
>> - if (code >= 0) {
>> - width = index - position;
>> - character = (char)code;
>> -
>> - return true;
>> - }
>> - }
>> + // Needs to have been at least one u.
>> + if (index == start) {
>> + return false;
>> + }
>>
>> - // Did not work out.
>> - log.error(position, Errors.IllegalUnicodeEsc);
>> - width = index - position;
>> + int code = 0;
>>
>> - // Return true so that the invalid unicode escape is skipped.
>> - return true;
>> + for (int i = 0; i < 4; i++) {
>> + int digit = Character.digit(buffer[index], 16);
>> + code = code << 4 | digit;
>> +
>> + if (code < 0) {
>> + break;
>> + }
>> +
>> + index++;
>> }
>>
>> - // Must be just a backslash.
>> - character = '\';
>> - width = 1;
>> + // Skip digits even if error.
>> + width = index - position;
>>
>> - return false;
>> + // If all digits are good.
>> + if (code >= 0) {
>> + character = (char)code;
>> + } else {
>> + log.error(position, Errors.IllegalUnicodeEsc);
>> + }
>> +
>> + // Return true even if error so that the invalid unicode escape is skipped.
>> + return true;
>> }
>>
>> /**
>> @@ -549,7 +550,7 @@ public class UnicodeReader {
>> /**
>> * Offset from the beginning of the original reader buffer.
>> */
>> - private int offset;
>> + final private int offset;
>>
>> /**
>> * Current column in the comment.
>
> src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java line 245:
>
>> 243:
>> 244: for (int i = 0; i < 4; i++) {
>> 245: int digit = Character.digit(buffer[index], 16);
>
> This looks suspicious - what if index ends up being bigger than (or equal to) `buffer.length` ?
> Maybe we need a test for incomplete unicode sequences at the end of the tokenizer input - e.g. `\u123`
You are correct. Will revise.
-------------
PR: https://git.openjdk.java.net/jdk/pull/525
More information about the compiler-dev
mailing list