RFR: 8254073: Tokenizer improvements (revised)
Maurizio Cimadamore
mcimadamore at openjdk.java.net
Tue Oct 6 14:41:07 UTC 2020
On Tue, 6 Oct 2020 14:01:11 GMT, Jim Laskey <jlaskey at openjdk.org> wrote:
> This is a full revision of https://github.com/openjdk/jdk/pull/435 which contained two 'out by one' bugs and was
> reverted.
> This revision contains the changes of that pull request plus:
>
> diff --git a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java
> b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java index 39d9eadcf3a..b8425ad1ecb 100644
> --- a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java
> +++ b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java
> @@ -306,8 +306,9 @@ public class JavadocTokenizer extends JavaTokenizer {
> *
> * Thus, to find the source position of any position, p, in the comment
> * string, find the index, i, of the pair whose string offset
> - * ({@code map[i + SB_OFFSET] }) is closest to but not greater than p. Then,
> - * {@code sourcePos(p) = map[i + POS_OFFSET] + (p - map[i + SB_OFFSET]) }.
> + * ({@code map[i * NOFFSETS + SB_OFFSET] }) is closest to but not greater
> + * than p. Then, {@code sourcePos(p) = map[i * NOFFSETS + POS_OFFSET] +
> + * (p - map[i * NOFFSETS + SB_OFFSET]) }.
> */
> static class OffsetMap {
> /**
> @@ -426,7 +427,7 @@ public class JavadocTokenizer extends JavaTokenizer {
> int start = 0;
> int end = size / NOFFSETS;
>
> - while (start < end - NOFFSETS) {
> + while (start < end - 1) {
> // find an index midway between start and end
> int index = (start + end) / 2;
> int indexScaled = index * NOFFSETS;
> diff --git a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java
> b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java index 2472632dbcd..7584b79044b 100644
> --- a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java
> +++ b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java
> @@ -221,48 +221,49 @@ public class UnicodeReader {
> private boolean unicodeEscape() {
> // Start of unicode escape (past backslash.)
> int start = position + width;
> - int index;
> +
> + // Default to backslash result, unless proven otherwise.
> + character = '\';
> + width = 1;
>
> // Skip multiple 'u'.
> + int index;
> for (index = start; index < length; index++) {
> if (buffer[index] != 'u') {
> break;
> }
> }
>
> - // Needs to be at least backslash-u.
> - if (index != start) {
> - // If enough characters available.
> - if (index + 4 < length) {
> - // Convert four hex digits to codepoint. If any digit is invalid then the
> - // result is negative.
> - int code = (Character.digit(buffer[index++], 16) << 12) |
> - (Character.digit(buffer[index++], 16) << 8) |
> - (Character.digit(buffer[index++], 16) << 4) |
> - Character.digit(buffer[index++], 16);
> -
> - // If all digits are good.
> - if (code >= 0) {
> - width = index - position;
> - character = (char)code;
> -
> - return true;
> - }
> - }
> + // Needs to have been at least one u.
> + if (index == start) {
> + return false;
> + }
>
> - // Did not work out.
> - log.error(position, Errors.IllegalUnicodeEsc);
> - width = index - position;
> + int code = 0;
>
> - // Return true so that the invalid unicode escape is skipped.
> - return true;
> + for (int i = 0; i < 4; i++) {
> + int digit = Character.digit(buffer[index], 16);
> + code = code << 4 | digit;
> +
> + if (code < 0) {
> + break;
> + }
> +
> + index++;
> }
>
> - // Must be just a backslash.
> - character = '\';
> - width = 1;
> + // Skip digits even if error.
> + width = index - position;
>
> - return false;
> + // If all digits are good.
> + if (code >= 0) {
> + character = (char)code;
> + } else {
> + log.error(position, Errors.IllegalUnicodeEsc);
> + }
> +
> + // Return true even if error so that the invalid unicode escape is skipped.
> + return true;
> }
>
> /**
> @@ -549,7 +550,7 @@ public class UnicodeReader {
> /**
> * Offset from the beginning of the original reader buffer.
> */
> - private int offset;
> + final private int offset;
>
> /**
> * Current column in the comment.
src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java line 245:
> 243:
> 244: for (int i = 0; i < 4; i++) {
> 245: int digit = Character.digit(buffer[index], 16);
This looks suspicious - what if index ends up being bigger than (or equal to) `buffer.length` ?
Maybe we need a test for incomplete unicode sequences at the end of the tokenizer input - e.g. `\u123`
-------------
PR: https://git.openjdk.java.net/jdk/pull/525
More information about the compiler-dev
mailing list