deps: revert breaking UTF-8 decoder changes in V8

Refs: 7c462455b0
Refs: aadb1c83fc
PR-URL: https://github.com/nodejs/node/pull/11029
Reviewed-By: Ben Noordhuis <info@bnoordhuis.nl>
Reviewed-By: Anna Henningsen <anna@addaleax.net>
Reviewed-By: Myles Borins <myles.borins@gmail.com>
8 years ago · 028bb632b2
6 changed files with 302 additions and 310 deletions
--- a/deps/v8/src/unicode-decoder.h
+++ b/deps/v8/src/unicode-decoder.h
@@ -7,11 +7,10 @@

 #include <sys/types.h>
 #include "src/globals.h"
-#include "src/utils.h"

 namespace unibrow {

-class V8_EXPORT_PRIVATE Utf8DecoderBase {
+class Utf8DecoderBase {
 public:
  // Initialization done in subclass.
  inline Utf8DecoderBase();
--- a/deps/v8/src/unicode.cc
+++ b/deps/v8/src/unicode.cc
@@ -228,52 +228,80 @@ static inline bool IsContinuationCharacter(byte chr) {
 // This method decodes an UTF-8 value according to RFC 3629.
 uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
  size_t length = NonASCIISequenceLength(str[0]);
-
-  // Check continuation characters.
-  size_t max_count = std::min(length, max_length);
-  size_t count = 1;
-  while (count < max_count && IsContinuationCharacter(str[count])) {
-    count++;
+  if (length == 0 || max_length < length) {
+    *cursor += 1;
+    return kBadChar;
  }
-  *cursor += count;
-
-  // There must be enough continuation characters.
-  if (count != length) return kBadChar;
-
-  // Check overly long sequences & other conditions.
-  if (length == 3) {
-    if (str[0] == 0xE0 && (str[1] < 0xA0 || str[1] > 0xBF)) {
-      // Overlong three-byte sequence?
+  if (length == 2) {
+    if (!IsContinuationCharacter(str[1])) {
+      *cursor += 1;
      return kBadChar;
-    } else if (str[0] == 0xED && (str[1] < 0x80 || str[1] > 0x9F)) {
-      // High and low surrogate halves?
+    }
+    *cursor += 2;
+    return ((str[0] << 6) + str[1]) - 0x00003080;
+  }
+  if (length == 3) {
+    switch (str[0]) {
+      case 0xE0:
+        // Overlong three-byte sequence.
+        if (str[1] < 0xA0 || str[1] > 0xBF) {
+          *cursor += 1;
+          return kBadChar;
+        }
+        break;
+      case 0xED:
+        // High and low surrogate halves.
+        if (str[1] < 0x80 || str[1] > 0x9F) {
+          *cursor += 1;
+          return kBadChar;
+        }
+        break;
+      default:
+        if (!IsContinuationCharacter(str[1])) {
+          *cursor += 1;
+          return kBadChar;
+        }
+    }
+    if (!IsContinuationCharacter(str[2])) {
+      *cursor += 1;
      return kBadChar;
    }
-  } else if (length == 4) {
-    if (str[0] == 0xF0 && (str[1] < 0x90 || str[1] > 0xBF)) {
+    *cursor += 3;
+    return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;
+  }
+  DCHECK(length == 4);
+  switch (str[0]) {
+    case 0xF0:
      // Overlong four-byte sequence.
-      return kBadChar;
-    } else if (str[0] == 0xF4 && (str[1] < 0x80 || str[1] > 0x8F)) {
+      if (str[1] < 0x90 || str[1] > 0xBF) {
+        *cursor += 1;
+        return kBadChar;
+      }
+      break;
+    case 0xF4:
      // Code points outside of the unicode range.
-      return kBadChar;
-    }
+      if (str[1] < 0x80 || str[1] > 0x8F) {
+        *cursor += 1;
+        return kBadChar;
+      }
+      break;
+    default:
+      if (!IsContinuationCharacter(str[1])) {
+        *cursor += 1;
+        return kBadChar;
+      }
  }
-
-  // All errors have been handled, so we only have to assemble the result.
-  switch (length) {
-    case 1:
-      return str[0];
-    case 2:
-      return ((str[0] << 6) + str[1]) - 0x00003080;
-    case 3:
-      return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;
-    case 4:
-      return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -
-             0x03C82080;
+  if (!IsContinuationCharacter(str[2])) {
+    *cursor += 1;
+    return kBadChar;
  }
-
-  UNREACHABLE();
-  return kBadChar;
+  if (!IsContinuationCharacter(str[3])) {
+    *cursor += 1;
+    return kBadChar;
+  }
+  *cursor += 4;
+  return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -
+         0x03C82080;
 }

 uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
@@ -295,10 +323,9 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
      // with one shift.
      uint8_t mask = 0x7f >> kind;

-      // Store the kind in the top nibble, and kind - 1 (i.e., remaining bytes)
-      // in 2nd nibble, and the value  in the bottom three. The 2nd nibble is
-      // intended as a counter about how many bytes are still needed.
-      *buffer = kind << 28 | (kind - 1) << 24 | (next & mask);
+      // Store the kind - 1 (i.e., remaining bytes) in the top byte, value
+      // in the bottom three.
+      *buffer = (kind - 1) << 24 | (next & mask);
      return kIncomplete;
    } else {
      // No buffer, and not the start of a 1-byte char (handled at the
@@ -327,19 +354,15 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
    // We're inside of a character, as described by buffer.

    // How many bytes (excluding this one) do we still expect?
-    uint8_t bytes_expected = *buffer >> 28;
-    uint8_t bytes_left = (*buffer >> 24) & 0x0f;
-    bytes_left--;
+    uint8_t count = (*buffer >> 24) - 1;
    // Update the value.
    uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F);
-    if (bytes_left) {
-      *buffer = (bytes_expected << 28 | bytes_left << 24 | value);
+    if (count) {
+      *buffer = count << 24 | value;
      return kIncomplete;
    } else {
      *buffer = 0;
-      bool sequence_was_too_long = (bytes_expected == 2 && value < 0x80) ||
-                                   (bytes_expected == 3 && value < 0x800);
-      return sequence_was_too_long ? kBadChar : value;
+      return value;
    }
  } else {
    // Within a character, but not a continuation character? Then the
--- a/deps/v8/test/cctest/test-parsing.cc
+++ b/deps/v8/test/cctest/test-parsing.cc
@@ -684,26 +684,74 @@ TEST(RegExpScanning) {
  TestScanRegExp("/=?/", "=?");
 }

-static int Ucs2CharLength(unibrow::uchar c) {
-  if (c == unibrow::Utf8::kIncomplete || c == unibrow::Utf8::kBufferEmpty) {
-    return 0;
-  } else if (c < 0xffff) {
-    return 1;
-  } else {
-    return 2;
-  }
-}

 static int Utf8LengthHelper(const char* s) {
-  unibrow::Utf8::Utf8IncrementalBuffer buffer(unibrow::Utf8::kBufferEmpty);
-  int length = 0;
-  for (; *s != '\0'; s++) {
-    unibrow::uchar tmp = unibrow::Utf8::ValueOfIncremental(*s, &buffer);
-    length += Ucs2CharLength(tmp);
+  int len = i::StrLength(s);
+  int character_length = len;
+  for (int i = 0; i < len; i++) {
+    unsigned char c = s[i];
+    int input_offset = 0;
+    int output_adjust = 0;
+    if (c > 0x7f) {
+      if (c < 0xc0) continue;
+      if (c >= 0xf0) {
+        if (c >= 0xf8) {
+          // 5 and 6 byte UTF-8 sequences turn into a kBadChar for each UTF-8
+          // byte.
+          continue;  // Handle first UTF-8 byte.
+        }
+        if ((c & 7) == 0 && ((s[i + 1] & 0x30) == 0)) {
+          // This 4 byte sequence could have been coded as a 3 byte sequence.
+          // Record a single kBadChar for the first byte and continue.
+          continue;
+        }
+        input_offset = 3;
+        // 4 bytes of UTF-8 turn into 2 UTF-16 code units.
+        character_length -= 2;
+      } else if (c >= 0xe0) {
+        if ((c & 0xf) == 0 && ((s[i + 1] & 0x20) == 0)) {
+          // This 3 byte sequence could have been coded as a 2 byte sequence.
+          // Record a single kBadChar for the first byte and continue.
+          continue;
+        }
+        if (c == 0xed) {
+          unsigned char d = s[i + 1];
+          if ((d < 0x80) || (d > 0x9f)) {
+            // This 3 byte sequence is part of a surrogate pair which is not
+            // supported by UTF-8. Record a single kBadChar for the first byte
+            // and continue.
+            continue;
+          }
+        }
+        input_offset = 2;
+        // 3 bytes of UTF-8 turn into 1 UTF-16 code unit.
+        output_adjust = 2;
+      } else {
+        if ((c & 0x1e) == 0) {
+          // This 2 byte sequence could have been coded as a 1 byte sequence.
+          // Record a single kBadChar for the first byte and continue.
+          continue;
+        }
+        input_offset = 1;
+        // 2 bytes of UTF-8 turn into 1 UTF-16 code unit.
+        output_adjust = 1;
+      }
+      bool bad = false;
+      for (int j = 1; j <= input_offset; j++) {
+        if ((s[i + j] & 0xc0) != 0x80) {
+          // Bad UTF-8 sequence turns the first in the sequence into kBadChar,
+          // which is a single UTF-16 code unit.
+          bad = true;
+          break;
+        }
+      }
+      if (!bad) {
+        i += input_offset;
+        character_length -= output_adjust;
+      }
+    }
  }
-  unibrow::uchar tmp = unibrow::Utf8::ValueOfIncrementalFinish(&buffer);
-  length += Ucs2CharLength(tmp);
-  return length;
+  return character_length;
 }


@@ -933,206 +981,169 @@ TEST(ScopePositions) {
  };

  const SourceData source_data[] = {
-      {"  with ({}) ", "{ block; }", " more;", i::WITH_SCOPE, i::SLOPPY},
-      {"  with ({}) ", "{ block; }", "; more;", i::WITH_SCOPE, i::SLOPPY},
-      {"  with ({}) ",
-       "{\n"
-       "    block;\n"
-       "  }",
-       "\n"
-       "  more;",
-       i::WITH_SCOPE, i::SLOPPY},
-      {"  with ({}) ", "statement;", " more;", i::WITH_SCOPE, i::SLOPPY},
-      {"  with ({}) ", "statement",
-       "\n"
-       "  more;",
-       i::WITH_SCOPE, i::SLOPPY},
-      {"  with ({})\n"
-       "    ",
-       "statement;",
-       "\n"
-       "  more;",
-       i::WITH_SCOPE, i::SLOPPY},
-      {"  try {} catch ", "(e) { block; }", " more;", i::CATCH_SCOPE,
-       i::SLOPPY},
-      {"  try {} catch ", "(e) { block; }", "; more;", i::CATCH_SCOPE,
-       i::SLOPPY},
-      {"  try {} catch ",
-       "(e) {\n"
-       "    block;\n"
-       "  }",
-       "\n"
-       "  more;",
-       i::CATCH_SCOPE, i::SLOPPY},
-      {"  try {} catch ", "(e) { block; }", " finally { block; } more;",
-       i::CATCH_SCOPE, i::SLOPPY},
-      {"  start;\n"
-       "  ",
-       "{ let block; }", " more;", i::BLOCK_SCOPE, i::STRICT},
-      {"  start;\n"
-       "  ",
-       "{ let block; }", "; more;", i::BLOCK_SCOPE, i::STRICT},
-      {"  start;\n"
-       "  ",
-       "{\n"
-       "    let block;\n"
-       "  }",
-       "\n"
-       "  more;",
-       i::BLOCK_SCOPE, i::STRICT},
-      {"  start;\n"
-       "  function fun",
-       "(a,b) { infunction; }", " more;", i::FUNCTION_SCOPE, i::SLOPPY},
-      {"  start;\n"
-       "  function fun",
-       "(a,b) {\n"
-       "    infunction;\n"
-       "  }",
-       "\n"
-       "  more;",
-       i::FUNCTION_SCOPE, i::SLOPPY},
-      {"  start;\n", "(a,b) => a + b", "; more;", i::FUNCTION_SCOPE, i::SLOPPY},
-      {"  start;\n", "(a,b) => { return a+b; }", "\nmore;", i::FUNCTION_SCOPE,
-       i::SLOPPY},
-      {"  start;\n"
-       "  (function fun",
-       "(a,b) { infunction; }", ")();", i::FUNCTION_SCOPE, i::SLOPPY},
-      {"  for ", "(let x = 1 ; x < 10; ++ x) { block; }", " more;",
-       i::BLOCK_SCOPE, i::STRICT},
-      {"  for ", "(let x = 1 ; x < 10; ++ x) { block; }", "; more;",
-       i::BLOCK_SCOPE, i::STRICT},
-      {"  for ",
-       "(let x = 1 ; x < 10; ++ x) {\n"
-       "    block;\n"
-       "  }",
-       "\n"
-       "  more;",
-       i::BLOCK_SCOPE, i::STRICT},
-      {"  for ", "(let x = 1 ; x < 10; ++ x) statement;", " more;",
-       i::BLOCK_SCOPE, i::STRICT},
-      {"  for ", "(let x = 1 ; x < 10; ++ x) statement",
-       "\n"
-       "  more;",
-       i::BLOCK_SCOPE, i::STRICT},
-      {"  for ",
-       "(let x = 1 ; x < 10; ++ x)\n"
-       "    statement;",
-       "\n"
-       "  more;",
-       i::BLOCK_SCOPE, i::STRICT},
-      {"  for ", "(let x in {}) { block; }", " more;", i::BLOCK_SCOPE,
-       i::STRICT},
-      {"  for ", "(let x in {}) { block; }", "; more;", i::BLOCK_SCOPE,
-       i::STRICT},
-      {"  for ",
-       "(let x in {}) {\n"
-       "    block;\n"
-       "  }",
-       "\n"
-       "  more;",
-       i::BLOCK_SCOPE, i::STRICT},
-      {"  for ", "(let x in {}) statement;", " more;", i::BLOCK_SCOPE,
-       i::STRICT},
-      {"  for ", "(let x in {}) statement",
-       "\n"
-       "  more;",
-       i::BLOCK_SCOPE, i::STRICT},
-      {"  for ",
-       "(let x in {})\n"
-       "    statement;",
-       "\n"
-       "  more;",
-       i::BLOCK_SCOPE, i::STRICT},
-      // Check that 6-byte and 4-byte encodings of UTF-8 strings do not throw
-      // the preparser off in terms of byte offsets.
-      // 2 surrogates, encode a character that doesn't need a surrogate.
-      {"  'foo\355\240\201\355\260\211';\n"
-       "  (function fun",
-       "(a,b) { infunction; }", ")();", i::FUNCTION_SCOPE, i::SLOPPY},
-      // 4 byte encoding.
-      {"  'foo\360\220\220\212';\n"
-       "  (function fun",
-       "(a,b) { infunction; }", ")();", i::FUNCTION_SCOPE, i::SLOPPY},
-      // 3 byte encoding of \u0fff.
-      {"  'foo\340\277\277';\n"
-       "  (function fun",
-       "(a,b) { infunction; }", ")();", i::FUNCTION_SCOPE, i::SLOPPY},
-      // 3 byte surrogate, followed by broken 2-byte surrogate w/ impossible 2nd
-      // byte and last byte missing.
-      {"  'foo\355\240\201\355\211';\n"
-       "  (function fun",
-       "(a,b) { infunction; }", ")();", i::FUNCTION_SCOPE, i::SLOPPY},
-      // Broken 3 byte encoding of \u0fff with missing last byte.
-      {"  'foo\340\277';\n"
-       "  (function fun",
-       "(a,b) { infunction; }", ")();", i::FUNCTION_SCOPE, i::SLOPPY},
-      // Broken 3 byte encoding of \u0fff with missing 2 last bytes.
-      {"  'foo\340';\n"
-       "  (function fun",
-       "(a,b) { infunction; }", ")();", i::FUNCTION_SCOPE, i::SLOPPY},
-      // Broken 3 byte encoding of \u00ff should be a 2 byte encoding.
-      {"  'foo\340\203\277';\n"
-       "  (function fun",
-       "(a,b) { infunction; }", ")();", i::FUNCTION_SCOPE, i::SLOPPY},
-      // Broken 3 byte encoding of \u007f should be a 2 byte encoding.
-      {"  'foo\340\201\277';\n"
-       "  (function fun",
-       "(a,b) { infunction; }", ")();", i::FUNCTION_SCOPE, i::SLOPPY},
-      // Unpaired lead surrogate.
-      {"  'foo\355\240\201';\n"
-       "  (function fun",
-       "(a,b) { infunction; }", ")();", i::FUNCTION_SCOPE, i::SLOPPY},
-      // Unpaired lead surrogate where following code point is a 3 byte
-      // sequence.
-      {"  'foo\355\240\201\340\277\277';\n"
-       "  (function fun",
-       "(a,b) { infunction; }", ")();", i::FUNCTION_SCOPE, i::SLOPPY},
-      // Unpaired lead surrogate where following code point is a 4 byte encoding
-      // of a trail surrogate.
-      {"  'foo\355\240\201\360\215\260\211';\n"
-       "  (function fun",
-       "(a,b) { infunction; }", ")();", i::FUNCTION_SCOPE, i::SLOPPY},
-      // Unpaired trail surrogate.
-      {"  'foo\355\260\211';\n"
-       "  (function fun",
-       "(a,b) { infunction; }", ")();", i::FUNCTION_SCOPE, i::SLOPPY},
-      // 2 byte encoding of \u00ff.
-      {"  'foo\303\277';\n"
-       "  (function fun",
-       "(a,b) { infunction; }", ")();", i::FUNCTION_SCOPE, i::SLOPPY},
-      // Broken 2 byte encoding of \u00ff with missing last byte.
-      {"  'foo\303';\n"
-       "  (function fun",
-       "(a,b) { infunction; }", ")();", i::FUNCTION_SCOPE, i::SLOPPY},
-      // Broken 2 byte encoding of \u007f should be a 1 byte encoding.
-      {"  'foo\301\277';\n"
-       "  (function fun",
-       "(a,b) { infunction; }", ")();", i::FUNCTION_SCOPE, i::SLOPPY},
-      // Illegal 5 byte encoding.
-      {"  'foo\370\277\277\277\277';\n"
-       "  (function fun",
-       "(a,b) { infunction; }", ")();", i::FUNCTION_SCOPE, i::SLOPPY},
-      // Illegal 6 byte encoding.
-      {"  'foo\374\277\277\277\277\277';\n"
-       "  (function fun",
-       "(a,b) { infunction; }", ")();", i::FUNCTION_SCOPE, i::SLOPPY},
-      // Illegal 0xfe byte
-      {"  'foo\376\277\277\277\277\277\277';\n"
-       "  (function fun",
-       "(a,b) { infunction; }", ")();", i::FUNCTION_SCOPE, i::SLOPPY},
-      // Illegal 0xff byte
-      {"  'foo\377\277\277\277\277\277\277\277';\n"
-       "  (function fun",
-       "(a,b) { infunction; }", ")();", i::FUNCTION_SCOPE, i::SLOPPY},
-      {"  'foo';\n"
-       "  (function fun",
-       "(a,b) { 'bar\355\240\201\355\260\213'; }", ")();", i::FUNCTION_SCOPE,
-       i::SLOPPY},
-      {"  'foo';\n"
-       "  (function fun",
-       "(a,b) { 'bar\360\220\220\214'; }", ")();", i::FUNCTION_SCOPE,
-       i::SLOPPY},
-      {NULL, NULL, NULL, i::EVAL_SCOPE, i::SLOPPY}};
+    { "  with ({}) ", "{ block; }", " more;", i::WITH_SCOPE, i::SLOPPY },
+    { "  with ({}) ", "{ block; }", "; more;", i::WITH_SCOPE, i::SLOPPY },
+    { "  with ({}) ", "{\n"
+      "    block;\n"
+      "  }", "\n"
+      "  more;", i::WITH_SCOPE, i::SLOPPY },
+    { "  with ({}) ", "statement;", " more;", i::WITH_SCOPE, i::SLOPPY },
+    { "  with ({}) ", "statement", "\n"
+      "  more;", i::WITH_SCOPE, i::SLOPPY },
+    { "  with ({})\n"
+      "    ", "statement;", "\n"
+      "  more;", i::WITH_SCOPE, i::SLOPPY },
+    { "  try {} catch ", "(e) { block; }", " more;",
+      i::CATCH_SCOPE, i::SLOPPY },
+    { "  try {} catch ", "(e) { block; }", "; more;",
+      i::CATCH_SCOPE, i::SLOPPY },
+    { "  try {} catch ", "(e) {\n"
+      "    block;\n"
+      "  }", "\n"
+      "  more;", i::CATCH_SCOPE, i::SLOPPY },
+    { "  try {} catch ", "(e) { block; }", " finally { block; } more;",
+      i::CATCH_SCOPE, i::SLOPPY },
+    { "  start;\n"
+      "  ", "{ let block; }", " more;", i::BLOCK_SCOPE, i::STRICT },
+    { "  start;\n"
+      "  ", "{ let block; }", "; more;", i::BLOCK_SCOPE, i::STRICT },
+    { "  start;\n"
+      "  ", "{\n"
+      "    let block;\n"
+      "  }", "\n"
+      "  more;", i::BLOCK_SCOPE, i::STRICT },
+    { "  start;\n"
+      "  function fun", "(a,b) { infunction; }", " more;",
+      i::FUNCTION_SCOPE, i::SLOPPY },
+    { "  start;\n"
+      "  function fun", "(a,b) {\n"
+      "    infunction;\n"
+      "  }", "\n"
+      "  more;", i::FUNCTION_SCOPE, i::SLOPPY },
+    { "  start;\n", "(a,b) => a + b", "; more;",
+      i::FUNCTION_SCOPE, i::SLOPPY },
+    { "  start;\n", "(a,b) => { return a+b; }", "\nmore;",
+      i::FUNCTION_SCOPE, i::SLOPPY },
+    { "  start;\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::SLOPPY },
+    { "  for ", "(let x = 1 ; x < 10; ++ x) { block; }", " more;",
+      i::BLOCK_SCOPE, i::STRICT },
+    { "  for ", "(let x = 1 ; x < 10; ++ x) { block; }", "; more;",
+      i::BLOCK_SCOPE, i::STRICT },
+    { "  for ", "(let x = 1 ; x < 10; ++ x) {\n"
+      "    block;\n"
+      "  }", "\n"
+      "  more;", i::BLOCK_SCOPE, i::STRICT },
+    { "  for ", "(let x = 1 ; x < 10; ++ x) statement;", " more;",
+      i::BLOCK_SCOPE, i::STRICT },
+    { "  for ", "(let x = 1 ; x < 10; ++ x) statement", "\n"
+      "  more;", i::BLOCK_SCOPE, i::STRICT },
+    { "  for ", "(let x = 1 ; x < 10; ++ x)\n"
+      "    statement;", "\n"
+      "  more;", i::BLOCK_SCOPE, i::STRICT },
+    { "  for ", "(let x in {}) { block; }", " more;",
+      i::BLOCK_SCOPE, i::STRICT },
+    { "  for ", "(let x in {}) { block; }", "; more;",
+      i::BLOCK_SCOPE, i::STRICT },
+    { "  for ", "(let x in {}) {\n"
+      "    block;\n"
+      "  }", "\n"
+      "  more;", i::BLOCK_SCOPE, i::STRICT },
+    { "  for ", "(let x in {}) statement;", " more;",
+      i::BLOCK_SCOPE, i::STRICT },
+    { "  for ", "(let x in {}) statement", "\n"
+      "  more;", i::BLOCK_SCOPE, i::STRICT },
+    { "  for ", "(let x in {})\n"
+      "    statement;", "\n"
+      "  more;", i::BLOCK_SCOPE, i::STRICT },
+    // Check that 6-byte and 4-byte encodings of UTF-8 strings do not throw
+    // the preparser off in terms of byte offsets.
+    // 6 byte encoding.
+    { "  'foo\355\240\201\355\260\211';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::SLOPPY },
+    // 4 byte encoding.
+    { "  'foo\360\220\220\212';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::SLOPPY },
+    // 3 byte encoding of \u0fff.
+    { "  'foo\340\277\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::SLOPPY },
+    // Broken 6 byte encoding with missing last byte.
+    { "  'foo\355\240\201\355\211';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::SLOPPY },
+    // Broken 3 byte encoding of \u0fff with missing last byte.
+    { "  'foo\340\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::SLOPPY },
+    // Broken 3 byte encoding of \u0fff with missing 2 last bytes.
+    { "  'foo\340';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::SLOPPY },
+    // Broken 3 byte encoding of \u00ff should be a 2 byte encoding.
+    { "  'foo\340\203\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::SLOPPY },
+    // Broken 3 byte encoding of \u007f should be a 2 byte encoding.
+    { "  'foo\340\201\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::SLOPPY },
+    // Unpaired lead surrogate.
+    { "  'foo\355\240\201';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::SLOPPY },
+    // Unpaired lead surrogate where following code point is a 3 byte sequence.
+    { "  'foo\355\240\201\340\277\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::SLOPPY },
+    // Unpaired lead surrogate where following code point is a 4 byte encoding
+    // of a trail surrogate.
+    { "  'foo\355\240\201\360\215\260\211';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::SLOPPY },
+    // Unpaired trail surrogate.
+    { "  'foo\355\260\211';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::SLOPPY },
+    // 2 byte encoding of \u00ff.
+    { "  'foo\303\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::SLOPPY },
+    // Broken 2 byte encoding of \u00ff with missing last byte.
+    { "  'foo\303';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::SLOPPY },
+    // Broken 2 byte encoding of \u007f should be a 1 byte encoding.
+    { "  'foo\301\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::SLOPPY },
+    // Illegal 5 byte encoding.
+    { "  'foo\370\277\277\277\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::SLOPPY },
+    // Illegal 6 byte encoding.
+    { "  'foo\374\277\277\277\277\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::SLOPPY },
+    // Illegal 0xfe byte
+    { "  'foo\376\277\277\277\277\277\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::SLOPPY },
+    // Illegal 0xff byte
+    { "  'foo\377\277\277\277\277\277\277\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::SLOPPY },
+    { "  'foo';\n"
+      "  (function fun", "(a,b) { 'bar\355\240\201\355\260\213'; }", ")();",
+      i::FUNCTION_SCOPE, i::SLOPPY },
+    { "  'foo';\n"
+      "  (function fun", "(a,b) { 'bar\360\220\220\214'; }", ")();",
+      i::FUNCTION_SCOPE, i::SLOPPY },
+    { NULL, NULL, NULL, i::EVAL_SCOPE, i::SLOPPY }
+  };

  i::Isolate* isolate = CcTest::i_isolate();
  i::Factory* factory = isolate->factory();
--- a/deps/v8/test/unittests/BUILD.gn
+++ b/deps/v8/test/unittests/BUILD.gn
@@ -117,7 +117,6 @@ v8_executable("unittests") {
    "source-position-table-unittest.cc",
    "test-utils.cc",
    "test-utils.h",
-    "unicode-unittest.cc",
    "value-serializer-unittest.cc",
    "wasm/asm-types-unittest.cc",
    "wasm/ast-decoder-unittest.cc",
--- a/deps/v8/test/unittests/unicode-unittest.cc
+++ b/deps/v8/test/unittests/unicode-unittest.cc
@@ -1,39 +0,0 @@
-// Copyright 2016 the V8 project authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include <memory>
-#include <string>
-
-#include "src/unicode-decoder.h"
-#include "testing/gtest/include/gtest/gtest.h"
-
-namespace v8 {
-namespace internal {
-
-namespace {
-
-using Utf8Decoder = unibrow::Utf8Decoder<512>;
-
-void Decode(Utf8Decoder* decoder, const std::string& str) {
-  // Put the string in its own buffer on the heap to make sure that
-  // AddressSanitizer's heap-buffer-overflow logic can see what's going on.
-  std::unique_ptr<char[]> buffer(new char[str.length()]);
-  memcpy(buffer.get(), str.data(), str.length());
-  decoder->Reset(buffer.get(), str.length());
-}
-
-}  // namespace
-
-TEST(UnicodeTest, ReadOffEndOfUtf8String) {
-  Utf8Decoder decoder;
-
-  // Not enough continuation bytes before string ends.
-  Decode(&decoder, "\xE0");
-  Decode(&decoder, "\xED");
-  Decode(&decoder, "\xF0");
-  Decode(&decoder, "\xF4");
-}
-
-}  // namespace internal
-}  // namespace v8
--- a/deps/v8/test/unittests/unittests.gyp
+++ b/deps/v8/test/unittests/unittests.gyp
@@ -115,7 +115,6 @@
      'source-position-table-unittest.cc',
      'test-utils.h',
      'test-utils.cc',
-      'unicode-unittest.cc',
      'value-serializer-unittest.cc',
      'wasm/asm-types-unittest.cc',
      'wasm/ast-decoder-unittest.cc',