// Copyright 2006-2008 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef V8_SCANNER_H_ #define V8_SCANNER_H_ #include "token.h" #include "char-predicates-inl.h" namespace v8 { namespace internal { class UTF8Buffer { public: UTF8Buffer(); ~UTF8Buffer(); void AddChar(uc32 c) { if (cursor_ <= limit_ && static_cast(c) <= unibrow::Utf8::kMaxOneByteChar) { *cursor_++ = static_cast(c); } else { AddCharSlow(c); } } void Reset() { cursor_ = data_; } int pos() const { return cursor_ - data_; } char* data() const { return data_; } private: char* data_; char* cursor_; char* limit_; int Capacity() const { return (limit_ - data_) + unibrow::Utf8::kMaxEncodedSize; } static char* ComputeLimit(char* data, int capacity) { return (data + capacity) - unibrow::Utf8::kMaxEncodedSize; } void AddCharSlow(uc32 c); }; class UTF16Buffer { public: UTF16Buffer(); virtual ~UTF16Buffer() {} virtual void PushBack(uc32 ch) = 0; // returns a value < 0 when the buffer end is reached virtual uc32 Advance() = 0; virtual void SeekForward(int pos) = 0; int pos() const { return pos_; } int size() const { return size_; } Handle SubString(int start, int end); protected: Handle data_; int pos_; int size_; }; class CharacterStreamUTF16Buffer: public UTF16Buffer { public: CharacterStreamUTF16Buffer(); virtual ~CharacterStreamUTF16Buffer() {} void Initialize(Handle data, unibrow::CharacterStream* stream); virtual void PushBack(uc32 ch); virtual uc32 Advance(); virtual void SeekForward(int pos); private: List pushback_buffer_; uc32 last_; unibrow::CharacterStream* stream_; List* pushback_buffer() { return &pushback_buffer_; } }; class TwoByteStringUTF16Buffer: public UTF16Buffer { public: TwoByteStringUTF16Buffer(); virtual ~TwoByteStringUTF16Buffer() {} void Initialize(Handle data); virtual void PushBack(uc32 ch); virtual uc32 Advance(); virtual void SeekForward(int pos); private: const uint16_t* raw_data_; }; class Scanner { public: typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; // Construction explicit Scanner(bool is_pre_parsing); // Initialize the Scanner to scan source: void Init(Handle source, unibrow::CharacterStream* stream, int position); // Returns the next token. Token::Value Next(); // One token look-ahead (past the token returned by Next()). Token::Value peek() const { return next_.token; } // Returns true if there was a line terminator before the peek'ed token. bool has_line_terminator_before_next() const { return has_line_terminator_before_next_; } struct Location { Location(int b, int e) : beg_pos(b), end_pos(e) { } Location() : beg_pos(0), end_pos(0) { } int beg_pos; int end_pos; }; // Returns the location information for the current token // (the token returned by Next()). Location location() const { return current_.location; } Location peek_location() const { return next_.location; } // Returns the literal string, if any, for the current token (the // token returned by Next()). The string is 0-terminated and in // UTF-8 format; they may contain 0-characters. Literal strings are // collected for identifiers, strings, and numbers. const char* literal_string() const { return &literals_.data()[current_.literal_pos]; } int literal_length() const { return current_.literal_end - current_.literal_pos; } Vector next_literal() const { return Vector(next_literal_string(), next_literal_length()); } // Returns the literal string for the next token (the token that // would be returned if Next() were called). const char* next_literal_string() const { return &literals_.data()[next_.literal_pos]; } // Returns the length of the next token (that would be returned if // Next() were called). int next_literal_length() const { return next_.literal_end - next_.literal_pos; } // Scans the input as a regular expression pattern, previous // character(s) must be /(=). Returns true if a pattern is scanned. bool ScanRegExpPattern(bool seen_equal); // Returns true if regexp flags are scanned (always since flags can // be empty). bool ScanRegExpFlags(); // Seek forward to the given position. This operation does not // work in general, for instance when there are pushed back // characters, but works for seeking forward until simple delimiter // tokens, which is what it is used for. void SeekForward(int pos); Handle SubString(int start_pos, int end_pos); bool stack_overflow() { return stack_overflow_; } static StaticResource* utf8_decoder() { return &utf8_decoder_; } // Tells whether the buffer contains an identifier (no escapes). // Used for checking if a property name is an identifier. static bool IsIdentifier(unibrow::CharacterStream* buffer); static unibrow::Predicate kIsIdentifierStart; static unibrow::Predicate kIsIdentifierPart; static unibrow::Predicate kIsLineTerminator; static unibrow::Predicate kIsWhiteSpace; static const int kCharacterLookaheadBufferSize = 1; private: CharacterStreamUTF16Buffer char_stream_buffer_; TwoByteStringUTF16Buffer two_byte_string_buffer_; // Source. UTF16Buffer* source_; int position_; // Buffer to hold literal values (identifiers, strings, numbers) // using 0-terminated UTF-8 encoding. UTF8Buffer literals_; bool stack_overflow_; static StaticResource utf8_decoder_; // One Unicode character look-ahead; c0_ < 0 at the end of the input. uc32 c0_; // The current and look-ahead token. struct TokenDesc { Token::Value token; Location location; int literal_pos, literal_end; }; TokenDesc current_; // desc for current token (as returned by Next()) TokenDesc next_; // desc for next token (one token look-ahead) bool has_line_terminator_before_next_; bool is_pre_parsing_; // Literal buffer support void StartLiteral(); void AddChar(uc32 ch); void AddCharAdvance(); void TerminateLiteral(); // Low-level scanning support. void Advance() { c0_ = source_->Advance(); } void PushBack(uc32 ch) { source_->PushBack(ch); c0_ = ch; } bool SkipWhiteSpace(); Token::Value SkipSingleLineComment(); Token::Value SkipMultiLineComment(); inline Token::Value Select(Token::Value tok); inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_); void Scan(); void ScanDecimalDigits(); Token::Value ScanNumber(bool seen_period); Token::Value ScanIdentifier(); uc32 ScanHexEscape(uc32 c, int length); uc32 ScanOctalEscape(uc32 c, int length); void ScanEscape(); Token::Value ScanString(); // Scans a possible HTML comment -- begins with 'pos() - kCharacterLookaheadBufferSize + position_; } // Decodes a unicode escape-sequence which is part of an identifier. // If the escape sequence cannot be decoded the result is kBadRune. uc32 ScanIdentifierUnicodeEscape(); }; } } // namespace v8::internal #endif // V8_SCANNER_H_