@ -2,224 +2,234 @@
const Buffer = require ( 'buffer' ) . Buffer ;
const Buffer = require ( 'buffer' ) . Buffer ;
function assertEncoding ( encoding ) {
// Do not cache `Buffer.isEncoding` when checking encoding names as some
// Do not cache `Buffer.isEncoding`, some modules monkey-patch it to support
// modules monkey-patch it to support additional encodings
// additional encodings
function normalizeEncoding ( enc ) {
if ( encoding && ! Buffer . isEncoding ( encoding ) ) {
if ( ! enc ) return 'utf8' ;
throw new Error ( 'Unknown encoding: ' + encoding ) ;
var low ;
for ( ; ; ) {
switch ( enc ) {
case 'utf8' :
case 'utf-8' :
return 'utf8' ;
case 'ucs2' :
case 'utf16le' :
case 'ucs-2' :
case 'utf-16le' :
return 'utf16le' ;
case 'base64' :
case 'ascii' :
case 'binary' :
case 'hex' :
return enc ;
default :
if ( low ) {
if ( ! Buffer . isEncoding ( enc ) )
throw new Error ( 'Unknown encoding: ' + enc ) ;
return enc ;
}
low = true ;
enc = ( '' + enc ) . toLowerCase ( ) ;
}
}
}
}
}
// StringDecoder provides an interface for efficiently splitting a series of
// StringDecoder provides an interface for efficiently splitting a series of
// buffers into a series of JS strings without breaking apart multi-byte
// buffers into a series of JS strings without breaking apart multi-byte
// characters. CESU-8 is handled as part of the UTF-8 encoding.
// characters.
//
exports . StringDecoder = StringDecoder ;
// @TODO Handling all encodings inside a single object makes it very difficult
function StringDecoder ( encoding ) {
// to reason about this code, so it should be split up in the future.
this . encoding = normalizeEncoding ( encoding ) ;
// @TODO There should be a utf8-strict encoding that rejects invalid UTF-8 code
var nb ;
// points as used by CESU-8.
const StringDecoder = exports . StringDecoder = function ( encoding ) {
this . encoding = ( encoding || 'utf8' ) . toLowerCase ( ) . replace ( /[-_]/ , '' ) ;
assertEncoding ( encoding ) ;
switch ( this . encoding ) {
switch ( this . encoding ) {
case 'utf8' :
// CESU-8 represents each of Surrogate Pair by 3-bytes
this . surrogateSize = 3 ;
break ;
case 'ucs2' :
case 'utf16le' :
case 'utf16le' :
// UTF-16 represents each of Surrogate Pair by 2-bytes
this . text = utf16Text ;
this . surrogateSize = 2 ;
this . end = utf16End ;
this . detectIncompleteChar = utf16DetectIncompleteChar ;
// fall through
case 'utf8' :
nb = 4 ;
break ;
break ;
case 'base64' :
case 'base64' :
// Base-64 stores 3 bytes in 4 chars, and pads the remainder.
this . text = base64Text ;
this . surrogateSize = 3 ;
this . end = base64End ;
this . detectIncompleteChar = base64DetectIncompleteChar ;
nb = 3 ;
break ;
break ;
default :
default :
this . write = passThroughWrite ;
this . write = simpleWrite ;
this . end = simpleEnd ;
return ;
return ;
}
}
this . lastNeed = 0 ;
this . lastTotal = 0 ;
this . lastChar = Buffer . allocUnsafe ( nb ) ;
}
// Enough space to store all bytes of a single character. UTF-8 needs 4
StringDecoder . prototype . write = function ( buf ) {
// bytes, but CESU-8 may require up to 6 (3 bytes per surrogate).
if ( buf . length === 0 )
this . charBuffer = Buffer . allocUnsafe ( 6 ) ;
return '' ;
// Number of bytes received for the current incomplete multi-byte character.
var r ;
this . charReceived = 0 ;
var i ;
// Number of bytes expected for the current incomplete multi-byte character.
if ( this . lastNeed ) {
this . charLength = 0 ;
r = this . fillLast ( buf ) ;
} ;
if ( r === undefined )
// write decodes the given buffer and returns it as JS string that is
// guaranteed to not contain any partial multi-byte characters. Any partial
// character found at the end of the buffer is buffered up, and will be
// returned when calling write again with the remaining bytes.
//
// Note: Converting a Buffer containing an orphan surrogate to a String
// currently works, but converting a String to a Buffer (via `Buffer.from()`,
// or Buffer#write) will replace incomplete surrogates with the unicode
// replacement character. See https://codereview.chromium.org/121173009/ .
StringDecoder . prototype . write = function ( buffer ) {
var charStr = '' ;
var buflen = buffer . length ;
var charBuffer = this . charBuffer ;
var charLength = this . charLength ;
var charReceived = this . charReceived ;
var surrogateSize = this . surrogateSize ;
var encoding = this . encoding ;
var charCode ;
// if our last write ended with an incomplete multibyte character
while ( charLength ) {
// determine how many remaining bytes this buffer has to offer for this char
var diff = charLength - charReceived ;
var available = ( buflen >= diff ) ? diff : buflen ;
// add the new bytes to the char buffer
buffer . copy ( charBuffer , charReceived , 0 , available ) ;
charReceived += available ;
if ( charReceived < charLength ) {
// still not enough chars in this buffer? wait for more ...
this . charLength = charLength ;
this . charReceived = charReceived ;
return '' ;
return '' ;
}
i = this . lastNeed ;
this . lastNeed = 0 ;
// remove bytes belonging to the current character from the buffer
} else {
buffer = buffer . slice ( available , buflen ) ;
i = 0 ;
buflen = buffer . length ;
}
if ( i < buf . length )
// get the character that was split
return ( r ? r + this . text ( buf , i ) : this . text ( buf , i ) ) ;
charStr = charBuffer . toString ( encoding , 0 , charLength ) ;
return r || '' ;
} ;
// CESU-8: lead surrogate (D800-DBFF) is also the incomplete character
StringDecoder . prototype . end = utf8End ;
charCode = charStr . charCodeAt ( charStr . length - 1 ) ;
if ( charCode >= 0xD800 && charCode <= 0xDBFF ) {
charLength += surrogateSize ;
charStr = '' ;
continue ;
}
charReceived = charLength = 0 ;
// if there are no more bytes in this buffer, just emit our char
// Returns only complete characters in a Buffer
if ( buflen === 0 ) {
StringDecoder . prototype . text = utf8Text ;
this . charLength = charLength ;
this . charReceived = charReceived ;
return charStr ;
// Attempts to complete a partial character using bytes from a Buffer
}
StringDecoder . prototype . fillLast = function ( buf ) {
if ( this . lastNeed <= buf . length ) {
buf . copy ( this . lastChar , this . lastTotal - this . lastNeed , 0 , this . lastNeed ) ;
return this . lastChar . toString ( this . encoding , 0 , this . lastTotal ) ;
}
}
buf . copy ( this . lastChar , this . lastTotal - this . lastNeed , 0 , buf . length ) ;
this . lastNeed -= buf . length ;
} ;
// determine and set charLength / charReceived
// Checks the type of a UTF-8 byte, whether it's ASCII, a leading byte, or a
if ( this . detectIncompleteChar ( buffer ) )
// continuation byte.
charLength = this . charLength ;
function utf8CheckByte ( byte ) {
charReceived = this . charReceived ;
if ( byte <= 0x7F )
return 0 ;
else if ( byte >> 5 === 0x06 )
return 2 ;
else if ( byte >> 4 === 0x0E )
return 3 ;
else if ( byte >> 3 === 0x1E )
return 4 ;
return - 1 ;
}
var end = buflen ;
// Checks at most the last 3 bytes of a Buffer for an incomplete UTF-8
if ( charLength ) {
// character, returning the total number of bytes needed to complete the partial
// buffer the incomplete character bytes we got
// character (if applicable).
buffer . copy ( charBuffer , 0 , buflen - charReceived , end ) ;
function utf8CheckIncomplete ( self , buf , i ) {
end -= charReceived ;
var j = buf . length - 1 ;
if ( j < i )
return 0 ;
var nb = utf8CheckByte ( buf [ j -- ] ) ;
if ( nb >= 0 ) {
if ( nb > 0 )
self . lastNeed = nb + 1 - ( buf . length - j ) ;
return nb ;
}
}
if ( j < i )
this . charLength = charLength ;
return 0 ;
charStr += buffer . toString ( encoding , 0 , end ) ;
nb = utf8CheckByte ( buf [ j -- ] ) ;
if ( nb >= 0 ) {
end = charStr . length - 1 ;
if ( nb > 0 )
charCode = charStr . charCodeAt ( end ) ;
self . lastNeed = nb + 1 - ( buf . length - j ) ;
// CESU-8: lead surrogate (D800-DBFF) is also the incomplete character
return nb ;
if ( charCode >= 0xD800 && charCode <= 0xDBFF ) {
charLength += surrogateSize ;
charReceived += surrogateSize ;
charBuffer . copy ( charBuffer , surrogateSize , 0 , surrogateSize ) ;
buffer . copy ( charBuffer , 0 , 0 , surrogateSize ) ;
this . charLength = charLength ;
this . charReceived = charReceived ;
return charStr . substring ( 0 , end ) ;
}
}
if ( j < i )
return 0 ;
nb = utf8CheckByte ( buf [ j -- ] ) ;
if ( nb >= 0 ) {
if ( nb > 0 )
self . lastNeed = nb + 1 - ( buf . length - j ) ;
return nb ;
}
return 0 ;
}
// or just emit the charStr
// Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a
return charStr ;
// partial character, the character's bytes are buffered until the required
} ;
// number of bytes are available.
function utf8Text ( buf , i ) {
// detectIncompleteChar determines if there is an incomplete UTF-8 character at
const total = utf8CheckIncomplete ( this , buf , i ) ;
// the end of the given buffer. If so, it sets this.charLength to the byte
if ( ! this . lastNeed )
// length that character, and sets this.charReceived to the number of bytes
return buf . toString ( 'utf8' , i ) ;
// that are available for this character.
this . lastTotal = total ;
StringDecoder . prototype . detectIncompleteChar = function ( buffer ) {
const end = buf . length - ( total - this . lastNeed ) ;
var buflen = buffer . length ;
buf . copy ( this . lastChar , 0 , end ) ;
// determine how many bytes we have to check at the end of this buffer
return buf . toString ( 'utf8' , i , end ) ;
var i = ( buflen >= 3 ) ? 3 : buflen ;
}
var newlen = false ;
// Figure out if one of the last i bytes of our buffer announces an
// incomplete char.
for ( ; i > 0 ; i -- ) {
var c = buffer [ buflen - i ] ;
// See http://en.wikipedia.org/wiki/UTF-8#Description
// 110XXXXX
if ( i === 1 && c >> 5 === 0x06 ) {
this . charLength = 2 ;
newlen = true ;
break ;
}
// 1110XXXX
// For UTF-8, a replacement character for each buffered byte of a (partial)
if ( i <= 2 && c >> 4 === 0x0E ) {
// character needs to be added to the output.
this . charLength = 3 ;
function utf8End ( buf ) {
newlen = true ;
const r = ( buf && buf . length ? this . write ( buf ) : '' ) ;
break ;
if ( this . lastNeed )
}
return r + '\ufffd' . repeat ( this . lastTotal - this . lastNeed ) ;
return r ;
}
// 11110XXX
// UTF-16LE typically needs two bytes per character, but even if we have an even
if ( i <= 3 && c >> 3 === 0x1E ) {
// number of bytes available, we need to check if we end on a leading/high
this . charLength = 4 ;
// surrogate. In that case, we need to wait for the next two bytes in order to
newlen = true ;
// decode the last character properly.
break ;
function utf16Text ( buf , i ) {
if ( ( buf . length - i ) % 2 === 0 ) {
const r = buf . toString ( 'utf16le' , i ) ;
if ( r ) {
const c = r . charCodeAt ( r . length - 1 ) ;
if ( c >= 0xD800 && c <= 0xDBFF ) {
this . lastNeed = 2 ;
this . lastTotal = 4 ;
this . lastChar [ 0 ] = buf [ buf . length - 2 ] ;
this . lastChar [ 1 ] = buf [ buf . length - 1 ] ;
return r . slice ( 0 , - 1 ) ;
}
}
}
return r ;
}
}
this . lastNeed = 1 ;
this . lastTotal = 2 ;
this . lastChar [ 0 ] = buf [ buf . length - 1 ] ;
return buf . toString ( 'utf16le' , i , buf . length - 1 ) ;
}
this . charReceived = i ;
// For UTF-16LE we do not explicitly append special replacement characters if we
// end on a partial character, we simply let v8 handle that.
return newlen ;
function utf16End ( buf ) {
} ;
const r = ( buf && buf . length ? this . write ( buf ) : '' ) ;
if ( this . lastNeed ) {
const end = this . lastTotal - this . lastNeed ;
return r + this . lastChar . toString ( 'utf16le' , 0 , end ) ;
}
return r ;
}
StringDecoder . prototype . end = function ( buffer ) {
function base64Text ( buf , i ) {
var res = '' ;
const n = ( buf . length - i ) % 3 ;
if ( buffer && buffer . length )
if ( n === 0 )
res = this . write ( buffer ) ;
return buf . toString ( 'base64' , i ) ;
this . lastNeed = 3 - n ;
var charReceived = this . charReceived ;
this . lastTotal = 3 ;
if ( charReceived ) {
if ( n === 1 ) {
var cr = charReceived ;
this . lastChar [ 0 ] = buf [ buf . length - 1 ] ;
var buf = this . charBuffer ;
} else {
var enc = this . encoding ;
this . lastChar [ 0 ] = buf [ buf . length - 2 ] ;
res += buf . toString ( enc , 0 , cr ) ;
this . lastChar [ 1 ] = buf [ buf . length - 1 ] ;
}
}
return buf . toString ( 'base64' , i , buf . length - n ) ;
}
return res ;
} ;
function passThroughWrite ( buffer ) {
function base64End ( buf ) {
return buffer . toString ( this . encoding ) ;
const r = ( buf && buf . length ? this . write ( buf ) : '' ) ;
if ( this . lastNeed )
return r + this . lastChar . toString ( 'base64' , 0 , 3 - this . lastNeed ) ;
return r ;
}
}
function utf16DetectIncompleteChar ( buffer ) {
// Pass bytes on through for single-byte encodings (e.g. ascii, binary, hex)
var charReceived = this . charReceived = buffer . length % 2 ;
function simpleWrite ( buf ) {
this . charLength = charReceived ? 2 : 0 ;
return buf . toString ( this . encoding ) ;
return true ;
}
}
function base64DetectIncompleteChar ( buffer ) {
function simpleEnd ( buf ) {
var charReceived = this . charReceived = buffer . length % 3 ;
return ( buf && buf . length ? this . write ( buf ) : '' ) ;
this . charLength = charReceived ? 3 : 0 ;
return true ;
}
}