mirror of https://github.com/lukechilds/node.git
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
210 lines
7.7 KiB
210 lines
7.7 KiB
/*
 * The Original Code is Mozilla Universal charset detector code.
 *
 * The Initial Developer of the Original Code is
 * Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 2001
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *   António Afonso (antonio.afonso gmail.com) - port to JavaScript
 *   Mark Pilgrim - port to Python
 *   Shy Shalom - original C code
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301  USA
 */

/**
 * This is a port of the Python port, version "2.0.1".
 */

!function(jschardet) {
|
|
|
|
jschardet.UniversalDetector = function() {
|
|
var MINIMUM_THRESHOLD = jschardet.Constants.MINIMUM_THRESHOLD;
|
|
var _state = {
|
|
pureAscii : 0,
|
|
escAscii : 1,
|
|
highbyte : 2
|
|
};
|
|
var self = this;
|
|
|
|
function init() {
|
|
self._highBitDetector = /[\x80-\xFF]/;
|
|
self._escDetector = /(\x1B|~\{)/;
|
|
self._mEscCharsetProber = null;
|
|
self._mCharsetProbers = [];
|
|
self.reset();
|
|
}
|
|
|
|
this.reset = function() {
|
|
this.result = {"encoding": null, "confidence": 0.0};
|
|
this.done = false;
|
|
this._mStart = true;
|
|
this._mGotData = false;
|
|
this._mInputState = _state.pureAscii;
|
|
this._mLastChar = "";
|
|
this._mBOM = "";
|
|
if( this._mEscCharsetProber ) {
|
|
this._mEscCharsetProber.reset();
|
|
}
|
|
for( var i = 0, prober; prober = this._mCharsetProbers[i]; i++ ) {
|
|
prober.reset();
|
|
}
|
|
}
|
|
|
|
this.feed = function(aBuf) {
|
|
if( this.done ) return;
|
|
|
|
var aLen = aBuf.length;
|
|
if( !aLen ) return;
|
|
|
|
if( !this._mGotData ) {
|
|
this._mBOM += aBuf;
|
|
// If the data starts with BOM, we know it is UTF
|
|
if( this._mBOM.slice(0,3) == "\xEF\xBB\xBF" ) {
|
|
// EF BB BF UTF-8 with BOM
|
|
this.result = {"encoding": "UTF-8", "confidence": 1.0};
|
|
} else if( this._mBOM.slice(0,4) == "\xFF\xFE\x00\x00" ) {
|
|
// FF FE 00 00 UTF-32, little-endian BOM
|
|
this.result = {"encoding": "UTF-32LE", "confidence": 1.0};
|
|
} else if( this._mBOM.slice(0,4) == "\x00\x00\xFE\xFF" ) {
|
|
// 00 00 FE FF UTF-32, big-endian BOM
|
|
this.result = {"encoding": "UTF-32BE", "confidence": 1.0};
|
|
} else if( this._mBOM.slice(0,4) == "\xFE\xFF\x00\x00" ) {
|
|
// FE FF 00 00 UCS-4, unusual octet order BOM (3412)
|
|
this.result = {"encoding": "X-ISO-10646-UCS-4-3412", "confidence": 1.0};
|
|
} else if( this._mBOM.slice(0,4) == "\x00\x00\xFF\xFE" ) {
|
|
// 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
|
this.result = {"encoding": "X-ISO-10646-UCS-4-2143", "confidence": 1.0};
|
|
} else if( this._mBOM.slice(0,2) == "\xFF\xFE" ) {
|
|
// FF FE UTF-16, little endian BOM
|
|
this.result = {"encoding": "UTF-16LE", "confidence": 1.0};
|
|
} else if( this._mBOM.slice(0,2) == "\xFE\xFF" ) {
|
|
// FE FF UTF-16, big endian BOM
|
|
this.result = {"encoding": "UTF-16BE", "confidence": 1.0};
|
|
}
|
|
|
|
// If we got to 4 chars without being able to detect a BOM we
|
|
// stop trying.
|
|
if( this._mBOM.length > 3 ) {
|
|
this._mGotData = true;
|
|
}
|
|
}
|
|
|
|
if( this.result.encoding && (this.result.confidence > 0.0) ) {
|
|
this.done = true;
|
|
return;
|
|
}
|
|
|
|
if( this._mInputState == _state.pureAscii ) {
|
|
if( this._highBitDetector.test(aBuf) ) {
|
|
this._mInputState = _state.highbyte;
|
|
} else if( this._escDetector.test(this._mLastChar + aBuf) ) {
|
|
this._mInputState = _state.escAscii;
|
|
}
|
|
}
|
|
|
|
this._mLastChar = aBuf.slice(-1);
|
|
|
|
if( this._mInputState == _state.escAscii ) {
|
|
if( !this._mEscCharsetProber ) {
|
|
this._mEscCharsetProber = new jschardet.EscCharSetProber();
|
|
}
|
|
if( this._mEscCharsetProber.feed(aBuf) == jschardet.Constants.foundIt ) {
|
|
this.result = {
|
|
"encoding": this._mEscCharsetProber.getCharsetName(),
|
|
"confidence": this._mEscCharsetProber.getConfidence()
|
|
};
|
|
this.done = true;
|
|
}
|
|
} else if( this._mInputState == _state.highbyte ) {
|
|
if( this._mCharsetProbers.length == 0 ) {
|
|
this._mCharsetProbers = [
|
|
new jschardet.MBCSGroupProber(),
|
|
new jschardet.SBCSGroupProber(),
|
|
new jschardet.Latin1Prober()
|
|
];
|
|
}
|
|
for( var i = 0, prober; prober = this._mCharsetProbers[i]; i++ ) {
|
|
if( prober.feed(aBuf) == jschardet.Constants.foundIt ) {
|
|
this.result = {
|
|
"encoding": prober.getCharsetName(),
|
|
"confidence": prober.getConfidence()
|
|
};
|
|
this.done = true;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
this.close = function() {
|
|
if( this.done ) return;
|
|
if( this._mBOM.length === 0 ) {
|
|
if( jschardet.Constants._debug ) {
|
|
jschardet.log("no data received!\n");
|
|
}
|
|
return;
|
|
}
|
|
this.done = true;
|
|
|
|
if( this._mInputState == _state.pureAscii ) {
|
|
if( jschardet.Constants._debug ) {
|
|
jschardet.log("pure ascii")
|
|
}
|
|
this.result = {"encoding": "ascii", "confidence": 1.0};
|
|
return this.result;
|
|
}
|
|
|
|
if( this._mInputState == _state.highbyte ) {
|
|
var proberConfidence = null;
|
|
var maxProberConfidence = 0.0;
|
|
var maxProber = null;
|
|
for( var i = 0, prober; prober = this._mCharsetProbers[i]; i++ ) {
|
|
if( !prober ) continue;
|
|
proberConfidence = prober.getConfidence();
|
|
if( proberConfidence > maxProberConfidence ) {
|
|
maxProberConfidence = proberConfidence;
|
|
maxProber = prober;
|
|
}
|
|
if( jschardet.Constants._debug ) {
|
|
jschardet.log(prober.getCharsetName() + " confidence " + prober.getConfidence());
|
|
}
|
|
}
|
|
if( maxProber && maxProberConfidence > MINIMUM_THRESHOLD ) {
|
|
this.result = {
|
|
"encoding": maxProber.getCharsetName(),
|
|
"confidence": maxProber.getConfidence()
|
|
};
|
|
return this.result;
|
|
}
|
|
}
|
|
|
|
if( jschardet.Constants._debug ) {
|
|
jschardet.log("no probers hit minimum threshhold\n");
|
|
for( var i = 0, prober; prober = this._mCharsetProbers[i]; i++ ) {
|
|
if( !prober ) continue;
|
|
jschardet.log(prober.getCharsetName() + " confidence = " +
|
|
prober.getConfidence() + "\n");
|
|
}
|
|
}
|
|
}
|
|
|
|
init();
|
|
}
|
|
|
|
}(require('./init'));
|
|
|