You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

210 lines
7.7 KiB

/*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* António Afonso (antonio.afonso gmail.com) - port to JavaScript
* Mark Pilgrim - port to Python
* Shy Shalom - original C code
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA
*/
/**
* This is a port from the python port, version "2.0.1"
*/
!function(jschardet) {
jschardet.UniversalDetector = function() {
// Confidence floor taken from the shared constants; close() only reports the
// best prober when its confidence is strictly greater than this value.
var MINIMUM_THRESHOLD = jschardet.Constants.MINIMUM_THRESHOLD;
// Classification of the input seen so far; the current value is kept in
// this._mInputState.
var _state = {
// Starting state assigned by reset(): feed() has matched neither detector yet.
pureAscii : 0,
// feed() matched _escDetector (an ESC character or the sequence "~{").
escAscii : 1,
// feed() matched _highBitDetector (a character in the \x80-\xFF range).
highbyte : 2
};
// Captured so the nested init() function can reach the instance being built.
var self = this;
/**
* One-time setup for a new detector instance: installs the two regular
* expressions used by feed() to classify input, creates empty prober slots,
* and then calls reset() to put every other field into its starting state.
*/
function init() {
// Matches any character in the \x80-\xFF range.
self._highBitDetector = /[\x80-\xFF]/;
// Matches an ESC character (\x1B) or the two-character sequence "~{".
self._escDetector = /(\x1B|~\{)/;
// Probers are created on demand inside feed(); these empty values must be in
// place before reset() runs because reset() inspects both fields.
self._mEscCharsetProber = null;
self._mCharsetProbers = [];
self.reset();
}
/**
 * Put the detector back into its starting state so the same instance can be
 * used on a new input.
 *
 * Clears the previous result, the `done` flag and the accumulated BOM
 * characters, returns the input classification to pure ASCII, and resets
 * every prober that has already been created.
 */
this.reset = function() {
    // Detection outcome and progress flags.
    this.result = {"encoding": null, "confidence": 0.0};
    this.done = false;
    this._mStart = true;
    this._mGotData = false;

    // Per-input scanning state used by feed().
    this._mInputState = _state.pureAscii;
    this._mLastChar = "";
    this._mBOM = "";

    // The escape-sequence prober only exists once feed() has needed it.
    var escProber = this._mEscCharsetProber;
    if( escProber ) {
        escProber.reset();
    }

    // Walk the prober list until the first empty slot (an empty list is a no-op).
    var probers = this._mCharsetProbers;
    for( var idx = 0; probers[idx]; idx++ ) {
        probers[idx].reset();
    }
};
/**
 * Feed the next chunk of input to the detector.
 *
 * While fewer than four characters have been accumulated, the chunk is
 * appended to `_mBOM` and the accumulated prefix is compared against the
 * known byte-order marks; a match settles `result` with confidence 1.0 and
 * marks the detector as done. Otherwise the chunk is classified (pure ASCII,
 * escape sequence, or high-bit characters) and passed to the escape prober
 * or to the group probers, which are created on first use.
 *
 * The call is ignored when `done` is already set or the chunk is empty.
 *
 * @param {string} aBuf Next chunk of input. NOTE(review): the code
 *     concatenates it with strings and compares against "\xNN" literals, so
 *     it is presumably a string holding one character per input byte -
 *     confirm against callers.
 */
this.feed = function(aBuf) {
    if( this.done || !aBuf.length ) return;

    if( !this._mGotData ) {
        // Ordered by precedence: the four-character marks come before the
        // two-character ones because "\xFF\xFE" and "\xFE\xFF" are prefixes
        // of two of them.
        var bomTable = [
            // EF BB BF       UTF-8 with BOM
            ["\xEF\xBB\xBF", "UTF-8"],
            // FF FE 00 00    UTF-32, little-endian BOM
            ["\xFF\xFE\x00\x00", "UTF-32LE"],
            // 00 00 FE FF    UTF-32, big-endian BOM
            ["\x00\x00\xFE\xFF", "UTF-32BE"],
            // FE FF 00 00    UCS-4, unusual octet order BOM (3412)
            ["\xFE\xFF\x00\x00", "X-ISO-10646-UCS-4-3412"],
            // 00 00 FF FE    UCS-4, unusual octet order BOM (2143)
            ["\x00\x00\xFF\xFE", "X-ISO-10646-UCS-4-2143"],
            // FF FE          UTF-16, little endian BOM
            ["\xFF\xFE", "UTF-16LE"],
            // FE FF          UTF-16, big endian BOM
            ["\xFE\xFF", "UTF-16BE"]
        ];

        this._mBOM += aBuf;
        var head = this._mBOM;
        for( var b = 0; b < bomTable.length; b++ ) {
            var signature = bomTable[b][0];
            if( head.slice(0, signature.length) === signature ) {
                this.result = {"encoding": bomTable[b][1], "confidence": 1.0};
                break;
            }
        }

        // Once four characters have been seen without a decision there is
        // nothing more a BOM check could find, so stop accumulating.
        if( head.length > 3 ) {
            this._mGotData = true;
        }
    }

    // A result with positive confidence (e.g. from the BOM check above) ends detection.
    if( this.result.encoding && (this.result.confidence > 0.0) ) {
        this.done = true;
        return;
    }

    // Leave the pure-ASCII state as soon as the chunk shows otherwise. The
    // escape check is only made when no high-bit character is present, and it
    // includes the previous chunk's last character so that a "~{" split
    // across two chunks is still seen.
    if( this._mInputState == _state.pureAscii ) {
        if( this._highBitDetector.test(aBuf) ) {
            this._mInputState = _state.highbyte;
        } else if( this._escDetector.test(this._mLastChar + aBuf) ) {
            this._mInputState = _state.escAscii;
        }
    }
    this._mLastChar = aBuf.slice(-1);

    if( this._mInputState == _state.escAscii ) {
        var escProber = this._mEscCharsetProber;
        if( !escProber ) {
            escProber = this._mEscCharsetProber = new jschardet.EscCharSetProber();
        }
        if( escProber.feed(aBuf) == jschardet.Constants.foundIt ) {
            this.result = {
                "encoding": escProber.getCharsetName(),
                "confidence": escProber.getConfidence()
            };
            this.done = true;
        }
        return;
    }

    // Still pure ASCII: nothing to probe for this chunk.
    if( this._mInputState != _state.highbyte ) return;

    if( this._mCharsetProbers.length === 0 ) {
        this._mCharsetProbers = [
            new jschardet.MBCSGroupProber(),
            new jschardet.SBCSGroupProber(),
            new jschardet.Latin1Prober()
        ];
    }

    // Offer the chunk to each prober in turn; the first one that reports a
    // definite match settles the result.
    var probers = this._mCharsetProbers;
    for( var p = 0; probers[p]; p++ ) {
        var candidate = probers[p];
        if( candidate.feed(aBuf) == jschardet.Constants.foundIt ) {
            this.result = {
                "encoding": candidate.getCharsetName(),
                "confidence": candidate.getConfidence()
            };
            this.done = true;
            break;
        }
    }
};
/**
 * Finish detection after the last feed() call and settle `result`.
 *
 * Does nothing when the detector is already done, or when no characters were
 * ever accumulated by feed(). Otherwise the detector is marked done and:
 *  - input that stayed pure ASCII is reported as "ascii" with confidence 1.0;
 *  - for high-bit input, the prober with the highest confidence is reported,
 *    provided that confidence is strictly greater than MINIMUM_THRESHOLD;
 *  - in every other case `result` is left untouched.
 *
 * @returns {Object|undefined} `this.result` when one of the first two cases
 *     above applied; `undefined` on every other path.
 */
this.close = function() {
    if( this.done ) return;

    if( this._mBOM.length === 0 ) {
        if( jschardet.Constants._debug ) {
            jschardet.log("no data received!\n");
        }
        return;
    }

    this.done = true;
    var state = this._mInputState;

    if( state == _state.pureAscii ) {
        if( jschardet.Constants._debug ) {
            jschardet.log("pure ascii")
        }
        this.result = {"encoding": "ascii", "confidence": 1.0};
        return this.result;
    }

    if( state == _state.highbyte ) {
        var best = null;
        var bestScore = 0.0;
        var ranked = this._mCharsetProbers;
        for( var k = 0; ranked[k]; k++ ) {
            var candidate = ranked[k];
            var score = candidate.getConfidence();
            // Strict comparison: on a tie the earlier prober in the list wins.
            if( score > bestScore ) {
                bestScore = score;
                best = candidate;
            }
            if( jschardet.Constants._debug ) {
                jschardet.log(candidate.getCharsetName() + " confidence " + candidate.getConfidence());
            }
        }
        if( best && bestScore > MINIMUM_THRESHOLD ) {
            this.result = {
                "encoding": best.getCharsetName(),
                "confidence": best.getConfidence()
            };
            return this.result;
        }
    }

    // Reached when no prober was good enough, or when the input was in the
    // escape-sequence state and feed() never settled a result.
    if( jschardet.Constants._debug ) {
        jschardet.log("no probers hit minimum threshhold\n");
        var losers = this._mCharsetProbers;
        for( var m = 0; losers[m]; m++ ) {
            jschardet.log(losers[m].getCharsetName() + " confidence = " +
                losers[m].getConfidence() + "\n");
        }
    }
};
init();
}
}(require('./init'));