123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141 |
var util = require('util'),
  Match = require('../match');
/**
 * This is a superclass for the individual detectors for
 * each of the detectable members of the ISO 2022 family
 * of encodings.
 *
 * Subclasses are expected to provide `this.escapeSequences`, an array of
 * byte arrays, each starting with ESC (0x1b).
 */
function ISO_2022() {}

/**
 * Tests whether the escape sequence `seq` occurs in `text` at offset `pos`.
 * The caller has already verified that text[pos] is ESC, so the comparison
 * starts at the second byte of the sequence.
 *
 * @param text    the byte buffer being analysed
 * @param textLen the number of valid bytes in `text`
 * @param pos     offset of the ESC byte in `text`
 * @param seq     the candidate escape sequence (array of byte values)
 * @return true if every byte of `seq` fits inside the text and matches
 */
function iso2022SequenceMatchesAt(text, textLen, pos, seq) {
  // An empty sequence can never be a real match; a sequence that would run
  // past the end of the sample cannot match either.
  if (seq.length === 0 || textLen - pos < seq.length) {
    return false;
  }
  for (var j = 1; j < seq.length; j++) {
    if (seq[j] !== text[pos + j]) {
      return false;
    }
  }
  return true;
}

/**
 * Matching function shared among the 2022 detectors JP, CN and KR.
 * Counts up the number of legal and unrecognized escape sequences in
 * the sample of text, and computes a score based on the total number &
 * the proportion that fit the encoding.
 *
 * @param det the detector input; `det.fInputBytes` is the byte buffer
 *            containing the text to analyse and `det.fInputLen` is the
 *            number of bytes to examine.
 * @return a Match built from (det, this, quality) when the quality is
 *         positive, otherwise null. Quality is at most 100.
 */
ISO_2022.prototype.match = function(det) {
  var text = det.fInputBytes;
  var textLen = det.fInputLen;
  var sequences = this.escapeSequences;

  var hits = 0;   // ESC bytes that start a recognized sequence
  var misses = 0; // ESC bytes that start no recognized sequence
  var shifts = 0; // Shift-in / shift-out control bytes

  for (var i = 0; i < textLen; i++) {
    var current = text[i];

    if (current === 0x1b) {
      var matched = null;
      for (var escN = 0; escN < sequences.length; escN++) {
        if (iso2022SequenceMatchesAt(text, textLen, i, sequences[escN])) {
          matched = sequences[escN];
          break;
        }
      }

      if (matched !== null) {
        hits++;
        // Skip the rest of the sequence; the loop increment steps past
        // its final byte.
        i += matched.length - 1;
      } else {
        misses++;
      }
      continue;
    }

    // Shift in/out
    if (current === 0x0e || current === 0x0f) {
      shifts++;
    }
  }

  if (hits === 0) {
    return null;
  }

  //
  // Initial quality is based on relative proportion of recognized vs.
  // unrecognized escape sequences.
  //   All good: quality = 100;
  //   half or less good: quality = 0;
  //   linear in between.
  var quality = (100 * hits - 100 * misses) / (hits + misses);

  // Back off quality if there were too few escape sequences seen.
  // Include shifts in this computation, so that KR does not get penalized
  // for having only a single Escape sequence, but many shifts.
  if (hits + shifts < 5) {
    quality -= (5 - (hits + shifts)) * 10;
  }

  return quality <= 0 ? null : new Match(det, this, quality);
};
/**
 * Recognizer for ISO-2022-JP.
 *
 * Each instance carries its encoding name and the table of escape
 * sequences that ISO_2022.prototype.match treats as recognized.
 */
module.exports.ISO_2022_JP = function() {
  /** @return the name of the encoding this recognizer detects. */
  this.name = function() {
    return 'ISO-2022-JP';
  };

  // Escape sequences counted as hits for this encoding. Every entry
  // starts with ESC (0x1b).
  this.escapeSequences = [
    [ 0x1b, 0x24, 0x28, 0x43 ], // KS X 1001:1992
    [ 0x1b, 0x24, 0x28, 0x44 ], // JIS X 0212-1990
    [ 0x1b, 0x24, 0x40 ],       // JIS C 6226-1978
    [ 0x1b, 0x24, 0x41 ],       // GB 2312-80
    [ 0x1b, 0x24, 0x42 ],       // JIS X 0208-1983
    [ 0x1b, 0x26, 0x40 ],       // JIS X 0208 1990, 1997
    [ 0x1b, 0x28, 0x42 ],       // ASCII
    [ 0x1b, 0x28, 0x48 ],       // JIS-Roman
    [ 0x1b, 0x28, 0x49 ],       // Half-width katakana
    [ 0x1b, 0x28, 0x4a ],       // JIS-Roman
    [ 0x1b, 0x2e, 0x41 ],       // ISO 8859-1
    [ 0x1b, 0x2e, 0x46 ]        // ISO 8859-7
  ];
};
// Inherit the shared match() implementation.
util.inherits(module.exports.ISO_2022_JP, ISO_2022);
/**
 * Recognizer for ISO-2022-KR.
 *
 * Each instance carries its encoding name and the table of escape
 * sequences that ISO_2022.prototype.match treats as recognized.
 */
module.exports.ISO_2022_KR = function() {
  /** @return the name of the encoding this recognizer detects. */
  this.name = function() {
    return 'ISO-2022-KR';
  };

  // Only one escape sequence is recognized for this encoding; the shared
  // matcher also counts shift bytes (0x0e / 0x0f) so that a single escape
  // followed by many shifts is not penalized.
  this.escapeSequences = [
    [ 0x1b, 0x24, 0x29, 0x43 ]
  ];
};
// Inherit the shared match() implementation.
util.inherits(module.exports.ISO_2022_KR, ISO_2022);
/**
 * Recognizer for ISO-2022-CN.
 *
 * Each instance carries its encoding name and the table of escape
 * sequences that ISO_2022.prototype.match treats as recognized.
 */
module.exports.ISO_2022_CN = function() {
  /** @return the name of the encoding this recognizer detects. */
  this.name = function() {
    return 'ISO-2022-CN';
  };

  // Escape sequences counted as hits for this encoding. Every entry
  // starts with ESC (0x1b).
  this.escapeSequences = [
    [ 0x1b, 0x24, 0x29, 0x41 ], // GB 2312-80
    [ 0x1b, 0x24, 0x29, 0x47 ], // CNS 11643-1992 Plane 1
    [ 0x1b, 0x24, 0x2A, 0x48 ], // CNS 11643-1992 Plane 2
    [ 0x1b, 0x24, 0x29, 0x45 ], // ISO-IR-165
    [ 0x1b, 0x24, 0x2B, 0x49 ], // CNS 11643-1992 Plane 3
    [ 0x1b, 0x24, 0x2B, 0x4A ], // CNS 11643-1992 Plane 4
    [ 0x1b, 0x24, 0x2B, 0x4B ], // CNS 11643-1992 Plane 5
    [ 0x1b, 0x24, 0x2B, 0x4C ], // CNS 11643-1992 Plane 6
    [ 0x1b, 0x24, 0x2B, 0x4D ], // CNS 11643-1992 Plane 7
    [ 0x1b, 0x4e ],             // SS2
    [ 0x1b, 0x4f ]              // SS3
  ];
};
// Inherit the shared match() implementation.
util.inherits(module.exports.ISO_2022_CN, ISO_2022);
|