iso2022.js 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
  1. var util = require('util'),
  2. Match = require ('../match');
  /**
   * Common base for the recognizers of the individual ISO 2022 encodings
   * defined in this file (JP, KR and CN).
   *
   * A concrete recognizer is expected to provide `escapeSequences`, an array
   * of byte arrays, which `match` reads from `this`.
   */
  function ISO_2022() {}

  /**
   * Scores the input against this recognizer's escape sequences.
   *
   * Each ESC byte (0x1b) found in the input is compared with every entry of
   * `this.escapeSequences`. A sequence that matches counts as recognized and
   * its bytes are skipped; an ESC that matches nothing counts as
   * unrecognized. Shift bytes (0x0e / 0x0f) outside recognized sequences are
   * tallied separately.
   *
   * @param det object whose `fInputBytes` holds the bytes to scan and whose
   *   `fInputLen` gives how many of them to look at.
   * @return a `Match` constructed from `det`, this recognizer and the
   *   computed quality, or `null` when no sequence was recognized or the
   *   quality is not positive.
   */
  ISO_2022.prototype.match = function(det) {
    var bytes = det.fInputBytes;
    var byteCount = det.fInputLen;

    var recognized = 0;
    var unrecognized = 0;
    var shiftCount = 0;

    for (var pos = 0; pos < byteCount; pos++) {
      if (bytes[pos] === 0x1b) {
        // Try the known sequences in order; the first one that fits wins.
        var matched = null;
        for (var n = 0; matched === null && n < this.escapeSequences.length; n++) {
          var candidate = this.escapeSequences[n];
          // Only candidates that fit into the remaining input are compared.
          if (byteCount - pos >= candidate.length) {
            // Index 0 is the ESC byte that was already seen.
            var k = 1;
            while (k < candidate.length && candidate[k] === bytes[pos + k]) {
              k++;
            }
            if (k >= candidate.length) {
              matched = candidate;
            }
          }
        }

        if (matched === null) {
          unrecognized++;
        } else {
          recognized++;
          // Land on the last byte of the sequence; the loop step moves past it.
          pos += matched.length - 1;
        }
      } else if (bytes[pos] === 0x0e || bytes[pos] === 0x0f) {
        // Shift in/out
        shiftCount++;
      }
    }

    if (recognized === 0) {
      return null;
    }

    // The starting quality reflects the share of recognized escape sequences:
    //   all recognized        -> 100
    //   half or fewer         -> 0 or below
    //   linear in between.
    var quality = (100 * recognized - 100 * unrecognized) / (recognized + unrecognized);

    // Too little evidence lowers the score. Shifts count as evidence too, so
    // that KR is not penalized for having a single escape sequence but many
    // shifts.
    var evidence = recognized + shiftCount;
    if (evidence < 5) {
      quality -= (5 - evidence) * 10;
    }

    if (quality <= 0) {
      return null;
    }
    return new Match(det, this, quality);
  };
  /**
   * Recognizer for ISO-2022-JP.
   *
   * Each instance carries its own `name()` accessor and the table of escape
   * sequences that the `match` method inherited from ISO_2022 reads.
   */
  module.exports.ISO_2022_JP = function() {
    var ESC = 0x1b;

    this.name = function() {
      return 'ISO-2022-JP';
    };

    this.escapeSequences = [
      [ESC, 0x24, 0x28, 0x43], // KS X 1001:1992
      [ESC, 0x24, 0x28, 0x44], // JIS X 212-1990
      [ESC, 0x24, 0x40],       // JIS C 6226-1978
      [ESC, 0x24, 0x41],       // GB 2312-80
      [ESC, 0x24, 0x42],       // JIS X 208-1983
      [ESC, 0x26, 0x40],       // JIS X 208 1990, 1997
      [ESC, 0x28, 0x42],       // ASCII
      [ESC, 0x28, 0x48],       // JIS-Roman
      [ESC, 0x28, 0x49],       // Half-width katakana
      [ESC, 0x28, 0x4a],       // JIS-Roman
      [ESC, 0x2e, 0x41],       // ISO 8859-1
      [ESC, 0x2e, 0x46]        // ISO 8859-7
    ];
  };
  // Wire up prototype inheritance so instances pick up ISO_2022.prototype.match.
  util.inherits(module.exports.ISO_2022_JP, ISO_2022);
  /**
   * Recognizer for ISO-2022-KR.
   *
   * Each instance carries its own `name()` accessor and a one-entry escape
   * sequence table that the `match` method inherited from ISO_2022 reads.
   */
  module.exports.ISO_2022_KR = function() {
    var ESC = 0x1b;

    this.name = function() {
      return 'ISO-2022-KR';
    };

    this.escapeSequences = [
      [ESC, 0x24, 0x29, 0x43]
    ];
  };
  // Wire up prototype inheritance so instances pick up ISO_2022.prototype.match.
  util.inherits(module.exports.ISO_2022_KR, ISO_2022);
  /**
   * Recognizer for ISO-2022-CN.
   *
   * Each instance carries its own `name()` accessor and the table of escape
   * sequences that the `match` method inherited from ISO_2022 reads.
   */
  module.exports.ISO_2022_CN = function() {
    var ESC = 0x1b;

    this.name = function() {
      return 'ISO-2022-CN';
    };

    this.escapeSequences = [
      [ESC, 0x24, 0x29, 0x41], // GB 2312-80
      [ESC, 0x24, 0x29, 0x47], // CNS 11643-1992 Plane 1
      [ESC, 0x24, 0x2A, 0x48], // CNS 11643-1992 Plane 2
      [ESC, 0x24, 0x29, 0x45], // ISO-IR-165
      [ESC, 0x24, 0x2B, 0x49], // CNS 11643-1992 Plane 3
      [ESC, 0x24, 0x2B, 0x4A], // CNS 11643-1992 Plane 4
      [ESC, 0x24, 0x2B, 0x4B], // CNS 11643-1992 Plane 5
      [ESC, 0x24, 0x2B, 0x4C], // CNS 11643-1992 Plane 6
      [ESC, 0x24, 0x2B, 0x4D], // CNS 11643-1992 Plane 7
      [ESC, 0x4e],             // SS2
      [ESC, 0x4f]              // SS3
    ];
  };
  // Wire up prototype inheritance so instances pick up ISO_2022.prototype.match.
  util.inherits(module.exports.ISO_2022_CN, ISO_2022);