@@ -4,7 +4,7 @@ import iconv from 'iconv-lite';
4
4
import os from 'os' ;
5
5
import path from 'path' ;
6
6
import * as log from './debug.js' ;
7
- import { strHasASCII , strHasHFKanaHira , strHasHiraKana , strOnlyASCII , strOnlyChinese , strOnlyJapanese } from './unicode.js' ;
7
+ import { strHasASCII , strHasHFKanaHira , strHasHiraKana , strOnlyASCII , strOnlyChinese , strOnlyJapanese , strOnlyJapaneseHan } from './unicode.js' ;
8
8
import { CHINESE_CHARS_3500 , MESSY_CJK_CHARS as MESSY_CJK_CHARS_ } from './unicode_data.js' ;
9
9
10
10
// https://github.com/bnoordhuis/node-iconv/
@@ -32,7 +32,7 @@ export const MESSY_CJK_CHARS = MESSY_CJK_CHARS_
32
32
33
33
export const REGEX_MESSY_CJK = new RegExp ( `[${ MESSY_CJK_CHARS } ]` , 'u' )
34
34
35
- export const REGEX_MESSY_CJK_EXT = / [ \u8701 - \u883f \u9200 - \u9484 ] / u //生僻字: 虫字旁 金字旁
35
+ export const REGEX_MESSY_CJK_EXT = / [ \u8720 - \u883f \u9300 - \u9484 ] / u //生僻字: 虫字旁 金字旁
36
36
37
37
export const REGEX_MESSY_UNICODE = / [ \u007f - \u00a0 \u00c0 - \u017f \u0400 - \u1cff \u2070 - \u24ff \u0e00 - \u0e7f \u3400 - \u4dbf \uac00 - \uf8ff \ufe30 - \ufe4f \ufff0 - \uffff ] / u
38
38
@@ -52,25 +52,34 @@ export function checkBadUnicode(str) {
52
52
// 乱码标志 问号和黑问号
53
53
results . push ( [ true , 0 , `非法字符` ] )
54
54
}
55
- if ( / [ \u00c0 - \u00d6 \u00d8 - \u024f \u3100 - \u312f \ua720 - \ua7ff \uab30 - \uabff ] / u. test ( str ) ) {
55
+ if ( / [ \u00c0 - \u00d6 \u00d8 - \u024f \u3100 - \u312f ] / u. test ( str ) ) {
56
56
// 乱码标志 拉丁字母扩展 注音符号
57
57
results . push ( [ true , 2 , `拉丁字母扩展` ] )
58
58
}
59
- if ( / [ \u0530 - \u1cff ] / u. test ( str ) ) {
60
- // 乱码标志 小众语言字母符号
61
- results . push ( [ true , 3 , `小众语言符号 ` ] )
59
+ if ( / [ \u3300 - \u33ff ] / u. test ( str ) ) {
60
+ // 乱码标志 特殊字符
61
+ results . push ( [ true , 4 , `CJK特殊字符 ` ] )
62
62
}
63
- if ( / [ \u3300 - \u3357 ] / u. test ( str ) ) {
64
- // 乱码标志 方块片假名
65
- results . push ( [ true , 4 , `方块片假名` ] )
63
+ if ( / [ \u0370 - \u1cff ] / u. test ( str ) ) {
64
+ // 乱码标志 小众语言符号
65
+ results . push ( [ true , 3 , `小众语言A` ] )
66
+ }
67
+ if ( / [ \ua000 - \ua7ff \uab30 - \uabff \ud7b0 - \ud7ff ] / u. test ( str ) ) {
68
+ // 乱码标志 小众语言符号
69
+ results . push ( [ true , 4 , `小众语言B` ] )
70
+ }
71
+ if ( / [ \ud800 - \udfff ] / u. test ( str ) ) {
72
+ // Garble marker: surrogate code units. With the /u flag a well-formed surrogate pair is read as one code point above U+FFFF, so this class only matches lone (unpaired) surrogates.
73
+ results . push ( [ true , 4 , `代理对` ] )
66
74
}
67
75
if ( / [ \ue000 - \uf8ff ] / u. test ( str ) ) {
68
76
// 乱码标志 Unicode私有区
69
77
results . push ( [ true , 5 , `私有区` ] )
70
78
}
71
79
if ( / [ \uff66 - \uff9d ] / u. test ( str ) ) {
80
+ // 暂时忽略,还比较常用
72
81
// 乱码标志 半角平假名片假名
73
- results . push ( [ true , 6 , `半角假名` ] )
82
+ // results.push([true, 6, `半角假名`])
74
83
}
75
84
if ( / [ 㼿 ] / u. test ( str ) ) {
76
85
// 乱码标志 特殊生僻字
@@ -113,6 +122,8 @@ export function fixCJKEncImpl(str,
113
122
// results.push([str, false, 0])
114
123
return [ [ str , false , 0 , '全英文数字' , '' ] , ]
115
124
}
125
+ log . info ( '---------------------' )
126
+ log . info ( 'fixCJKEnc' , str )
116
127
if ( ! REGEX_MESSY_UNICODE . test ( str )
117
128
&& ! REGEX_MESSY_CJK . test ( str )
118
129
&& ! REGEX_MESSY_CJK_EXT . test ( str ) ) {
@@ -152,14 +163,15 @@ export function fixCJKEncImpl(str,
152
163
let strDecoded = iconv . decode ( strBuffer , enc2 )
153
164
const badDecoded = checkBadUnicode ( strDecoded )
154
165
// const strCleaned = strDecoded.replaceAll(/[\ufffd\u0020]/ugi, '')
166
+ log . info ( enc1 , enc2 , strDecoded , badDecoded )
155
167
// 如果含有乱码字符
156
168
if ( badDecoded ?. length > 0 ) {
157
169
for ( const item of badDecoded ) {
158
170
results . push ( [ strDecoded , ...item , `${ enc1 } =>${ enc2 } ` ] )
159
171
}
172
+
160
173
continue ;
161
174
}
162
-
163
175
// log.showRed('========')
164
176
// log.showRed(str)
165
177
// log.showGreen(Array.from(str).map(c => c.codePointAt(0).toString(16)))
@@ -168,8 +180,16 @@ export function fixCJKEncImpl(str,
168
180
const onlyASCII = strOnlyASCII ( strDecoded )
169
181
const onlyCN = strOnlyChinese ( strDecoded )
170
182
const onlyJP = strOnlyJapanese ( strDecoded )
183
+ const onlyJPHan = strOnlyJapaneseHan ( strDecoded )
184
+ const hasHiraKana = strHasHiraKana ( strDecoded )
185
+ const hasHFHiraKana = strHasHFKanaHira ( strDecoded )
171
186
const messyUnicode = REGEX_MESSY_UNICODE . test ( strDecoded )
172
187
const messyCJK = REGEX_MESSY_CJK . test ( strDecoded )
188
+ const messyCJKExt = REGEX_MESSY_CJK_EXT . test ( strDecoded )
189
+
190
+ log . debug ( strDecoded , onlyASCII , onlyCN , onlyJP , onlyJPHan , messyCJK )
191
+ log . debug ( strDecoded , hasHiraKana , hasHFHiraKana , messyUnicode , messyCJKExt ) // messyCJK is already in the previous debug line; log the EXT flag instead of repeating it
192
+
173
193
if ( onlyASCII && ! strDecoded . includes ( '?' ) ) {
174
194
results . push ( [ strDecoded , true , 99 , `全英文数字` , `${ enc1 } =>${ enc2 } ` ] )
175
195
break
@@ -178,25 +198,21 @@ export function fixCJKEncImpl(str,
178
198
results . push ( [ strDecoded , true , 99 , `常用汉字` , `${ enc1 } =>${ enc2 } ` ] )
179
199
break
180
200
}
181
- log . debug ( strDecoded , onlyCN , onlyJP , messyUnicode , messyCJK )
201
+ if ( messyCJK || messyCJKExt ) {
202
+ results . push ( [ strDecoded , true , 50 , `CJK罕见` , `${ enc1 } =>${ enc2 } ` ] )
182
203
183
- if ( onlyJP && strHasHiraKana ( strDecoded ) ) {
184
- results . push ( [ strDecoded , true , 78 , `日文字符` , `${ enc1 } =>${ enc2 } ` ] )
204
+ }
205
+ if ( onlyJP ) {
206
+ if ( strHasHiraKana ( strDecoded ) || onlyJPHan ) {
207
+ results . push ( [ strDecoded , true , 78 , `日文字符` , `${ enc1 } =>${ enc2 } ` ] )
208
+ }
185
209
}
186
210
else if ( onlyCN ) {
187
211
results . push ( [ strDecoded , true , 76 , `中文字符` , `${ enc1 } =>${ enc2 } ` ] )
188
212
}
189
- else if ( ! messyUnicode && ! messyCJK
190
- && [ enc1 , enc2 ] . includes ( 'SHIFT_JIS' )
191
- && [ enc1 , enc2 ] . includes ( 'UTF8' ) ) {
192
- results . push ( [ strDecoded , true , 74 , `无特殊字符` , `${ enc1 } =>${ enc2 } ` ] )
213
+ else if ( strHasHFKanaHira || strHasHiraKana ) {
214
+ results . push ( [ strDecoded , true , 65 , `含日文假名` , ` ${ enc1 } =>${ enc2 } ` ] )
193
215
}
194
- // else if (messyCJK) {
195
- // results.push([strDecoded, true, 51, `含特殊汉字`, `${enc1}=>${enc2}`])
196
- // }
197
- // else if (messyUnicode) {
198
- // results.push([strDecoded, true, 52, `含特殊符号`, `${enc1}=>${enc2}`])
199
- // }
200
216
else {
201
217
results . push ( [ strDecoded , true , 60 , `正常转换 ${ onlyCN } ${ onlyJP } ` , ` ${ enc1 } =>${ enc2 } ` ] )
202
218
}
0 commit comments