1
- import dayjs from 'dayjs' ;
2
- import fs from 'fs-extra' ;
1
+
3
2
import iconv from 'iconv-lite' ;
4
- import os from 'os' ;
5
- import path from 'path' ;
6
3
import * as log from './debug.js' ;
7
- import { strHasASCII , strHasHFKanaHira , strHasHiraKana , strOnlyASCII , strOnlyChinese , strOnlyJapanese , strOnlyJapaneseHan } from './unicode.js' ;
4
+ import { strHasASCII , strHasHFKanaHira , strHasHiraKana , strOnlyASCII , strOnlyChinese , strOnlyHangul , strOnlyJapanese , strOnlyJapaneseHan } from './unicode.js' ;
8
5
import { CHINESE_CHARS_3500 , MESSY_CJK_CHARS as MESSY_CJK_CHARS_ } from './unicode_data.js' ;
9
6
10
- // https://github.com/bnoordhuis/node-iconv/
7
+ // https://github.com/bnoordhuis/node-iconv/
11
8
const ENCODING_FROM = [
12
9
'SHIFT_JIS' ,
13
10
'GBK' ,
@@ -26,8 +23,6 @@ const ENCODING_TO = [
26
23
// 'EUC-KR',
27
24
]
28
25
29
- const ENCODING_TRY = [ 'SHIFT_JIS' , 'UTF8' ]
30
-
31
26
// Re-export of the MESSY_CJK_CHARS table imported from './unicode_data.js'
// (imported under the alias MESSY_CJK_CHARS_) so that consumers of this module
// can reference it directly. It is interpolated into REGEX_MESSY_CJK.
export const MESSY_CJK_CHARS = MESSY_CJK_CHARS_
32
27
33
28
// Unicode-mode regular expression whose single character class is built from
// MESSY_CJK_CHARS; it matches when a string contains at least one character of
// that set. Created without the `g` flag, so repeated `.test()` calls do not
// carry `lastIndex` state between invocations.
// NOTE(review): as written, the template places a literal space before the
// closing `]`, which would make a plain space part of the class — confirm
// whether that is intended.
export const REGEX_MESSY_CJK = new RegExp ( `[${ MESSY_CJK_CHARS } ]` , 'u' )
@@ -43,9 +38,6 @@ export function charUnique(str) {
43
38
return String . prototype . concat . call ( ...new Set ( str ) ) ;
44
39
}
45
40
46
- const nowDateStr = dayjs ( ) . format ( "YYYYMMDDHHmmss" ) ;
47
- const tempfile = path . join ( os . tmpdir ( ) , `z_mediac_log_${ nowDateStr } .txt` )
48
-
49
41
export function checkBadUnicode ( str ) {
50
42
const results = [ ]
51
43
if ( str . includes ( '?' ) || str . includes ( '\ufffd' ) ) {
@@ -76,6 +68,10 @@ export function checkBadUnicode(str) {
76
68
// 乱码标志 Unicode私有区
77
69
results . push ( [ true , 5 , `私有区` ] )
78
70
}
71
+ if ( / [ \ufb50 - \ufdff \ufe70 - \ufeff ] / u. test ( str ) ) {
72
+ // 乱码标志 阿拉伯字符
73
+ results . push ( [ true , 5 , `阿拉伯字符` ] )
74
+ }
79
75
if ( / [ \uff66 - \uff9d ] / u. test ( str ) ) {
80
76
// 暂时忽略,还比较常用
81
77
// 乱码标志 半角平假名片假名
@@ -96,8 +92,8 @@ export function hasBadCJKChar(str) {
96
92
return REGEX_MESSY_CJK . test ( str ) || REGEX_MESSY_CJK_EXT . test ( str )
97
93
}
98
94
99
- export function fixCJKEnc ( str ) {
100
- let results = fixCJKEncImpl ( str )
95
+ export function decodeText ( str ) {
96
+ let results = tryDecodeText ( str )
101
97
results = results . filter ( r => r [ 2 ] >= 0 ) . sort ( ( a , b ) => b [ 2 ] - a [ 2 ] )
102
98
log . debug ( '==================================' )
103
99
log . debug ( str )
@@ -109,110 +105,110 @@ export function fixCJKEnc(str) {
109
105
return results [ 0 ] || [ str , false , 0 , 'fallback' ] ;
110
106
}
111
107
112
- export function fixCJKEncImpl ( str ,
108
+ export function tryDecodeText ( str ,
113
109
fromEnc = ENCODING_FROM ,
114
110
toEnc = ENCODING_TO ,
115
111
threhold = 10 ) {
116
112
if ( str . includes ( '?' ) || str . includes ( '\ufffd' ) ) {
117
- return [ [ str , false , 0 , '信息丢失' , ' '] , ]
113
+ return [ [ str , false , 0 , '[乱码字符] ' ] , ]
118
114
}
119
115
116
+ fromEnc = fromEnc . map ( x => x . toLowerCase ( ) )
117
+ toEnc = toEnc . map ( x => x . toLowerCase ( ) )
118
+
120
119
let results = [ ]
121
120
if ( strOnlyASCII ( str ) ) {
122
121
// results.push([str, false, 0])
123
- return [ [ str , false , 0 , '全英文数字' , ' '] , ]
122
+ return [ [ str , false , 100 , '[ASCII] ' ] , ]
124
123
}
124
+ const messyUnicode = REGEX_MESSY_UNICODE . test ( str )
125
+ const messyCJK = REGEX_MESSY_CJK . test ( str )
126
+ const messyCJKExt = REGEX_MESSY_CJK_EXT . test ( str )
125
127
log . info ( '---------------------' )
126
- log . info ( 'fixCJKEnc' , str )
127
- if ( ! REGEX_MESSY_UNICODE . test ( str )
128
- && ! REGEX_MESSY_CJK . test ( str )
129
- && ! REGEX_MESSY_CJK_EXT . test ( str ) ) {
130
- if ( RE_CHARS_MOST_USED . test ( str ) ) {
131
- results . push ( [ str , false , 100 , '常用汉字0' , '' ] )
132
- }
133
- // else if (strOnlyChinese(str)) {
134
- // results.push([str, false, 99, '全中文01', ''])
135
- // }
136
- else if ( strHasHFKanaHira ( str ) ) {
137
- // 包含不用的全角半角平假名片假名
138
- results . push ( [ str , false , 65 , '含半角假名0' , '' ] )
139
- }
140
- else {
141
- // fs.appendFileSync(tempfile, str + '\n')
142
- return [ [ str , false , 0 , '忽略0' , '' ] , ]
143
- }
144
- } else {
128
+ log . info ( 'tryDecodeText' , str )
129
+ if ( messyUnicode || messyCJK || messyCJKExt ) {
145
130
if ( strOnlyChinese ( str ) && ! REGEX_MESSY_CJK_EXT . test ( str ) ) {
146
- return [ [ str , false , 0 , `全中文02` , ` ${ REGEX_MESSY_UNICODE . test ( str ) } `] , ]
131
+ return [ [ str , false , 100 , `[全中文] ` ] , ]
147
132
}
148
133
}
149
- if ( ( strHasHiraKana ( str ) || strHasASCII ( str ) )
150
- && strOnlyJapanese ( str ) && ! REGEX_MESSY_CJK . test ( str ) ) {
151
- results . push ( [ str , false , 99 , '全日文01' , '' ] )
134
+ if ( RE_CHARS_MOST_USED . test ( str ) ) {
135
+ results . push ( [ str , false , 100 , '[常用汉字]' ] )
136
+ }
137
+ else if ( strHasHFKanaHira ( str ) ) {
138
+ // 包含不用的全角半角平假名片假名
139
+ results . push ( [ str , false , 65 , '[半角假名]' ] )
140
+ }
141
+ else {
142
+ // fs.appendFileSync(tempfile, str + '\n')
143
+ return [ [ str , false , 0 , '[无乱码]' , '' ] , ]
144
+ }
145
+
146
+ if ( ! ! REGEX_MESSY_CJK . test ( str )
147
+ && ( strHasHiraKana ( str ) || strHasASCII ( str ) )
148
+ && strOnlyJapanese ( str ) ) {
149
+ results . push ( [ str , false , 99 , '[全日文1]' ] )
152
150
}
153
151
else if ( strOnlyJapanese ( str ) ) {
154
- results . push ( [ str , false , 80 , '全日文02' , ' '] )
152
+ results . push ( [ str , false , 80 , '[全日文2] ' ] )
155
153
}
156
- // log.showRed(str)
157
- // log.show(Array.from(str).map(c => c.codePointAt(0).toString(16)).join(' '))
154
+
158
155
for ( const enc1 of fromEnc ) {
159
156
for ( const enc2 of toEnc ) {
157
+ // 忽略解码编码相同的情况
160
158
if ( enc1 === enc2 ) { continue }
161
159
try {
162
160
const strBuffer = iconv . encode ( str , enc1 )
163
161
let strDecoded = iconv . decode ( strBuffer , enc2 )
164
162
const badDecoded = checkBadUnicode ( strDecoded )
165
163
// const strCleaned = strDecoded.replaceAll(/[\ufffd\u0020]/ugi, '')
166
- log . info ( enc1 , enc2 , strDecoded , badDecoded )
164
+ log . debug ( enc1 , enc2 , strDecoded , badDecoded )
167
165
// 如果含有乱码字符
168
166
if ( badDecoded ?. length > 0 ) {
169
167
for ( const item of badDecoded ) {
170
168
results . push ( [ strDecoded , ...item , `${ enc1 } =>${ enc2 } ` ] )
171
169
}
172
-
173
170
continue ;
174
171
}
175
- // log.showRed('========')
176
- // log.showRed(str)
177
- // log.showGreen(Array.from(str).map(c => c.codePointAt(0).toString(16)))
178
- // log.show(strDecoded, enc1, enc2)
179
-
180
172
const onlyASCII = strOnlyASCII ( strDecoded )
181
173
const onlyCN = strOnlyChinese ( strDecoded )
182
174
const onlyJP = strOnlyJapanese ( strDecoded )
183
175
const onlyJPHan = strOnlyJapaneseHan ( strDecoded )
176
+ const onlyKR = strOnlyHangul ( strDecoded )
184
177
const hasHiraKana = strHasHiraKana ( strDecoded )
185
178
const hasHFHiraKana = strHasHFKanaHira ( strDecoded )
186
179
const messyUnicode = REGEX_MESSY_UNICODE . test ( strDecoded )
187
180
const messyCJK = REGEX_MESSY_CJK . test ( strDecoded )
188
181
const messyCJKExt = REGEX_MESSY_CJK_EXT . test ( strDecoded )
189
182
190
- log . debug ( strDecoded , onlyASCII , onlyCN , onlyJP , onlyJPHan , messyCJK )
191
- log . debug ( strDecoded , hasHiraKana , hasHFHiraKana , messyUnicode , messyCJK )
183
+ log . debug ( strDecoded , 'cn' , onlyCN , 'jp' , onlyJP , 'jhan' , onlyJPHan , 'kr' , onlyKR )
184
+ log . debug ( strDecoded , 'hk' , hasHiraKana , 'hf' , hasHFHiraKana , 'mu' , messyUnicode , 'mc' , messyCJK )
192
185
193
186
if ( onlyASCII && ! strDecoded . includes ( '?' ) ) {
194
- results . push ( [ strDecoded , true , 99 , `全英文数字 ` , `${ enc1 } =>${ enc2 } ` ] )
187
+ results . push ( [ strDecoded , true , 99 , `ASCII ` , `${ enc1 } =>${ enc2 } ` ] )
195
188
break
196
189
}
197
190
if ( RE_CHARS_MOST_USED . test ( strDecoded ) ) {
198
191
results . push ( [ strDecoded , true , 99 , `常用汉字` , `${ enc1 } =>${ enc2 } ` ] )
199
192
break
200
193
}
201
- if ( messyCJK || messyCJKExt ) {
202
- results . push ( [ strDecoded , true , 50 , `CJK罕见` , `${ enc1 } =>${ enc2 } ` ] )
203
-
204
- }
205
194
if ( onlyJP ) {
206
- if ( strHasHiraKana ( strDecoded ) || onlyJPHan ) {
195
+ if ( ! strHasHiraKana ( strDecoded ) && ! onlyJPHan ) {
207
196
results . push ( [ strDecoded , true , 78 , `日文字符` , `${ enc1 } =>${ enc2 } ` ] )
208
197
}
209
198
}
210
199
else if ( onlyCN ) {
211
200
results . push ( [ strDecoded , true , 76 , `中文字符` , `${ enc1 } =>${ enc2 } ` ] )
212
201
}
213
- else if ( strHasHFKanaHira || strHasHiraKana ) {
202
+ else if ( hasHiraKana || hasHFHiraKana ) {
214
203
results . push ( [ strDecoded , true , 65 , `含日文假名` , ` ${ enc1 } =>${ enc2 } ` ] )
215
204
}
205
+ else if ( onlyKR ) {
206
+ results . push ( [ strDecoded , true , 62 , `韩文字符` , `${ enc1 } =>${ enc2 } ` ] )
207
+ }
208
+ else if ( messyCJK || messyCJKExt ) {
209
+ results . push ( [ strDecoded , true , 51 , `生僻字` , `${ enc1 } =>${ enc2 } ` ] )
210
+ // continue
211
+ }
216
212
else {
217
213
results . push ( [ strDecoded , true , 60 , `正常转换 ${ onlyCN } ${ onlyJP } ` , ` ${ enc1 } =>${ enc2 } ` ] )
218
214
}
@@ -222,7 +218,7 @@ export function fixCJKEncImpl(str,
222
218
}
223
219
}
224
220
}
225
- results . push ( [ str , false , 70 , '原始字符串 ' ] )
221
+ results . push ( [ str , false , 70 , '原始值 ' ] )
226
222
results = results . filter ( r => r [ 2 ] >= threhold ) . sort ( ( a , b ) => b [ 2 ] - a [ 2 ] )
227
223
log . debug ( results . slice ( 3 ) )
228
224
return results ;
0 commit comments