Skip to content

Commit 6926b45

Browse files
committed
add cmd decode (guess encoding)
1 parent a4a5544 commit 6926b45

File tree

6 files changed

+167
-153
lines changed

6 files changed

+167
-153
lines changed

cmd/cmd_decode.js

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
2+
import chardet from 'chardet';
3+
import * as log from '../lib/debug.js';
4+
import * as enc from '../lib/encoding.js';
5+
import * as unicode from '../lib/unicode.js';
6+
7+
8+
// Candidate encodings tried when no explicit --from-enc/--to-enc is given.
const ENC_LIST = [
    'ISO-8859-1', // Latin-1, frequent mojibake source
    'UTF8',
    'UTF-16',
    'GBK',        // Simplified Chinese
    'BIG5',       // Traditional Chinese
    'SHIFT_JIS',  // Japanese
    'EUC-JP',     // Japanese
    'EUC-KR',     // Korean
]
18+
19+
export { aliases, builder, command, describe, handler };
20+
const command = "decode <strings...>"
21+
const aliases = ["dc"]
22+
const describe = 'Decode text with messy or invalid chars'
23+
24+
/**
 * Configure yargs options for the decode command.
 * @param {object} ya - yargs instance
 * @param {boolean} helpOrVersionSet - supplied by yargs, unused here
 * @returns {object} the configured yargs instance
 */
const builder = function addOptions(ya, helpOrVersionSet) {
    return ya
        .positional('strings', {
            describe: 'string list to decode',
            type: 'string',
        })
        // used to repair garbled (mojibake) file names
        .option("from-enc", {
            alias: "f",
            // 'choices' is not a yargs type; declare a string option that
            // is restricted by the `choices` list instead
            type: "string",
            choices: ['utf8', 'gbk', 'shift_jis', 'big5', 'euc-kr'],
            description: "from encoding name eg. utf8|gbk|shift_jis",
        })
        .option("to-enc", {
            alias: "t",
            type: "string",
            choices: ['utf8', 'gbk', 'shift_jis', 'big5', 'euc-kr'],
            description: "to encoding name eg. utf8|gbk|shift_jis",
        })
    // NOTE: removed the stray trailing `.po` property access, which made
    // this builder return undefined instead of the yargs instance
}
44+
45+
/**
 * Command handler: try to decode each input string against candidate
 * encoding pairs and print diagnostics plus the best candidate.
 * @param {object} argv - parsed yargs arguments (strings, fromEnc, toEnc)
 * @throws {Error} when no input strings are provided
 */
const handler = async function cmdDecode(argv) {
    const logTag = "cmdDecode";
    log.info(logTag, 'Args:', argv);
    const strArgs = argv.strings;
    // original check `strArgs?.length === 0` silently passed when
    // `strings` was undefined; reject missing input as well
    if (!strArgs || strArgs.length === 0) {
        throw new Error(`text input required`);
    }
    // an explicit --from-enc/--to-enc narrows the candidate list to one
    const fromEnc = argv.fromEnc?.length > 0 ? [argv.fromEnc] : ENC_LIST;
    const toEnc = argv.toEnc?.length > 0 ? [argv.toEnc] : ENC_LIST;
    // verbose mode shows every candidate, otherwise only confident ones
    const threshold = log.isVerbose() ? 1 : 50;
    log.show(logTag, `Input:`, strArgs)
    log.show(logTag, `fromEnc:`, JSON.stringify(fromEnc))
    log.show(logTag, `toEnc:`, JSON.stringify(toEnc))

    for (const str of strArgs) {
        log.show(logTag, 'TryDecoding:', [str])
        const results = decodeText(str, fromEnc, toEnc, threshold)
        results.forEach(showResults)
        log.show('INPUT:', [str, str.length])
        // decodeText returns candidates reversed (top-ranked last),
        // so pop() yields the best candidate; fixed 'OUPUT' typo
        log.show('OUTPUT:', results.pop())
        console.log()
    }
}
68+
69+
/**
 * Decode a string against candidate from/to encoding pairs.
 * Delegates to enc.tryDecodeText, then returns the candidate list in
 * reverse order so the last element is the top-ranked candidate.
 */
function decodeText(str, fromEnc = ENC_LIST, toEnc = ENC_LIST, threhold = 50) {
    const candidates = enc.tryDecodeText(str, fromEnc, toEnc, threhold);
    candidates.reverse();
    return candidates;
}
73+
74+
/**
 * Print diagnostic details for one decode candidate: chardet guesses,
 * raw characters and code points, and script/messy-character checks.
 * @param {Array} r - candidate tuple; r[0] is the decoded string
 */
function showResults(r) {
    log.info(`-`)
    const str = r[0]
    const print = (label, value) => log.info(label.padEnd(16, ' '), value)
    log.show('Result:', str.padEnd(16, ' '), r.slice(1))
    // only report chardet guesses with reasonable confidence
    const guesses = chardet.analyse(Buffer.from(str)).filter(ct => ct.confidence >= 70)
    if (guesses?.length > 0) {
        print('Encoding', guesses)
    }
    const chars = Array.from(str)
    print('String', chars)
    print('Unicode', chars.map(c => c.codePointAt(0).toString(16)))
    if (enc.checkBadUnicode(str)?.length > 0) {
        log.info(`badUnicode=true`)
    }
    log.info(`MESSY_UNICODE=${enc.REGEX_MESSY_UNICODE.test(str)}`,
        `MESSY_CJK=${enc.REGEX_MESSY_CJK.test(str)}`,
        `MESSY_CJK_EXT=${enc.REGEX_MESSY_CJK_EXT.test(str)}`)
    log.info(`OnlyJapanese=${unicode.strOnlyJapanese(str)}`,
        `OnlyJpHan=${unicode.strOnlyJapaneseHan(str)}`,
        `HasHiraKana=${unicode.strHasHiraKana(str)}`)
    log.info(`HasHangul=${unicode.strHasHangul(str)}`,
        `OnlyHangul=${unicode.strOnlyHangul(str)}`)
    log.info(`HasChinese=${unicode.strHasChinese(str)}`,
        `OnlyChinese=${unicode.strOnlyChinese(str)}`,
        `OnlyChn3500=${enc.RE_CHARS_MOST_USED.test(str)}`)
}

cmd/cmd_fixname.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -187,11 +187,11 @@ async function fixFileName(f) {
187187
if (argv.encoding) {
188188
// 执行文件路径乱码修复操作
189189
// 对路径进行中日韩文字编码修复
190-
let [fs, ft] = enc.fixCJKEnc(oldBase);
190+
let [fs, ft] = enc.decodeText(oldBase);
191191
oldBase = fs.trim();
192192
// 将目录路径分割,并对每个部分进行编码修复
193193
const dirNamesFixed = oldDir.split(path.sep).map(s => {
194-
let [rs, rt] = enc.fixCJKEnc(s)
194+
let [rs, rt] = enc.decodeText(s)
195195
return rs.trim();
196196
});
197197
// 重新组合修复后的目录路径

lib/encoding.js

Lines changed: 55 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,10 @@
1-
import dayjs from 'dayjs';
2-
import fs from 'fs-extra';
1+
32
import iconv from 'iconv-lite';
4-
import os from 'os';
5-
import path from 'path';
63
import * as log from './debug.js';
7-
import { strHasASCII, strHasHFKanaHira, strHasHiraKana, strOnlyASCII, strOnlyChinese, strOnlyJapanese, strOnlyJapaneseHan } from './unicode.js';
4+
import { strHasASCII, strHasHFKanaHira, strHasHiraKana, strOnlyASCII, strOnlyChinese, strOnlyHangul, strOnlyJapanese, strOnlyJapaneseHan } from './unicode.js';
85
import { CHINESE_CHARS_3500, MESSY_CJK_CHARS as MESSY_CJK_CHARS_ } from './unicode_data.js';
96

10-
// https://github.com/bnoordhuis/node-iconv/
7+
// https://github.com/bnoordhuis/node-iconv/
118
const ENCODING_FROM = [
129
'SHIFT_JIS',
1310
'GBK',
@@ -26,8 +23,6 @@ const ENCODING_TO = [
2623
// 'EUC-KR',
2724
]
2825

29-
const ENCODING_TRY = ['SHIFT_JIS', 'UTF8']
30-
3126
export const MESSY_CJK_CHARS = MESSY_CJK_CHARS_
3227

3328
export const REGEX_MESSY_CJK = new RegExp(`[${MESSY_CJK_CHARS}]`, 'u')
@@ -43,9 +38,6 @@ export function charUnique(str) {
4338
return String.prototype.concat.call(...new Set(str));
4439
}
4540

46-
const nowDateStr = dayjs().format("YYYYMMDDHHmmss");
47-
const tempfile = path.join(os.tmpdir(), `z_mediac_log_${nowDateStr}.txt`)
48-
4941
export function checkBadUnicode(str) {
5042
const results = []
5143
if (str.includes('?') || str.includes('\ufffd')) {
@@ -76,6 +68,10 @@ export function checkBadUnicode(str) {
7668
// 乱码标志 Unicode私有区
7769
results.push([true, 5, `私有区`])
7870
}
71+
if (/[\ufb50-\ufdff\ufe70-\ufeff]/u.test(str)) {
72+
// 乱码标志 阿拉伯字符
73+
results.push([true, 5, `阿拉伯字符`])
74+
}
7975
if (/[\uff66-\uff9d]/u.test(str)) {
8076
// 暂时忽略,还比较常用
8177
// 乱码标志 半角平假名片假名
@@ -96,8 +92,8 @@ export function hasBadCJKChar(str) {
9692
return REGEX_MESSY_CJK.test(str) || REGEX_MESSY_CJK_EXT.test(str)
9793
}
9894

99-
export function fixCJKEnc(str) {
100-
let results = fixCJKEncImpl(str)
95+
export function decodeText(str) {
96+
let results = tryDecodeText(str)
10197
results = results.filter(r => r[2] >= 0).sort((a, b) => b[2] - a[2])
10298
log.debug('==================================')
10399
log.debug(str)
@@ -109,110 +105,110 @@ export function fixCJKEnc(str) {
109105
return results[0] || [str, false, 0, 'fallback'];
110106
}
111107

112-
export function fixCJKEncImpl(str,
108+
export function tryDecodeText(str,
113109
fromEnc = ENCODING_FROM,
114110
toEnc = ENCODING_TO,
115111
threhold = 10) {
116112
if (str.includes('?') || str.includes('\ufffd')) {
117-
return [[str, false, 0, '信息丢失', ''],]
113+
return [[str, false, 0, '[乱码字符]'],]
118114
}
119115

116+
fromEnc = fromEnc.map(x => x.toLowerCase())
117+
toEnc = toEnc.map(x => x.toLowerCase())
118+
120119
let results = []
121120
if (strOnlyASCII(str)) {
122121
// results.push([str, false, 0])
123-
return [[str, false, 0, '全英文数字', ''],]
122+
return [[str, false, 100, '[ASCII]'],]
124123
}
124+
const messyUnicode = REGEX_MESSY_UNICODE.test(str)
125+
const messyCJK = REGEX_MESSY_CJK.test(str)
126+
const messyCJKExt = REGEX_MESSY_CJK_EXT.test(str)
125127
log.info('---------------------')
126-
log.info('fixCJKEnc', str)
127-
if (!REGEX_MESSY_UNICODE.test(str)
128-
&& !REGEX_MESSY_CJK.test(str)
129-
&& !REGEX_MESSY_CJK_EXT.test(str)) {
130-
if (RE_CHARS_MOST_USED.test(str)) {
131-
results.push([str, false, 100, '常用汉字0', ''])
132-
}
133-
// else if (strOnlyChinese(str)) {
134-
// results.push([str, false, 99, '全中文01', ''])
135-
// }
136-
else if (strHasHFKanaHira(str)) {
137-
// 包含不用的全角半角平假名片假名
138-
results.push([str, false, 65, '含半角假名0', ''])
139-
}
140-
else {
141-
// fs.appendFileSync(tempfile, str + '\n')
142-
return [[str, false, 0, '忽略0', ''],]
143-
}
144-
} else {
128+
log.info('tryDecodeText', str)
129+
if (messyUnicode || messyCJK || messyCJKExt) {
145130
if (strOnlyChinese(str) && !REGEX_MESSY_CJK_EXT.test(str)) {
146-
return [[str, false, 0, `全中文02`, `${REGEX_MESSY_UNICODE.test(str)}`],]
131+
return [[str, false, 100, `[全中文]`],]
147132
}
148133
}
149-
if ((strHasHiraKana(str) || strHasASCII(str))
150-
&& strOnlyJapanese(str) && !REGEX_MESSY_CJK.test(str)) {
151-
results.push([str, false, 99, '全日文01', ''])
134+
if (RE_CHARS_MOST_USED.test(str)) {
135+
results.push([str, false, 100, '[常用汉字]'])
136+
}
137+
else if (strHasHFKanaHira(str)) {
138+
// 包含不用的全角半角平假名片假名
139+
results.push([str, false, 65, '[半角假名]'])
140+
}
141+
else {
142+
// fs.appendFileSync(tempfile, str + '\n')
143+
return [[str, false, 0, '[无乱码]', ''],]
144+
}
145+
146+
if (!!REGEX_MESSY_CJK.test(str)
147+
&& (strHasHiraKana(str) || strHasASCII(str))
148+
&& strOnlyJapanese(str)) {
149+
results.push([str, false, 99, '[全日文1]'])
152150
}
153151
else if (strOnlyJapanese(str)) {
154-
results.push([str, false, 80, '全日文02', ''])
152+
results.push([str, false, 80, '[全日文2]'])
155153
}
156-
// log.showRed(str)
157-
// log.show(Array.from(str).map(c => c.codePointAt(0).toString(16)).join(' '))
154+
158155
for (const enc1 of fromEnc) {
159156
for (const enc2 of toEnc) {
157+
// 忽略解码编码相同的情况
160158
if (enc1 === enc2) { continue }
161159
try {
162160
const strBuffer = iconv.encode(str, enc1)
163161
let strDecoded = iconv.decode(strBuffer, enc2)
164162
const badDecoded = checkBadUnicode(strDecoded)
165163
// const strCleaned = strDecoded.replaceAll(/[\ufffd\u0020]/ugi, '')
166-
log.info(enc1, enc2, strDecoded, badDecoded)
164+
log.debug(enc1, enc2, strDecoded, badDecoded)
167165
// 如果含有乱码字符
168166
if (badDecoded?.length > 0) {
169167
for (const item of badDecoded) {
170168
results.push([strDecoded, ...item, `${enc1}=>${enc2}`])
171169
}
172-
173170
continue;
174171
}
175-
// log.showRed('========')
176-
// log.showRed(str)
177-
// log.showGreen(Array.from(str).map(c => c.codePointAt(0).toString(16)))
178-
// log.show(strDecoded, enc1, enc2)
179-
180172
const onlyASCII = strOnlyASCII(strDecoded)
181173
const onlyCN = strOnlyChinese(strDecoded)
182174
const onlyJP = strOnlyJapanese(strDecoded)
183175
const onlyJPHan = strOnlyJapaneseHan(strDecoded)
176+
const onlyKR = strOnlyHangul(strDecoded)
184177
const hasHiraKana = strHasHiraKana(strDecoded)
185178
const hasHFHiraKana = strHasHFKanaHira(strDecoded)
186179
const messyUnicode = REGEX_MESSY_UNICODE.test(strDecoded)
187180
const messyCJK = REGEX_MESSY_CJK.test(strDecoded)
188181
const messyCJKExt = REGEX_MESSY_CJK_EXT.test(strDecoded)
189182

190-
log.debug(strDecoded, onlyASCII, onlyCN, onlyJP, onlyJPHan, messyCJK)
191-
log.debug(strDecoded, hasHiraKana, hasHFHiraKana, messyUnicode, messyCJK)
183+
log.debug(strDecoded, 'cn', onlyCN, 'jp', onlyJP, 'jhan', onlyJPHan, 'kr', onlyKR)
184+
log.debug(strDecoded, 'hk', hasHiraKana, 'hf', hasHFHiraKana, 'mu', messyUnicode, 'mc', messyCJK)
192185

193186
if (onlyASCII && !strDecoded.includes('?')) {
194-
results.push([strDecoded, true, 99, `全英文数字`, `${enc1}=>${enc2}`])
187+
results.push([strDecoded, true, 99, `ASCII`, `${enc1}=>${enc2}`])
195188
break
196189
}
197190
if (RE_CHARS_MOST_USED.test(strDecoded)) {
198191
results.push([strDecoded, true, 99, `常用汉字`, `${enc1}=>${enc2}`])
199192
break
200193
}
201-
if (messyCJK || messyCJKExt) {
202-
results.push([strDecoded, true, 50, `CJK罕见`, `${enc1}=>${enc2}`])
203-
204-
}
205194
if (onlyJP) {
206-
if (strHasHiraKana(strDecoded) || onlyJPHan) {
195+
if (!strHasHiraKana(strDecoded) && !onlyJPHan) {
207196
results.push([strDecoded, true, 78, `日文字符`, `${enc1}=>${enc2}`])
208197
}
209198
}
210199
else if (onlyCN) {
211200
results.push([strDecoded, true, 76, `中文字符`, `${enc1}=>${enc2}`])
212201
}
213-
else if (strHasHFKanaHira || strHasHiraKana) {
202+
else if (hasHiraKana || hasHFHiraKana) {
214203
results.push([strDecoded, true, 65, `含日文假名`, ` ${enc1}=>${enc2}`])
215204
}
205+
else if (onlyKR) {
206+
results.push([strDecoded, true, 62, `韩文字符`, `${enc1}=>${enc2}`])
207+
}
208+
else if (messyCJK || messyCJKExt) {
209+
results.push([strDecoded, true, 51, `生僻字`, `${enc1}=>${enc2}`])
210+
// continue
211+
}
216212
else {
217213
results.push([strDecoded, true, 60, `正常转换 ${onlyCN} ${onlyJP}`, ` ${enc1}=>${enc2}`])
218214
}
@@ -222,7 +218,7 @@ export function fixCJKEncImpl(str,
222218
}
223219
}
224220
}
225-
results.push([str, false, 70, '原始字符串'])
221+
results.push([str, false, 70, '原始值'])
226222
results = results.filter(r => r[2] >= threhold).sort((a, b) => b[2] - a[2])
227223
log.debug(results.slice(3))
228224
return results;

lib/unicode.js

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ export const strHasJapanese = (str) => REGEX_JAPANESE.test(str);
4444
export const strOnlyJapanese = (str) => REGEX_JAPANESE_ONLY.test(str);
4545

4646
// 导出一个正则表达式,用于判断字符串中是否包含平假名或片假名
47-
export const REGEX_HAS_HIRA_OR_KANA = /[\p{sc=Hira}\p{sc=Kana}]/u;
// Matches any Hiragana (U+3040–U+309F) or Katakana (U+30A0–U+30FF) character.
// Fixed: the original class was [\u3040-\u30ff}] — the stray '}' made the
// regex also match a literal '}' character.
export const REGEX_HAS_HIRA_OR_KANA = /[\u3040-\u30ff]/u;
4848
/**
4949
* 判断给定字符串中是否包含平假名或片假名
5050
* @param {string} str 需要进行判断的字符串
@@ -53,7 +53,7 @@ export const REGEX_HAS_HIRA_OR_KANA = /[\p{sc=Hira}\p{sc=Kana}]/u;
5353
export const strHasHiraKana = (str) => REGEX_HAS_HIRA_OR_KANA.test(str);
5454

5555
// 导出一个正则表达式,用于判断字符串是否只包含平假名或片假名
56-
export const REGEX_ONLY_HIRA_OR_KANA = /^[\p{sc=Hira}\p{sc=Kana}]+$/u;
56+
export const REGEX_ONLY_HIRA_OR_KANA = /^[\u3040-\u30ff]+$/u;
5757
/**
5858
* 判断给定字符串是否只包含平假名或片假名
5959
* @param {string} str 需要进行判断的字符串
@@ -111,7 +111,7 @@ export const REGEX_UNICODE_HAN_ONLY = /^[\p{sc=Hani}]+$/u;
111111
*/
112112
export const strOnlyHani = (str) => REGEX_UNICODE_HAN_ONLY.test(str);
113113

114-
// 日文半角和全角平假名片假名,一半不会用
114+
// 日文半角和全角平假名片假名,一般不会用
115115
export const REGEX_HF_KANA_HIRA = /[\uff66-\uff9d]/u;
116116
export const strHasHFKanaHira = (str) => REGEX_HF_KANA_HIRA.test(str);
117117

@@ -148,7 +148,7 @@ export const strHasChineseHan3500 = (str) => REGEX_CHINESE_HAN_3500_ANY.test(str
148148
//-------------------------------------------------------------------------------
149149

150150
// 定义一个正则表达式,用于匹配包含任何Unicode朝鲜语字符的字符串
151-
export const REGEX_HAS_HANGUL = /[\p{sc=Hang}]/u;
151+
export const REGEX_HAS_HANGUL = /[\p{sc=Hangul}]/u;
152152
/**
153153
* 检查字符串中是否包含朝鲜语字符
154154
* @param {string} str 需要检查的字符串
@@ -157,7 +157,7 @@ export const REGEX_HAS_HANGUL = /[\p{sc=Hang}]/u;
157157
export const strHasHangul = (str) => REGEX_HAS_HANGUL.test(str);
158158

159159
// 定义一个正则表达式,用于匹配仅包含Unicode朝鲜语字符的字符串
160-
export const REGEX_ONLY_HANGUL = /^[\p{sc=Hang}]+$/u;
// Matches strings made only of Hangul plus common filename characters
// (space, ASCII alphanumerics, '_', '.', '-', and Latin-1 ¡..· punctuation).
// Fixed: '\\u00a1'/'\\u00b7' double-escaped the code-point escapes inside a
// regex literal, and '\.-\\' created an accidental '.'–'\' character range,
// so the original also matched backslashes and assorted ASCII punctuation.
export const REGEX_ONLY_HANGUL = /^[ A-Za-z0-9_.\-\u00a1-\u00b7\p{sc=Hangul}]+$/u;
161161
/**
162162
* 检查字符串是否仅由朝鲜语字符组成
163163
* @param {string} str 需要检查的字符串

0 commit comments

Comments
 (0)