Skip to content

Commit 6926b45

Browse files
committed
add cmd decode (guess encoding)
1 parent a4a5544 commit 6926b45

File tree

6 files changed

+167
-153
lines changed

6 files changed

+167
-153
lines changed

cmd/cmd_decode.js

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
2+
import chardet from 'chardet';
3+
import * as log from '../lib/debug.js';
4+
import * as enc from '../lib/encoding.js';
5+
import * as unicode from '../lib/unicode.js';
6+
7+
8+
// Candidate encodings tried when no explicit --from-enc/--to-enc is given.
const ENC_LIST = [
    'ISO-8859-1', // Latin-1, frequent mojibake source
    'UTF8',
    'UTF-16',
    'GBK',        // Simplified Chinese
    'BIG5',       // Traditional Chinese
    'SHIFT_JIS',  // Japanese
    'EUC-JP',     // Japanese
    'EUC-KR',     // Korean
]
18+
19+
export { aliases, builder, command, describe, handler };
20+
const command = "decode <strings...>"
21+
const aliases = ["dc"]
22+
const describe = 'Decode text with messy or invalid chars'
23+
24+
/**
 * Configure yargs options for the decode command.
 * @param {object} ya - yargs instance
 * @param {boolean} helpOrVersionSet - supplied by yargs, unused here
 * @returns {object} the configured yargs instance
 */
const builder = function addOptions(ya, helpOrVersionSet) {
    return ya
        .positional('strings', {
            describe: 'string list to decode',
            type: 'string',
        })
        // used to repair garbled (mojibake) file names
        .option("from-enc", {
            alias: "f",
            // 'choices' is not a yargs type; declare a string option that
            // is restricted by the `choices` list instead
            type: "string",
            choices: ['utf8', 'gbk', 'shift_jis', 'big5', 'euc-kr'],
            description: "from encoding name eg. utf8|gbk|shift_jis",
        })
        .option("to-enc", {
            alias: "t",
            type: "string",
            choices: ['utf8', 'gbk', 'shift_jis', 'big5', 'euc-kr'],
            description: "to encoding name eg. utf8|gbk|shift_jis",
        })
    // NOTE: removed the stray trailing `.po` property access, which made
    // this builder return undefined instead of the yargs instance
}
44+
45+
/**
 * Command handler: try to decode each input string against candidate
 * encoding pairs and print diagnostics plus the best candidate.
 * @param {object} argv - parsed yargs arguments (strings, fromEnc, toEnc)
 * @throws {Error} when no input strings are provided
 */
const handler = async function cmdDecode(argv) {
    const logTag = "cmdDecode";
    log.info(logTag, 'Args:', argv);
    const strArgs = argv.strings;
    // original check `strArgs?.length === 0` silently passed when
    // `strings` was undefined; reject missing input as well
    if (!strArgs || strArgs.length === 0) {
        throw new Error(`text input required`);
    }
    // an explicit --from-enc/--to-enc narrows the candidate list to one
    const fromEnc = argv.fromEnc?.length > 0 ? [argv.fromEnc] : ENC_LIST;
    const toEnc = argv.toEnc?.length > 0 ? [argv.toEnc] : ENC_LIST;
    // verbose mode shows every candidate, otherwise only confident ones
    const threshold = log.isVerbose() ? 1 : 50;
    log.show(logTag, `Input:`, strArgs)
    log.show(logTag, `fromEnc:`, JSON.stringify(fromEnc))
    log.show(logTag, `toEnc:`, JSON.stringify(toEnc))

    for (const str of strArgs) {
        log.show(logTag, 'TryDecoding:', [str])
        const results = decodeText(str, fromEnc, toEnc, threshold)
        results.forEach(showResults)
        log.show('INPUT:', [str, str.length])
        // decodeText returns candidates reversed (top-ranked last),
        // so pop() yields the best candidate; fixed 'OUPUT' typo
        log.show('OUTPUT:', results.pop())
        console.log()
    }
}
68+
69+
/**
 * Decode a string against candidate from/to encoding pairs.
 * Delegates to enc.tryDecodeText, then returns the candidate list in
 * reverse order so the last element is the top-ranked candidate.
 */
function decodeText(str, fromEnc = ENC_LIST, toEnc = ENC_LIST, threhold = 50) {
    const candidates = enc.tryDecodeText(str, fromEnc, toEnc, threhold);
    candidates.reverse();
    return candidates;
}
73+
74+
/**
 * Print diagnostic details for one decode candidate: chardet guesses,
 * raw characters and code points, and script/messy-character checks.
 * @param {Array} r - candidate tuple; r[0] is the decoded string
 */
function showResults(r) {
    log.info(`-`)
    const str = r[0]
    const print = (label, value) => log.info(label.padEnd(16, ' '), value)
    log.show('Result:', str.padEnd(16, ' '), r.slice(1))
    // only report chardet guesses with reasonable confidence
    const guesses = chardet.analyse(Buffer.from(str)).filter(ct => ct.confidence >= 70)
    if (guesses?.length > 0) {
        print('Encoding', guesses)
    }
    const chars = Array.from(str)
    print('String', chars)
    print('Unicode', chars.map(c => c.codePointAt(0).toString(16)))
    if (enc.checkBadUnicode(str)?.length > 0) {
        log.info(`badUnicode=true`)
    }
    log.info(`MESSY_UNICODE=${enc.REGEX_MESSY_UNICODE.test(str)}`,
        `MESSY_CJK=${enc.REGEX_MESSY_CJK.test(str)}`,
        `MESSY_CJK_EXT=${enc.REGEX_MESSY_CJK_EXT.test(str)}`)
    log.info(`OnlyJapanese=${unicode.strOnlyJapanese(str)}`,
        `OnlyJpHan=${unicode.strOnlyJapaneseHan(str)}`,
        `HasHiraKana=${unicode.strHasHiraKana(str)}`)
    log.info(`HasHangul=${unicode.strHasHangul(str)}`,
        `OnlyHangul=${unicode.strOnlyHangul(str)}`)
    log.info(`HasChinese=${unicode.strHasChinese(str)}`,
        `OnlyChinese=${unicode.strOnlyChinese(str)}`,
        `OnlyChn3500=${enc.RE_CHARS_MOST_USED.test(str)}`)
}

cmd/cmd_fixname.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -187,11 +187,11 @@ async function fixFileName(f) {
187187
if (argv.encoding) {
188188
// 执行文件路径乱码修复操作
189189
// 对路径进行中日韩文字编码修复
190-
let [fs, ft] = enc.fixCJKEnc(oldBase);
190+
let [fs, ft] = enc.decodeText(oldBase);
191191
oldBase = fs.trim();
192192
// 将目录路径分割,并对每个部分进行编码修复
193193
const dirNamesFixed = oldDir.split(path.sep).map(s => {
194-
let [rs, rt] = enc.fixCJKEnc(s)
194+
let [rs, rt] = enc.decodeText(s)
195195
return rs.trim();
196196
});
197197
// 重新组合修复后的目录路径

lib/encoding.js

Lines changed: 55 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,10 @@
1-
import dayjs from 'dayjs';
2-
import fs from 'fs-extra';
1+
32
import iconv from 'iconv-lite';
4-
import os from 'os';
5-
import path from 'path';
63
import * as log from './debug.js';
7-
import { strHasASCII, strHasHFKanaHira, strHasHiraKana, strOnlyASCII, strOnlyChinese, strOnlyJapanese, strOnlyJapaneseHan } from './unicode.js';
4+
import { strHasASCII, strHasHFKanaHira, strHasHiraKana, strOnlyASCII, strOnlyChinese, strOnlyHangul, strOnlyJapanese, strOnlyJapaneseHan } from './unicode.js';
85
import { CHINESE_CHARS_3500, MESSY_CJK_CHARS as MESSY_CJK_CHARS_ } from './unicode_data.js';
96

10-
// https://github.com/bnoordhuis/node-iconv/
7+
// https://github.com/bnoordhuis/node-iconv/
118
const ENCODING_FROM = [
129
'SHIFT_JIS',
1310
'GBK',
@@ -26,8 +23,6 @@ const ENCODING_TO = [
2623
// 'EUC-KR',
2724
]
2825

29-
const ENCODING_TRY = ['SHIFT_JIS', 'UTF8']
30-
3126
export const MESSY_CJK_CHARS = MESSY_CJK_CHARS_
3227

3328
export const REGEX_MESSY_CJK = new RegExp(`[${MESSY_CJK_CHARS}]`, 'u')
@@ -43,9 +38,6 @@ export function charUnique(str) {
4338
return String.prototype.concat.call(...new Set(str));
4439
}
4540

46-
const nowDateStr = dayjs().format("YYYYMMDDHHmmss");
47-
const tempfile = path.join(os.tmpdir(), `z_mediac_log_${nowDateStr}.txt`)
48-
4941
export function checkBadUnicode(str) {
5042
const results = []
5143
if (str.includes('?') || str.includes('\ufffd')) {
@@ -76,6 +68,10 @@ export function checkBadUnicode(str) {
7668
// 乱码标志 Unicode私有区
7769
results.push([true, 5, `私有区`])
7870
}
71+
if (/[\ufb50-\ufdff\ufe70-\ufeff]/u.test(str)) {
72+
// 乱码标志 阿拉伯字符
73+
results.push([true, 5, `阿拉伯字符`])
74+
}
7975
if (/[\uff66-\uff9d]/u.test(str)) {
8076
// 暂时忽略,还比较常用
8177
// 乱码标志 半角平假名片假名
@@ -96,8 +92,8 @@ export function hasBadCJKChar(str) {
9692
return REGEX_MESSY_CJK.test(str) || REGEX_MESSY_CJK_EXT.test(str)
9793
}
9894

99-
export function fixCJKEnc(str) {
100-
let results = fixCJKEncImpl(str)
95+
export function decodeText(str) {
96+
let results = tryDecodeText(str)
10197
results = results.filter(r => r[2] >= 0).sort((a, b) => b[2] - a[2])
10298
log.debug('==================================')
10399
log.debug(str)
@@ -109,110 +105,110 @@ export function fixCJKEnc(str) {
109105
return results[0] || [str, false, 0, 'fallback'];
110106
}
111107

112-
export function fixCJKEncImpl(str,
108+
export function tryDecodeText(str,
113109
fromEnc = ENCODING_FROM,
114110
toEnc = ENCODING_TO,
115111
threhold = 10) {
116112
if (str.includes('?') || str.includes('\ufffd')) {
117-
return [[str, false, 0, '信息丢失', ''],]
113+
return [[str, false, 0, '[乱码字符]'],]
118114
}
119115

116+
fromEnc = fromEnc.map(x => x.toLowerCase())
117+
toEnc = toEnc.map(x => x.toLowerCase())
118+
120119
let results = []
121120
if (strOnlyASCII(str)) {
122121
// results.push([str, false, 0])
123-
return [[str, false, 0, '全英文数字', ''],]
122+
return [[str, false, 100, '[ASCII]'],]
124123
}
124+
const messyUnicode = REGEX_MESSY_UNICODE.test(str)
125+
const messyCJK = REGEX_MESSY_CJK.test(str)
126+
const messyCJKExt = REGEX_MESSY_CJK_EXT.test(str)
125127
log.info('---------------------')
126-
log.info('fixCJKEnc', str)
127-
if (!REGEX_MESSY_UNICODE.test(str)
128-
&& !REGEX_MESSY_CJK.test(str)
129-
&& !REGEX_MESSY_CJK_EXT.test(str)) {
130-
if (RE_CHARS_MOST_USED.test(str)) {
131-
results.push([str, false, 100, '常用汉字0', ''])
132-
}
133-
// else if (strOnlyChinese(str)) {
134-
// results.push([str, false, 99, '全中文01', ''])
135-
// }
136-
else if (strHasHFKanaHira(str)) {
137-
// 包含不用的全角半角平假名片假名
138-
results.push([str, false, 65, '含半角假名0', ''])
139-
}
140-
else {
141-
// fs.appendFileSync(tempfile, str + '\n')
142-
return [[str, false, 0, '忽略0', ''],]
143-
}
144-
} else {
128+
log.info('tryDecodeText', str)
129+
if (messyUnicode || messyCJK || messyCJKExt) {
145130
if (strOnlyChinese(str) && !REGEX_MESSY_CJK_EXT.test(str)) {
146-
return [[str, false, 0, `全中文02`, `${REGEX_MESSY_UNICODE.test(str)}`],]
131+
return [[str, false, 100, `[全中文]`],]
147132
}
148133
}
149-
if ((strHasHiraKana(str) || strHasASCII(str))
150-
&& strOnlyJapanese(str) && !REGEX_MESSY_CJK.test(str)) {
151-
results.push([str, false, 99, '全日文01', ''])
134+
if (RE_CHARS_MOST_USED.test(str)) {
135+
results.push([str, false, 100, '[常用汉字]'])
136+
}
137+
else if (strHasHFKanaHira(str)) {
138+
// 包含不用的全角半角平假名片假名
139+
results.push([str, false, 65, '[半角假名]'])
140+
}
141+
else {
142+
// fs.appendFileSync(tempfile, str + '\n')
143+
return [[str, false, 0, '[无乱码]', ''],]
144+
}
145+
146+
if (!!REGEX_MESSY_CJK.test(str)
147+
&& (strHasHiraKana(str) || strHasASCII(str))
148+
&& strOnlyJapanese(str)) {
149+
results.push([str, false, 99, '[全日文1]'])
152150
}
153151
else if (strOnlyJapanese(str)) {
154-
results.push([str, false, 80, '全日文02', ''])
152+
results.push([str, false, 80, '[全日文2]'])
155153
}
156-
// log.showRed(str)
157-
// log.show(Array.from(str).map(c => c.codePointAt(0).toString(16)).join(' '))
154+
158155
for (const enc1 of fromEnc) {
159156
for (const enc2 of toEnc) {
157+
// 忽略解码编码相同的情况
160158
if (enc1 === enc2) { continue }
161159
try {
162160
const strBuffer = iconv.encode(str, enc1)
163161
let strDecoded = iconv.decode(strBuffer, enc2)
164162
const badDecoded = checkBadUnicode(strDecoded)
165163
// const strCleaned = strDecoded.replaceAll(/[\ufffd\u0020]/ugi, '')
166-
log.info(enc1, enc2, strDecoded, badDecoded)
164+
log.debug(enc1, enc2, strDecoded, badDecoded)
167165
// 如果含有乱码字符
168166
if (badDecoded?.length > 0) {
169167
for (const item of badDecoded) {
170168
results.push([strDecoded, ...item, `${enc1}=>${enc2}`])
171169
}
172-
173170
continue;
174171
}
175-
// log.showRed('========')
176-
// log.showRed(str)
177-
// log.showGreen(Array.from(str).map(c => c.codePointAt(0).toString(16)))
178-
// log.show(strDecoded, enc1, enc2)
179-
180172
const onlyASCII = strOnlyASCII(strDecoded)
181173
const onlyCN = strOnlyChinese(strDecoded)
182174
const onlyJP = strOnlyJapanese(strDecoded)
183175
const onlyJPHan = strOnlyJapaneseHan(strDecoded)
176+
const onlyKR = strOnlyHangul(strDecoded)
184177
const hasHiraKana = strHasHiraKana(strDecoded)
185178
const hasHFHiraKana = strHasHFKanaHira(strDecoded)
186179
const messyUnicode = REGEX_MESSY_UNICODE.test(strDecoded)
187180
const messyCJK = REGEX_MESSY_CJK.test(strDecoded)
188181
const messyCJKExt = REGEX_MESSY_CJK_EXT.test(strDecoded)
189182

190-
log.debug(strDecoded, onlyASCII, onlyCN, onlyJP, onlyJPHan, messyCJK)
191-
log.debug(strDecoded, hasHiraKana, hasHFHiraKana, messyUnicode, messyCJK)
183+
log.debug(strDecoded, 'cn', onlyCN, 'jp', onlyJP, 'jhan', onlyJPHan, 'kr', onlyKR)
184+
log.debug(strDecoded, 'hk', hasHiraKana, 'hf', hasHFHiraKana, 'mu', messyUnicode, 'mc', messyCJK)
192185

193186
if (onlyASCII && !strDecoded.includes('?')) {
194-
results.push([strDecoded, true, 99, `全英文数字`, `${enc1}=>${enc2}`])
187+
results.push([strDecoded, true, 99, `ASCII`, `${enc1}=>${enc2}`])
195188
break
196189
}
197190
if (RE_CHARS_MOST_USED.test(strDecoded)) {
198191
results.push([strDecoded, true, 99, `常用汉字`, `${enc1}=>${enc2}`])
199192
break
200193
}
201-
if (messyCJK || messyCJKExt) {
202-
results.push([strDecoded, true, 50, `CJK罕见`, `${enc1}=>${enc2}`])
203-
204-
}
205194
if (onlyJP) {
206-
if (strHasHiraKana(strDecoded) || onlyJPHan) {
195+
if (!strHasHiraKana(strDecoded) && !onlyJPHan) {
207196
results.push([strDecoded, true, 78, `日文字符`, `${enc1}=>${enc2}`])
208197
}
209198
}
210199
else if (onlyCN) {
211200
results.push([strDecoded, true, 76, `中文字符`, `${enc1}=>${enc2}`])
212201
}
213-
else if (strHasHFKanaHira || strHasHiraKana) {
202+
else if (hasHiraKana || hasHFHiraKana) {
214203
results.push([strDecoded, true, 65, `含日文假名`, ` ${enc1}=>${enc2}`])
215204
}
205+
else if (onlyKR) {
206+
results.push([strDecoded, true, 62, `韩文字符`, `${enc1}=>${enc2}`])
207+
}
208+
else if (messyCJK || messyCJKExt) {
209+
results.push([strDecoded, true, 51, `生僻字`, `${enc1}=>${enc2}`])
210+
// continue
211+
}
216212
else {
217213
results.push([strDecoded, true, 60, `正常转换 ${onlyCN} ${onlyJP}`, ` ${enc1}=>${enc2}`])
218214
}
@@ -222,7 +218,7 @@ export function fixCJKEncImpl(str,
222218
}
223219
}
224220
}
225-
results.push([str, false, 70, '原始字符串'])
221+
results.push([str, false, 70, '原始值'])
226222
results = results.filter(r => r[2] >= threhold).sort((a, b) => b[2] - a[2])
227223
log.debug(results.slice(3))
228224
return results;

lib/unicode.js

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ export const strHasJapanese = (str) => REGEX_JAPANESE.test(str);
4444
export const strOnlyJapanese = (str) => REGEX_JAPANESE_ONLY.test(str);
4545

4646
// 导出一个正则表达式,用于判断字符串中是否包含平假名或片假名
47-
export const REGEX_HAS_HIRA_OR_KANA = /[\p{sc=Hira}\p{sc=Kana}]/u;
// Matches any Hiragana (U+3040–U+309F) or Katakana (U+30A0–U+30FF) character.
// Fixed: the original class was [\u3040-\u30ff}] — the stray '}' made the
// regex also match a literal '}' character.
export const REGEX_HAS_HIRA_OR_KANA = /[\u3040-\u30ff]/u;
4848
/**
4949
* 判断给定字符串中是否包含平假名或片假名
5050
* @param {string} str 需要进行判断的字符串
@@ -53,7 +53,7 @@ export const REGEX_HAS_HIRA_OR_KANA = /[\p{sc=Hira}\p{sc=Kana}]/u;
5353
export const strHasHiraKana = (str) => REGEX_HAS_HIRA_OR_KANA.test(str);
5454

5555
// 导出一个正则表达式,用于判断字符串是否只包含平假名或片假名
56-
export const REGEX_ONLY_HIRA_OR_KANA = /^[\p{sc=Hira}\p{sc=Kana}]+$/u;
56+
export const REGEX_ONLY_HIRA_OR_KANA = /^[\u3040-\u30ff]+$/u;
5757
/**
5858
* 判断给定字符串是否只包含平假名或片假名
5959
* @param {string} str 需要进行判断的字符串
@@ -111,7 +111,7 @@ export const REGEX_UNICODE_HAN_ONLY = /^[\p{sc=Hani}]+$/u;
111111
*/
112112
export const strOnlyHani = (str) => REGEX_UNICODE_HAN_ONLY.test(str);
113113

114-
// 日文半角和全角平假名片假名,一半不会用
114+
// 日文半角和全角平假名片假名,一般不会用
115115
export const REGEX_HF_KANA_HIRA = /[\uff66-\uff9d]/u;
116116
export const strHasHFKanaHira = (str) => REGEX_HF_KANA_HIRA.test(str);
117117

@@ -148,7 +148,7 @@ export const strHasChineseHan3500 = (str) => REGEX_CHINESE_HAN_3500_ANY.test(str
148148
//-------------------------------------------------------------------------------
149149

150150
// 定义一个正则表达式,用于匹配包含任何Unicode朝鲜语字符的字符串
151-
export const REGEX_HAS_HANGUL = /[\p{sc=Hang}]/u;
151+
export const REGEX_HAS_HANGUL = /[\p{sc=Hangul}]/u;
152152
/**
153153
* 检查字符串中是否包含朝鲜语字符
154154
* @param {string} str 需要检查的字符串
@@ -157,7 +157,7 @@ export const REGEX_HAS_HANGUL = /[\p{sc=Hang}]/u;
157157
export const strHasHangul = (str) => REGEX_HAS_HANGUL.test(str);
158158

159159
// 定义一个正则表达式,用于匹配仅包含Unicode朝鲜语字符的字符串
160-
export const REGEX_ONLY_HANGUL = /^[\p{sc=Hang}]+$/u;
// Matches strings made only of Hangul plus common filename characters
// (space, ASCII alphanumerics, '_', '.', '-', and Latin-1 ¡..· punctuation).
// Fixed: '\\u00a1'/'\\u00b7' double-escaped the code-point escapes inside a
// regex literal, and '\.-\\' created an accidental '.'–'\' character range,
// so the original also matched backslashes and assorted ASCII punctuation.
export const REGEX_ONLY_HANGUL = /^[ A-Za-z0-9_.\-\u00a1-\u00b7\p{sc=Hangul}]+$/u;
161161
/**
162162
* 检查字符串是否仅由朝鲜语字符组成
163163
* @param {string} str 需要检查的字符串

0 commit comments

Comments
 (0)