diff --git a/kakaotalk_msg_preprocessor/kakaotalk_msg_preprocessor.py b/kakaotalk_msg_preprocessor/kakaotalk_msg_preprocessor.py index 911c746..d5317ed 100644 --- a/kakaotalk_msg_preprocessor/kakaotalk_msg_preprocessor.py +++ b/kakaotalk_msg_preprocessor/kakaotalk_msg_preprocessor.py @@ -1,20 +1,23 @@ import re from datetime import datetime +from typing import List, Dict +from pathlib import Path # kakaotalk 메시지 중 날짜표현 패턴 -# 이를 사용하여 파일이 추출된 소스와 메시지 구분과 -kakaotalk_datetime_pattern_dict = {'window_ko_date': "-{15} [0-9]{4}년 [0-9]{1,2}월 [0-9]{1,2}일 \S요일 -{15}", - 'window_ko_time': "((\[)([^\[])+(\])) ((\[오)\S [0-9]{1,2}:[0-9]{1,2}(\]))", - 'android_ko': "([0-9]){4}년 ([0-9]){1,2}월 ([0-9]){1,2}일 오\S ([0-9]){1,2}:([0-9]){1,2}", - 'android_en': "([A-z])+ ([0-9]){1,2}, ([0-9]){4}, ([0-9]){1,2}:([0-9]){1,2} \SM", - } - -def check_export_file_type(file_path, - datetime_pattern_dict=kakaotalk_datetime_pattern_dict): +# 이를 사용하여 파일이 추출된 소스와 메시지 구분과 +kakaotalk_datetime_pattern_dict: Dict[str, str] = {'window_ko_date': "-{15} [0-9]{4}년 [0-9]{1,2}월 [0-9]{1,2}일 \S요일 -{15}", + 'window_ko_time': "((\[)([^\[])+(\])) ((\[오)\S [0-9]{1,2}:[0-9]{1,2}(\]))", + 'android_ko': "([0-9]){4}년 ([0-9]){1,2}월 ([0-9]){1,2}일 오\S ([0-9]){1,2}:([0-9]){1,2}", + 'android_en': "([A-z])+ ([0-9]){1,2}, ([0-9]){4}, ([0-9]){1,2}:([0-9]){1,2} \SM", + } + + +def check_export_file_type(file_path: str | Path, + datetime_pattern_dict: Dict[str, str] = kakaotalk_datetime_pattern_dict) -> str: """ Check the device type and language of kakaotalk_export_file. It is done based on datetime patterns in file - + Parameters ---------- file_path: string @@ -32,28 +35,26 @@ def check_export_file_type(file_path, # kakaotalk_include_date_pattern_dict = {'pc_ko': "([0-9]){4}-([0-9]){1,2}-([0-9]){1,2} ([0-9]){1,2}:([0-9]){1,2}", # 'mobile_ko': "([0-9]){4}년 ([0-9]){1,2}월 ([0-9]){1,2}일 오\S ([0-9]){1,2}:([0-9]){1,2}", # 'mobile_en': "([A-z])+ ([0-9]){1,2}, ([0-9]){4}, ([0-9]){1,2}:([0-9]){1,2} \SM",} - - with open(file_path, 'r') as f: + + with open(file_path, 'r', encoding='utf-8') as f: for counter in range(5): line = f.readline() - if not line: break + if not line: + break for file_type, pattern in datetime_pattern_dict.items(): if re.search(pattern, line): - - return '_'.join(file_type.split('_')[:2]) - - print("Error: Cannot know the device type and language of the file.\n", - f"Please check the file is a kakaotalk export file or the export enviroment is in among {str(list(kakaotalk_include_date_pattern_dict.keys()))}") + return '_'.join(file_type.split('_')[:2]) + print("Error: Cannot know the device type and language of the file.\n", + f"Please check the file is a kakaotalk export file or the export enviroment is in among {str(list(kakaotalk_datetime_pattern_dict.keys()))}") - -def _str_to_datetime(file_type, text): +def _str_to_datetime(file_type : str, text: str): kakaotalk_strptime_pattern_dict = {'ko': '%Y년 %m월 %d일 %p %I:%M', - 'en': '%B %d, %Y, %I:%M %p', - } + 'en': '%B %d, %Y, %I:%M %p', + } language = file_type.split('_')[1] if language == 'ko': @@ -64,12 +65,12 @@ def _str_to_datetime(file_type, text): return text_dt -def parse(file_type, file_path, - datetime_pattern_dict=kakaotalk_datetime_pattern_dict): +def parse(file_type: str, file_path: str | Path, + datetime_pattern_dict: Dict[str, str]=kakaotalk_datetime_pattern_dict) -> List[Dict[str, str]]: """ Parsing the text from a kaotalk_export_file. This parser divide messages based on datetime_pattern. - + Parameters ---------- file_type: string @@ -88,14 +89,13 @@ def parse(file_type, file_path, And it has keys, 'datetime,'user_name' and 'text'. """ - msgs = [] if file_type == 'window_ko': # window date_pattern = datetime_pattern_dict['window_ko_date'] time_pattern = datetime_pattern_dict['window_ko_time'] - with open(file_path) as file: + with open(file_path, encoding='utf-8') as file: # 줄바꿈되어있는 경우도 묶어주기 위해 buffer 사용 buffer = '' date = '' @@ -104,17 +104,17 @@ def parse(file_type, file_path, # window파일의 데이트str(--------------- 2020년 6월 28일 일요일 ---------------)이거나 시간 str([김한길] [오후 2:15] htt)이면 if re.match(date_pattern, line) or re.match(time_pattern, line): # buffer가 time_pattern으로 시작하는 경우만 추가해주기 - if re.match(time_pattern, buffer): + if re.match(time_pattern, buffer): buffer_tokens = buffer.split(']', maxsplit=2) user_name = buffer_tokens[0].replace('[', '').strip() time = buffer_tokens[1].replace('[', '').strip() my_datetime = _str_to_datetime(file_type, f"{date} {time}") text = buffer_tokens[2].strip() - + msgs.append({'datetime': my_datetime, - 'user_name': user_name, - 'text': text - }) + 'user_name': user_name, + 'text': text + }) if re.match(date_pattern, line): # window파일의 데이트str이면 date = line.replace('-', '').strip().rsplit(" ", 1)[0] @@ -128,14 +128,14 @@ def parse(file_type, file_path, else: # android datetime_pattern = datetime_pattern_dict[file_type] msg_exist_check_pattern = datetime_pattern + ",.*:" - - with open(file_path) as file: + + with open(file_path, encoding='utf-8') as file: # 줄바꿈되어있는 경우도 저장하기 위해 buffer 사용 - buffer='' + buffer = '' for line in file: if re.match(datetime_pattern, line): if re.match(msg_exist_check_pattern, buffer): - + temp_01_2_tokens = buffer.split(" : ", maxsplit=1) temp_0_1_tokens = temp_01_2_tokens[0].rsplit(",", maxsplit=1) @@ -145,13 +145,13 @@ def parse(file_type, file_path, text = temp_01_2_tokens[1].strip() msgs.append({'datetime': my_datetime, 'user_name': user_name, - 'text': text - }) + 'text': text + }) buffer = line else: buffer += line - + return msgs @@ -171,7 +171,7 @@ def url_msg_extract(file_type, msgs): for url in urls: url_msgs.append({'datetime': msg['datetime'], 'user_name': msg['user_name'], - 'url': ''.join(url) - }) + 'url': ''.join(url) + }) return url_msgs \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..224a779 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[metadata] +description-file = README.md \ No newline at end of file diff --git a/setup.py b/setup.py index 1517216..821421b 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ setup( name = 'kakaotalk_msg_preprocessor', description = 'Preprocessor for kakaotalk message exported file', - long_description = open('README.md').read(), + long_description = open('README.md', encoding='utf-8').read(), long_description_content_type="text/markdown", version = '0.13', license = 'MIT',