Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 41 additions & 41 deletions kakaotalk_msg_preprocessor/kakaotalk_msg_preprocessor.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@
import re
from datetime import datetime
from typing import List, Dict
from pathlib import Path

# Regular expressions for the date/time stamps found in KakaoTalk export files.
# They are used to detect which platform/language produced a file and to split
# the file into individual messages.
# Raw strings are required: `\S`, `\[` and `\]` are regex escapes, not Python
# string escapes, and non-raw literals trigger invalid-escape warnings.
kakaotalk_datetime_pattern_dict = {
    # Windows/Korean date separator line,
    # e.g. "--------------- 2020년 6월 28일 일요일 ---------------"
    'window_ko_date': r"-{15} [0-9]{4}년 [0-9]{1,2}월 [0-9]{1,2}일 \S요일 -{15}",
    # Windows/Korean message header, e.g. "[김한길] [오후 2:15] ..."
    'window_ko_time': r"((\[)([^\[])+(\])) ((\[오)\S [0-9]{1,2}:[0-9]{1,2}(\]))",
    # Android/Korean timestamp, e.g. "2020년 6월 28일 오후 2:15"
    'android_ko': r"([0-9]){4}년 ([0-9]){1,2}월 ([0-9]){1,2}일 오\S ([0-9]){1,2}:([0-9]){1,2}",
    # Android/English timestamp, e.g. "June 28, 2020, 2:15 PM".
    # `[A-Za-z]` instead of `[A-z]`: the latter also matches "[", "\", "]",
    # "^", "_" and "`", which sit between "Z" and "a" in ASCII.
    'android_en': r"([A-Za-z])+ ([0-9]){1,2}, ([0-9]){4}, ([0-9]){1,2}:([0-9]){1,2} \SM",
}

def check_export_file_type(file_path,
datetime_pattern_dict=kakaotalk_datetime_pattern_dict):
# These patterns are used to detect which platform/language produced an export
# file and to split the file into individual messages.
# Raw strings are required: `\S`, `\[` and `\]` are regex escapes, not Python
# string escapes, and non-raw literals trigger invalid-escape warnings.
kakaotalk_datetime_pattern_dict: Dict[str, str] = {
    # Windows/Korean date separator line,
    # e.g. "--------------- 2020년 6월 28일 일요일 ---------------"
    'window_ko_date': r"-{15} [0-9]{4}년 [0-9]{1,2}월 [0-9]{1,2}일 \S요일 -{15}",
    # Windows/Korean message header, e.g. "[김한길] [오후 2:15] ..."
    'window_ko_time': r"((\[)([^\[])+(\])) ((\[오)\S [0-9]{1,2}:[0-9]{1,2}(\]))",
    # Android/Korean timestamp, e.g. "2020년 6월 28일 오후 2:15"
    'android_ko': r"([0-9]){4}년 ([0-9]){1,2}월 ([0-9]){1,2}일 오\S ([0-9]){1,2}:([0-9]){1,2}",
    # Android/English timestamp, e.g. "June 28, 2020, 2:15 PM".
    # `[A-Za-z]` instead of `[A-z]`: the latter also matches "[", "\", "]",
    # "^", "_" and "`", which sit between "Z" and "a" in ASCII.
    'android_en': r"([A-Za-z])+ ([0-9]){1,2}, ([0-9]){4}, ([0-9]){1,2}:([0-9]){1,2} \SM",
}


def check_export_file_type(file_path: str | Path,
datetime_pattern_dict: Dict[str, str] = kakaotalk_datetime_pattern_dict) -> str:
"""
Check the device type and language of kakaotalk_export_file.
It is done based on datetime patterns in file

Parameters
----------
file_path: string
Expand All @@ -32,28 +35,26 @@ def check_export_file_type(file_path,
# kakaotalk_include_date_pattern_dict = {'pc_ko': "([0-9]){4}-([0-9]){1,2}-([0-9]){1,2} ([0-9]){1,2}:([0-9]){1,2}",
# 'mobile_ko': "([0-9]){4}년 ([0-9]){1,2}월 ([0-9]){1,2}일 오\S ([0-9]){1,2}:([0-9]){1,2}",
# 'mobile_en': "([A-z])+ ([0-9]){1,2}, ([0-9]){4}, ([0-9]){1,2}:([0-9]){1,2} \SM",}
with open(file_path, 'r') as f:

with open(file_path, 'r', encoding='utf-8') as f:
for counter in range(5):
line = f.readline()
if not line: break
if not line:
break

for file_type, pattern in datetime_pattern_dict.items():
if re.search(pattern, line):

return '_'.join(file_type.split('_')[:2])

print("Error: Cannot know the device type and language of the file.\n",
f"Please check the file is a kakaotalk export file or the export enviroment is in among {str(list(kakaotalk_include_date_pattern_dict.keys()))}")

return '_'.join(file_type.split('_')[:2])

print("Error: Cannot know the device type and language of the file.\n",
f"Please check the file is a kakaotalk export file or the export enviroment is in among {str(list(kakaotalk_datetime_pattern_dict.keys()))}")



def _str_to_datetime(file_type, text):
def _str_to_datetime(file_type : str, text: str):
kakaotalk_strptime_pattern_dict = {'ko': '%Y년 %m월 %d일 %p %I:%M',
'en': '%B %d, %Y, %I:%M %p',
}
'en': '%B %d, %Y, %I:%M %p',
}

language = file_type.split('_')[1]
if language == 'ko':
Expand All @@ -64,12 +65,12 @@ def _str_to_datetime(file_type, text):
return text_dt


def parse(file_type, file_path,
datetime_pattern_dict=kakaotalk_datetime_pattern_dict):
def parse(file_type: str, file_path: str | Path,
datetime_pattern_dict: Dict[str, str]=kakaotalk_datetime_pattern_dict) -> List[Dict[str, str]]:
"""
Parsing the text from a kaotalk_export_file.
This parser divide messages based on datetime_pattern.

Parameters
----------
file_type: string
Expand All @@ -88,14 +89,13 @@ def parse(file_type, file_path,
And it has keys, 'datetime,'user_name' and 'text'.
"""


msgs = []

if file_type == 'window_ko': # window
date_pattern = datetime_pattern_dict['window_ko_date']
time_pattern = datetime_pattern_dict['window_ko_time']

with open(file_path) as file:
with open(file_path, encoding='utf-8') as file:
# 줄바꿈되어있는 경우도 묶어주기 위해 buffer 사용
buffer = ''
date = ''
Expand All @@ -104,17 +104,17 @@ def parse(file_type, file_path,
# window파일의 데이트str(--------------- 2020년 6월 28일 일요일 ---------------)이거나 시간 str([김한길] [오후 2:15] htt)이면
if re.match(date_pattern, line) or re.match(time_pattern, line):
# buffer가 time_pattern으로 시작하는 경우만 추가해주기
if re.match(time_pattern, buffer):
if re.match(time_pattern, buffer):
buffer_tokens = buffer.split(']', maxsplit=2)
user_name = buffer_tokens[0].replace('[', '').strip()
time = buffer_tokens[1].replace('[', '').strip()
my_datetime = _str_to_datetime(file_type, f"{date} {time}")
text = buffer_tokens[2].strip()

msgs.append({'datetime': my_datetime,
'user_name': user_name,
'text': text
})
'user_name': user_name,
'text': text
})

if re.match(date_pattern, line): # window파일의 데이트str이면
date = line.replace('-', '').strip().rsplit(" ", 1)[0]
Expand All @@ -128,14 +128,14 @@ def parse(file_type, file_path,
else: # android
datetime_pattern = datetime_pattern_dict[file_type]
msg_exist_check_pattern = datetime_pattern + ",.*:"
with open(file_path) as file:

with open(file_path, encoding='utf-8') as file:
# 줄바꿈되어있는 경우도 저장하기 위해 buffer 사용
buffer=''
buffer = ''
for line in file:
if re.match(datetime_pattern, line):
if re.match(msg_exist_check_pattern, buffer):

temp_01_2_tokens = buffer.split(" : ", maxsplit=1)
temp_0_1_tokens = temp_01_2_tokens[0].rsplit(",", maxsplit=1)

Expand All @@ -145,13 +145,13 @@ def parse(file_type, file_path,
text = temp_01_2_tokens[1].strip()
msgs.append({'datetime': my_datetime,
'user_name': user_name,
'text': text
})
'text': text
})

buffer = line
else:
buffer += line

return msgs


Expand All @@ -171,7 +171,7 @@ def url_msg_extract(file_type, msgs):
for url in urls:
url_msgs.append({'datetime': msg['datetime'],
'user_name': msg['user_name'],
'url': ''.join(url)
})
'url': ''.join(url)
})

return url_msgs
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[metadata]
description-file = README.md
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
setup(
name = 'kakaotalk_msg_preprocessor',
description = 'Preprocessor for kakaotalk message exported file',
long_description = open('README.md').read(),
long_description = open('README.md', encoding='utf-8').read(),
long_description_content_type="text/markdown",
version = '0.13',
license = 'MIT',
Expand Down