@@ -101,6 +101,37 @@ def get_image_list(result_list: list, zip_files: List[str]):
101101 return image_file_list
102102
103103
def get_image_list_by_content(name: str, content: str, zip_files: List[str]):
    """Collect the archive images referenced by one markdown document.

    Every markdown image reference found in ``content`` is resolved against
    ``name`` (the document's own path inside the archive). References whose
    resolved path is not a member of the archive are left untouched.

    Args:
        name: Path of the markdown document inside the archive; used as the
            base when resolving image references.
        content: Markdown text of the document.
        zip_files: Names of all members of the archive.

    Returns:
        A ``(image_file_list, content)`` tuple. ``image_file_list`` contains
        one ``{'source_file': ..., 'image_id': ...}`` dict per resolved image;
        ``content`` is the markdown with every reference that received a new
        id rewritten to ``./oss/file/<image_id>``.
    """
    image_file_list = []
    # Build the lookup once instead of scanning the list for every image.
    archive_members = set(zip_files)
    for image in parse_md_image(content):
        search = re.search(r"\(.*\)", image)
        if not search:
            continue
        # Drop the parentheses and keep only the first token, so an optional
        # title such as (path "title") is ignored.
        source_image_path = search.group().replace('(', '').replace(')', '')
        source_image_path = source_image_path.strip().split(" ")[0]
        if not source_image_path:
            # An empty target would resolve to the document itself and
            # str.replace('') would insert text between every character.
            continue
        # A leading '/' is prefixed with '.' so that urljoin resolves it
        # relative to the document's directory instead of discarding the base.
        relative_path = ('.' + source_image_path
                         if source_image_path.startswith('/')
                         else source_image_path)
        image_path = urljoin(name, relative_path)
        if image_path not in archive_members:
            continue

        # Reuse the id embedded in an oss/file/<uuid> or oss/image/<uuid>
        # path when it is a valid uuid; such references are not rewritten.
        image_id = None
        if image_path.startswith(('oss/file/', 'oss/image/')):
            candidate = image_path.replace('oss/file/', '').replace('oss/image/', '')
            if is_valid_uuid(candidate):
                image_id = candidate

        if image_id is None:
            image_id = str(uuid.uuid7())
            content = content.replace(source_image_path, f'./oss/file/{image_id}')

        image_file_list.append({'source_file': image_path,
                                'image_id': image_id})

    return image_file_list, content
133+
134+
104135def get_file_name (file_name ):
105136 try :
106137 file_name_code = file_name .encode ('cp437' )
@@ -171,17 +202,12 @@ def get_content(self, file, save_image):
171202 """
172203 buffer = file .read () if hasattr (file , 'read' ) else None
173204 bytes_io = io .BytesIO (buffer ) if buffer is not None else io .BytesIO (file )
174- md_items = [] # 存储 (md_text, source_file_path)
175- image_mode_list = []
176-
177- import posixpath
178-
179- def is_image_name (name : str ):
180- ext = posixpath .splitext (name .lower ())[1 ]
181- return ext in ('.png' , '.jpg' , '.jpeg' , '.gif' , '.bmp' , '.webp' , '.svg' )
205+ image_list = []
206+ content_parts = []
182207
183208 with zipfile .ZipFile (bytes_io , 'r' ) as zip_ref :
184209 files = zip_ref .namelist ()
210+ file_content_list = []
185211 for inner_name in files :
186212 if inner_name .endswith ('/' ) or inner_name .startswith ('__MACOSX' ):
187213 continue
@@ -190,77 +216,35 @@ def is_image_name(name: str):
190216 real_name = get_file_name (zf .name )
191217 except Exception :
192218 real_name = zf .name
193- raw = zf .read ()
194- # 图片直接收集
195- if is_image_name (real_name ):
196- image_id = str (uuid .uuid7 ())
197- fmodel = File (
198- id = image_id ,
199- file_name = os .path .basename (real_name ),
200- meta = {'debug' : False , 'content' : raw }
201- )
202- image_mode_list .append (fmodel )
203- continue
204219
205220 # 为 split_handle 提供可重复读取的 file-like 对象
206- inner_file = io .BytesIO (raw )
207- inner_file .name = real_name
208-
209- # 尝试使用已注册的 split handle 的 get_content
210- md_text = None
221+ zf .name = real_name
211222 for split_handle in split_handles :
212223 # 准备一个简单的 get_buffer 回调,返回当前 raw
213- get_buffer = lambda f , _raw = raw : _raw
214- if split_handle .support (inner_file , get_buffer ):
215- inner_file .seek (0 )
216- md_text = split_handle .get_content (inner_file , save_image )
224+ get_buffer = FileBufferHandle ().get_buffer
225+ if split_handle .support (zf , get_buffer ):
226+ row = get_buffer (zf )
227+ md_text = split_handle .get_content (io .BytesIO (row ), save_image )
228+ file_content_list .append ({'content' : md_text , 'name' : real_name })
217229 break
218-
219- # 如果没有任何 split_handle 处理,按文本解码作为后备
220- if md_text is None :
221- enc = detect (raw ).get ('encoding' ) or 'utf-8'
222- try :
223- md_text = raw .decode (enc , errors = 'ignore' )
224- except Exception :
225- md_text = raw .decode ('utf-8' , errors = 'ignore' )
226-
227- if isinstance (md_text , str ) and md_text .strip ():
228- # 保存 md 文本与其所在的文件路径,后面统一做图片路径替换
229- md_items .append ((md_text , real_name ))
230+ for file_content in file_content_list :
231+ _image_list , content = get_image_list_by_content (file_content .get ('name' ), file_content .get ("content" ),
232+ files )
233+ content_parts .append (content )
234+ for image in _image_list :
235+ image_list .append (image )
230236
231237 # 将收集到的图片通过回调保存(一次性)
232- if image_mode_list :
238+ if image_list :
239+ image_mode_list = []
240+ for image in image_list :
241+ with zip_ref .open (image .get ('source_file' )) as f :
242+ i = File (
243+ id = image .get ('image_id' ),
244+ file_name = os .path .basename (image .get ('source_file' )),
245+ meta = {'debug' : False , 'content' : f .read ()} # 这里的content是二进制数据
246+ )
247+ image_mode_list .append (i )
233248 save_image (image_mode_list )
234249
235- # 后处理:在每个 md 片段中将相对/绝对引用替换为已保存图片的 oss 路径
236- content_parts = []
237- for md_text , base_name in md_items :
238- image_refs = parse_md_image (md_text )
239- for image in image_refs :
240- search = re .search (r"\(.*\)" , image )
241- if not search :
242- continue
243- source_image_path = search .group ().strip ("()" ).split (" " )[0 ]
244-
245- # 规范化 zip 内部路径:若以 '/' 开头,视为相对于 zip 根,否则相对于 base_name 的目录
246- if source_image_path .startswith ('/' ):
247- joined = posixpath .normpath (source_image_path .lstrip ('/' ))
248- else :
249- base_dir = posixpath .dirname (base_name )
250- joined = posixpath .normpath (posixpath .join (base_dir , source_image_path ))
251-
252- # 匹配已收集图片:以文件名做匹配(zip 中的文件名通常是不含反斜杠的 POSIX 风格)
253- matched = None
254- for img_model in image_mode_list :
255- if img_model .file_name == posixpath .basename (joined ):
256- matched = img_model
257- break
258-
259- if matched :
260- md_text = md_text .replace (source_image_path , f'./oss/file/{ matched .id } ' )
261-
262- content_parts .append (md_text )
263-
264250 return '\n \n ' .join (content_parts )
265-
266-
0 commit comments