Skip to content

Commit fd400f2

Browse files
shaohuzhang1 authored and liuruibin committed
fix: The image uploaded from the workflow knowledge base zip file cannot be parsed (#4503)
1 parent d349fd5 commit fd400f2

File tree

2 files changed

+58
-73
lines changed

2 files changed

+58
-73
lines changed

apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,8 @@ def save_image(image_list):
5353
source_id=meta['application_id'] if meta['application_id'] else meta['knowledge_id'],
5454
meta=meta
5555
)
56-
new_file.save(file_bytes)
56+
if not QuerySet(File).filter(id=new_file.id).exists():
57+
new_file.save(file_bytes)
5758

5859
document_list = []
5960
for doc in document:

apps/common/handle/impl/text/zip_split_handle.py

Lines changed: 56 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,37 @@ def get_image_list(result_list: list, zip_files: List[str]):
101101
return image_file_list
102102

103103

104+
def get_image_list_by_content(name: str, content: str, zip_files: List[str]):
    """Collect the zip-packaged images referenced by one markdown document.

    Every markdown image reference found in ``content`` is resolved relative
    to ``name`` (the path of the document inside the archive).  References
    whose resolved path is not an entry of ``zip_files`` are ignored.  For
    each remaining reference an entry ``{'source_file': ..., 'image_id': ...}``
    is produced, and -- unless the image already lives under ``oss/file/`` or
    ``oss/image/`` with a valid UUID as its name -- the reference inside
    ``content`` is rewritten to ``./oss/file/<new id>``.

    :param name: path of the markdown document inside the zip archive
    :param content: markdown text of the document
    :param zip_files: names of all entries in the zip archive
    :return: ``(image_file_list, content)`` -- the collected image entries
             and the markdown text with rewritten image references
    """
    image_file_list = []
    # A path is rewritten everywhere in ``content`` the first time it is
    # seen, so handling the same reference again would only register an
    # extra image under an id that nothing points to.
    handled_paths = set()
    for image in parse_md_image(content):
        # Raw string: "\(" is an invalid escape sequence in a normal literal.
        search = re.search(r"\(.*\)", image)
        if not search:
            continue
        source_image_path = search.group().replace('(', '').replace(')', '')
        # Drop an optional markdown title: ![alt](path "title")
        source_image_path = source_image_path.strip().split(" ")[0]
        if source_image_path in handled_paths:
            continue
        # A leading '/' is made relative ('./...') before joining with the
        # document path, so the result can be matched against zip entry names.
        if source_image_path.startswith('/'):
            relative_path = '.' + source_image_path
        else:
            relative_path = source_image_path
        image_path = urljoin(name, relative_path)
        if image_path not in zip_files:
            continue
        handled_paths.add(source_image_path)

        if image_path.startswith(('oss/file/', 'oss/image/')):
            # Strip BOTH known prefixes (the second replace previously
            # repeated 'oss/file/', so 'oss/image/<uuid>' was never
            # recognised as an existing id).
            image_id = image_path.replace('oss/file/', '').replace('oss/image/', '')
            if is_valid_uuid(image_id):
                # Already addressed by a valid id: keep the reference as is.
                image_file_list.append({'source_file': image_path,
                                        'image_id': image_id})
                continue

        new_image_id = str(uuid.uuid7())
        image_file_list.append({'source_file': image_path,
                                'image_id': new_image_id})
        content = content.replace(source_image_path, f'./oss/file/{new_image_id}')

    return image_file_list, content
133+
134+
104135
def get_file_name(file_name):
105136
try:
106137
file_name_code = file_name.encode('cp437')
@@ -171,17 +202,12 @@ def get_content(self, file, save_image):
171202
"""
172203
buffer = file.read() if hasattr(file, 'read') else None
173204
bytes_io = io.BytesIO(buffer) if buffer is not None else io.BytesIO(file)
174-
md_items = [] # 存储 (md_text, source_file_path)
175-
image_mode_list = []
176-
177-
import posixpath
178-
179-
def is_image_name(name: str):
180-
ext = posixpath.splitext(name.lower())[1]
181-
return ext in ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.webp', '.svg')
205+
image_list = []
206+
content_parts = []
182207

183208
with zipfile.ZipFile(bytes_io, 'r') as zip_ref:
184209
files = zip_ref.namelist()
210+
file_content_list = []
185211
for inner_name in files:
186212
if inner_name.endswith('/') or inner_name.startswith('__MACOSX'):
187213
continue
@@ -190,77 +216,35 @@ def is_image_name(name: str):
190216
real_name = get_file_name(zf.name)
191217
except Exception:
192218
real_name = zf.name
193-
raw = zf.read()
194-
# 图片直接收集
195-
if is_image_name(real_name):
196-
image_id = str(uuid.uuid7())
197-
fmodel = File(
198-
id=image_id,
199-
file_name=os.path.basename(real_name),
200-
meta={'debug': False, 'content': raw}
201-
)
202-
image_mode_list.append(fmodel)
203-
continue
204219

205220
# 为 split_handle 提供可重复读取的 file-like 对象
206-
inner_file = io.BytesIO(raw)
207-
inner_file.name = real_name
208-
209-
# 尝试使用已注册的 split handle 的 get_content
210-
md_text = None
221+
zf.name = real_name
211222
for split_handle in split_handles:
212223
# 准备一个简单的 get_buffer 回调,返回当前 raw
213-
get_buffer = lambda f, _raw=raw: _raw
214-
if split_handle.support(inner_file, get_buffer):
215-
inner_file.seek(0)
216-
md_text = split_handle.get_content(inner_file, save_image)
224+
get_buffer = FileBufferHandle().get_buffer
225+
if split_handle.support(zf, get_buffer):
226+
row = get_buffer(zf)
227+
md_text = split_handle.get_content(io.BytesIO(row), save_image)
228+
file_content_list.append({'content': md_text, 'name': real_name})
217229
break
218-
219-
# 如果没有任何 split_handle 处理,按文本解码作为后备
220-
if md_text is None:
221-
enc = detect(raw).get('encoding') or 'utf-8'
222-
try:
223-
md_text = raw.decode(enc, errors='ignore')
224-
except Exception:
225-
md_text = raw.decode('utf-8', errors='ignore')
226-
227-
if isinstance(md_text, str) and md_text.strip():
228-
# 保存 md 文本与其所在的文件路径,后面统一做图片路径替换
229-
md_items.append((md_text, real_name))
230+
for file_content in file_content_list:
231+
_image_list, content = get_image_list_by_content(file_content.get('name'), file_content.get("content"),
232+
files)
233+
content_parts.append(content)
234+
for image in _image_list:
235+
image_list.append(image)
230236

231237
# 将收集到的图片通过回调保存(一次性)
232-
if image_mode_list:
238+
if image_list:
239+
image_mode_list = []
240+
for image in image_list:
241+
with zip_ref.open(image.get('source_file')) as f:
242+
i = File(
243+
id=image.get('image_id'),
244+
file_name=os.path.basename(image.get('source_file')),
245+
meta={'debug': False, 'content': f.read()} # 这里的content是二进制数据
246+
)
247+
image_mode_list.append(i)
233248
save_image(image_mode_list)
234249

235-
# 后处理:在每个 md 片段中将相对/绝对引用替换为已保存图片的 oss 路径
236-
content_parts = []
237-
for md_text, base_name in md_items:
238-
image_refs = parse_md_image(md_text)
239-
for image in image_refs:
240-
search = re.search(r"\(.*\)", image)
241-
if not search:
242-
continue
243-
source_image_path = search.group().strip("()").split(" ")[0]
244-
245-
# 规范化 zip 内部路径:若以 '/' 开头,视为相对于 zip 根,否则相对于 base_name 的目录
246-
if source_image_path.startswith('/'):
247-
joined = posixpath.normpath(source_image_path.lstrip('/'))
248-
else:
249-
base_dir = posixpath.dirname(base_name)
250-
joined = posixpath.normpath(posixpath.join(base_dir, source_image_path))
251-
252-
# 匹配已收集图片:以文件名做匹配(zip 中的文件名通常是不含反斜杠的 POSIX 风格)
253-
matched = None
254-
for img_model in image_mode_list:
255-
if img_model.file_name == posixpath.basename(joined):
256-
matched = img_model
257-
break
258-
259-
if matched:
260-
md_text = md_text.replace(source_image_path, f'./oss/file/{matched.id}')
261-
262-
content_parts.append(md_text)
263-
264250
return '\n\n'.join(content_parts)
265-
266-

0 commit comments

Comments
 (0)