# Extracted from git patch de01a80cf7840d9620fa1e929214e8495354f33e
# From: WENQLUO
# Date: Fri, 13 Jun 2025 12:11:41 +0800
# Subject: [PATCH] update scripts
# Adds new file process_documents.py (135 insertions).
import os
import re

import fitz  # PyMuPDF
from azure.storage.blob import BlobServiceClient


def _require_env(name):
    """Return the value of environment variable *name*.

    Raises:
        RuntimeError: if the variable is unset or blank, so the script stops
            before doing any work instead of failing at the upload step.
    """
    value = os.environ.get(name, "").strip()
    if not value:
        raise RuntimeError(f"Environment variable {name} must be set")
    return value


# Paths and connection settings
pdf_path = r"用户手册.pdf"  # input PDF file path
output_pdf_dir = r"pdf"  # output directory for the split PDF files
md_folder = r"md"  # output directory for the Markdown files
image_folder = r"images"  # output directory for the extracted images
# SECURITY: the storage account key and the SAS token used to be hard-coded
# here. Secrets must not live in source control, so they are read from the
# environment instead. The previously committed key and SAS token remain in
# the repository history and should be rotated / revoked.
azure_connect_str = _require_env("AZURE_STORAGE_CONNECTION_STRING")
azure_container_name = "ai-carapp"
# URL prefix and suffix wrapped around each image file name in the Markdown.
url_prefix = "https://st0poc0test0ai0carapp.blob.core.chinacloudapi.cn/ai-carapp/"
# Full SAS query string, including the leading "?".
url_suffix = _require_env("AZURE_BLOB_SAS_SUFFIX")

# Create the required directories
os.makedirs(output_pdf_dir, exist_ok=True)
os.makedirs(md_folder, exist_ok=True)
os.makedirs(image_folder, exist_ok=True)
# 1. Split the PDF file
def _safe_pdf_stem(title):
    """Return *title* made safe for use as a file name (without extension)."""
    # Title text comes straight from the PDF; path separators or characters
    # reserved on Windows would redirect or break the save path.
    stem = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", title).strip(" .")
    return stem or "untitled"


def _find_large_titles(doc, fonts, size, size_tolerance=0.01):
    """Collect every text line whose spans are all set in the title style.

    Returns a list of ``{"title": str, "page_num": int}`` dicts in document
    order, where ``page_num`` is the 0-based page index.
    """
    found = []
    for page_num in range(len(doc)):
        blocks = doc.load_page(page_num).get_text("dict")["blocks"]
        for block in blocks:
            if block["type"] != 0:
                continue
            for line in block["lines"]:
                spans = line["spans"]
                # Font sizes are floats; compare with a tolerance rather than
                # exact equality so 24.0 stored as 23.9999 still matches.
                is_title = bool(spans) and all(
                    span["font"] in fonts
                    and abs(span["size"] - size) <= size_tolerance
                    for span in spans
                )
                if not is_title:
                    continue
                line_text = " ".join(span["text"].strip() for span in spans).strip()
                if line_text:
                    found.append({"title": line_text, "page_num": page_num})
    return found


def split_pdf_by_large_title(pdf_path, output_dir):
    """Split a PDF into one file per large title found in it.

    A large title is a text line in which every span uses one of the title
    fonts at size 24.0. Each title starts a section that runs up to the page
    before the next title (or to the end of the document), and the section is
    saved as ``<output_dir>/<title>.pdf``.

    Pages before the first title are not written anywhere, and nothing is
    written when no title is found. A title that wraps over several lines is
    detected once per line.

    Args:
        pdf_path: path of the PDF to split.
        output_dir: directory receiving the section PDFs; created if missing.
    """
    title_fonts = ("MicrosoftYaHei-Bold", "TimesNewRomanPS-BoldMT")
    title_size = 24.0

    os.makedirs(output_dir, exist_ok=True)
    doc = fitz.open(pdf_path)
    try:
        total_pages = len(doc)
        large_titles = _find_large_titles(doc, title_fonts, title_size)
        used_stems = {}

        for i, start_title in enumerate(large_titles):
            start_page = start_title["page_num"]
            if i + 1 < len(large_titles):
                next_page = large_titles[i + 1]["page_num"]
            else:
                next_page = total_pages
            # When the next title sits on the same page, next_page - 1 falls
            # before start_page; insert_pdf would then copy pages in reverse
            # order and pull in the previous page. Keep at least this page.
            end_page = max(next_page, start_page + 1)

            # Disambiguate repeated titles so a later section does not
            # silently overwrite an earlier one.
            stem = _safe_pdf_stem(start_title["title"])
            used_stems[stem] = used_stems.get(stem, 0) + 1
            if used_stems[stem] > 1:
                stem = f"{stem}_{used_stems[stem]}"
            new_pdf_path = os.path.join(output_dir, f"{stem}.pdf")

            new_doc = fitz.open()
            try:
                new_doc.insert_pdf(doc, from_page=start_page, to_page=end_page - 1)
                new_doc.save(new_pdf_path)
            finally:
                new_doc.close()
            print(f"已保存大标题 {start_title['title']} 从页面 {start_page + 1} 到 {end_page} 到文件 {new_pdf_path}")
    finally:
        doc.close()
# 2. Generate Markdown files
def generate_markdown_from_pdf(pdf_folder, md_folder, image_root=None):
    """Convert every PDF in *pdf_folder* to a Markdown file in *md_folder*.

    For ``<name>.pdf`` the extracted images are written to
    ``<image_root>/<name>/`` and the Markdown to ``<md_folder>/<name>.md``.
    Image links in the Markdown use ``<name>`` as their directory.

    Args:
        pdf_folder: directory scanned (non-recursively) for ``.pdf`` files.
        md_folder: directory receiving the Markdown files; created if missing.
        image_root: directory under which per-document image folders are
            created. Defaults to the module-level ``image_folder``.
    """
    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
    from magic_pdf.data.dataset import PymuDocDataset
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.config.enums import SupportedPdfParseMethod

    if image_root is None:
        image_root = image_folder
    os.makedirs(md_folder, exist_ok=True)

    for pdf_file_name in sorted(os.listdir(pdf_folder)):
        if not pdf_file_name.lower().endswith('.pdf'):
            continue
        pdf_file_path = os.path.join(pdf_folder, pdf_file_name)
        # splitext removes only the final extension. Splitting on the first
        # "." would turn "1.2 Overview.pdf" into "1" and make the outputs of
        # different documents collide.
        name_without_suff = os.path.splitext(pdf_file_name)[0]
        local_image_dir = os.path.join(image_root, name_without_suff)
        os.makedirs(local_image_dir, exist_ok=True)

        pdf_bytes = FileBasedDataReader("").read(pdf_file_path)
        ds = PymuDocDataset(pdf_bytes)

        # Documents classified as OCR go through the OCR pipeline; all
        # others use the text pipeline.
        use_ocr = ds.classify() == SupportedPdfParseMethod.OCR
        infer_result = ds.apply(doc_analyze, ocr=use_ocr)
        image_writer = FileBasedDataWriter(local_image_dir)
        if use_ocr:
            pipe_result = infer_result.pipe_ocr_mode(image_writer)
        else:
            pipe_result = infer_result.pipe_txt_mode(image_writer)

        md_content = pipe_result.get_markdown(os.path.basename(local_image_dir))
        md_file_path = os.path.join(md_folder, f"{name_without_suff}.md")
        with open(md_file_path, 'w', encoding='utf-8') as md_file:
            md_file.write(md_content)
        print(f"已生成 Markdown 文件: {md_file_path}")
# 3. Upload images to Azure Blob Storage
def upload_images_to_azure(image_folder, connect_str, container_name):
    """Upload every image found under *image_folder* to an Azure container.

    The folder is walked recursively and files ending in .png, .jpg, .jpeg or
    .gif (case-insensitive) are uploaded. Each blob is named after the bare
    file name, so same-named files from different sub-folders land on the
    same blob and the later upload overwrites the earlier one.

    Args:
        image_folder: root directory to scan for images.
        connect_str: Azure Storage connection string.
        container_name: name of the target container.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.gif')
    blob_service_client = BlobServiceClient.from_connection_string(connect_str)
    container_client = blob_service_client.get_container_client(container_name)

    for root, _, files in os.walk(image_folder):
        for file_name in files:
            if not file_name.lower().endswith(image_suffixes):
                continue
            local_file_path = os.path.join(root, file_name)
            # Flat naming is what update_md_images relies on: it rewrites
            # links using only the file name.
            blob_name = file_name
            blob_client = container_client.get_blob_client(blob_name)
            with open(local_file_path, "rb") as data:
                blob_client.upload_blob(data, overwrite=True)
            print(f"已上传图片: {blob_name}")


# 4. Rewrite the image URLs in the Markdown files
_MD_IMAGE_PATTERN = re.compile(r'!\[(.*?)\]\((.*?)\)')
_REMOTE_TARGET_PREFIXES = ("http://", "https://", "data:")


def update_md_images(md_folder, url_prefix, url_suffix):
    """Point local image links in every Markdown file at their uploaded URL.

    Each ``![alt](path)`` whose target is a local path is rewritten in place
    to ``![alt](<url_prefix><file name><url_suffix>)``. Targets that are
    already ``http(s)://`` or ``data:`` URLs are left untouched, so running
    the function again does not append the suffix a second time and external
    images are not redirected to the container.

    Args:
        md_folder: directory scanned (non-recursively) for ``.md`` files.
        url_prefix: text placed before the image file name.
        url_suffix: text placed after the image file name.
    """

    def replace_image_path(match):
        alt_text, image_path = match.group(1), match.group(2)
        if image_path.strip().lower().startswith(_REMOTE_TARGET_PREFIXES):
            return match.group(0)
        # Accept both separators so links written on Windows are handled
        # the same way on every platform.
        file_name = image_path.replace("\\", "/").rsplit("/", 1)[-1]
        return f"![{alt_text}]({url_prefix}{file_name}{url_suffix})"

    for md_file_name in sorted(os.listdir(md_folder)):
        if not md_file_name.lower().endswith('.md'):
            continue
        md_file_path = os.path.join(md_folder, md_file_name)
        with open(md_file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        updated_content = _MD_IMAGE_PATTERN.sub(replace_image_path, content)
        with open(md_file_path, 'w', encoding='utf-8') as file:
            file.write(updated_content)
        print(f"已更新 Markdown 文件: {md_file_path}")


# Run the pipeline
def main():
    """Run the four pipeline steps in order using the module-level settings."""
    split_pdf_by_large_title(pdf_path, output_pdf_dir)
    generate_markdown_from_pdf(output_pdf_dir, md_folder)
    upload_images_to_azure(image_folder, azure_connect_str, azure_container_name)
    update_md_images(md_folder, url_prefix, url_suffix)

    print("所有操作已完成!")


# Guarded so that importing this module does not start the whole pipeline.
if __name__ == "__main__":
    main()