update scripts
This commit is contained in:
commit
de01a80cf7
135
process_documents.py
Normal file
135
process_documents.py
Normal file
@ -0,0 +1,135 @@
|
||||
import os
|
||||
import fitz # PyMuPDF
|
||||
from azure.storage.blob import BlobServiceClient
|
||||
import re
|
||||
|
||||
# Paths and connection settings
pdf_path = r"用户手册.pdf"  # input PDF file path
output_pdf_dir = r"pdf"  # output directory for the split PDF files
md_folder = r"md"  # output directory for the generated Markdown files
image_folder = r"images"  # output directory for the extracted images

# Credentials must never be committed to source control, so they are read from
# the environment instead:
#   AZURE_STORAGE_CONNECTION_STRING - storage account connection string
#   AZURE_BLOB_SAS_TOKEN            - SAS query string appended to image URLs
# NOTE(review): the account key and SAS signature that used to be hard-coded
# here should be treated as leaked and rotated.
azure_connect_str = os.environ.get("AZURE_STORAGE_CONNECTION_STRING", "")
azure_container_name = "ai-carapp"

# Prefix and suffix placed around each image's blob name to build its URL
url_prefix = "https://st0poc0test0ai0carapp.blob.core.chinacloudapi.cn/ai-carapp/"
url_suffix = os.environ.get("AZURE_BLOB_SAS_TOKEN", "")
if url_suffix and not url_suffix.startswith("?"):
    # Accept a token given without the leading "?" so the final URL stays valid.
    url_suffix = "?" + url_suffix

# Create the required output directories
os.makedirs(output_pdf_dir, exist_ok=True)
os.makedirs(md_folder, exist_ok=True)
os.makedirs(image_folder, exist_ok=True)
|
||||
|
||||
# 1. Split the PDF file
def _safe_file_stem(title):
    """Return ``title`` with characters that are invalid in file names replaced by ``_``."""
    stem = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", title).strip(" .")
    return stem or "untitled"


def _find_large_titles(doc, fonts, size):
    """Collect ``{"title", "page_num"}`` entries for every line set entirely in a title font."""
    titles = []
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        for block in page.get_text("dict")["blocks"]:
            if block["type"] != 0:  # only type-0 blocks are scanned for titles
                continue
            for line in block["lines"]:
                spans = line["spans"]
                # A line counts as a title only if it has spans and every one of
                # them uses a title font at the title size.  The size check uses
                # a small tolerance instead of exact float equality.
                is_title = bool(spans) and all(
                    span["font"] in fonts and abs(span["size"] - size) < 0.01
                    for span in spans
                )
                if not is_title:
                    continue
                line_text = " ".join(span["text"].strip() for span in spans).strip()
                if line_text:
                    titles.append({"title": line_text, "page_num": page_num})
    return titles


def split_pdf_by_large_title(pdf_path, output_dir):
    """Split the PDF at ``pdf_path`` into one PDF per large title.

    A "large title" is a text line whose spans all use one of the title fonts
    at size 24.0.  Each section runs from its title's page up to the page
    before the next title (or to the end of the document) and is saved in
    ``output_dir`` under a file name derived from the title.

    Args:
        pdf_path: Path of the PDF to split.
        output_dir: Existing directory that receives the section PDFs.
    """
    title_fonts = {"MicrosoftYaHei-Bold", "TimesNewRomanPS-BoldMT"}
    title_size = 24.0

    doc = fitz.open(pdf_path)
    try:
        total_pages = len(doc)
        large_titles = _find_large_titles(doc, title_fonts, title_size)
        next_starts = [entry["page_num"] for entry in large_titles[1:]] + [total_pages]
        used_stems = {}

        for start_title, next_start in zip(large_titles, next_starts):
            start_page = start_title["page_num"]
            # When two titles share a page, next_start - 1 falls before
            # start_page; insert_pdf would then copy pages in reverse order,
            # so clamp the range to at least the title's own page.
            last_page = max(start_page, next_start - 1)

            # Titles may contain path separators or repeat; make the file name
            # safe and unique so sections never fail to save or overwrite
            # each other.
            stem = _safe_file_stem(start_title["title"])
            occurrence = used_stems.get(stem, 0) + 1
            used_stems[stem] = occurrence
            if occurrence > 1:
                stem = f"{stem}_{occurrence}"
            new_pdf_path = os.path.join(output_dir, f"{stem}.pdf")

            new_doc = fitz.open()
            try:
                new_doc.insert_pdf(doc, from_page=start_page, to_page=last_page)
                new_doc.save(new_pdf_path)
            finally:
                new_doc.close()
            print(f"已保存大标题 {start_title['title']} 从页面 {start_page + 1} 到 {last_page + 1} 到文件 {new_pdf_path}")
    finally:
        # Release the source document even if a section fails to save.
        doc.close()
|
||||
|
||||
# 2. Generate Markdown files
def generate_markdown_from_pdf(pdf_folder, md_folder, image_root=None):
    """Convert every PDF in ``pdf_folder`` to a Markdown file in ``md_folder``.

    Each PDF is classified by magic_pdf and run through either its OCR or its
    text pipeline.  Images produced by the pipeline are written to a
    per-document sub-directory of ``image_root``, and that sub-directory's
    name is passed to ``get_markdown`` as the image directory.

    Args:
        pdf_folder: Directory containing the PDFs to convert.
        md_folder: Existing directory that receives the ``.md`` files.
        image_root: Directory under which per-document image folders are
            created.  Defaults to the module-level ``image_folder``.
    """
    # Imported lazily so the rest of the script does not need magic_pdf loaded.
    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
    from magic_pdf.data.dataset import PymuDocDataset
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.config.enums import SupportedPdfParseMethod

    if image_root is None:
        image_root = image_folder

    reader = FileBasedDataReader("")

    for pdf_file_name in os.listdir(pdf_folder):
        if not pdf_file_name.lower().endswith('.pdf'):
            continue

        pdf_file_path = os.path.join(pdf_folder, pdf_file_name)
        # splitext removes only the final extension; splitting on the first "."
        # would truncate names such as "1.2 Overview.pdf" to "1" and make
        # different documents collide.
        name_without_suff = os.path.splitext(pdf_file_name)[0]
        local_image_dir = os.path.join(image_root, name_without_suff)
        os.makedirs(local_image_dir, exist_ok=True)

        pdf_bytes = reader.read(pdf_file_path)
        ds = PymuDocDataset(pdf_bytes)

        use_ocr = ds.classify() == SupportedPdfParseMethod.OCR
        infer_result = ds.apply(doc_analyze, ocr=use_ocr)
        image_writer = FileBasedDataWriter(local_image_dir)
        if use_ocr:
            pipe_result = infer_result.pipe_ocr_mode(image_writer)
        else:
            pipe_result = infer_result.pipe_txt_mode(image_writer)

        md_content = pipe_result.get_markdown(os.path.basename(local_image_dir))
        md_file_path = os.path.join(md_folder, f"{name_without_suff}.md")
        with open(md_file_path, 'w', encoding='utf-8') as md_file:
            md_file.write(md_content)
        print(f"已生成 Markdown 文件: {md_file_path}")
|
||||
|
||||
# 3. Upload images to Azure Blob Storage
def upload_images_to_azure(image_folder, connect_str, container_name):
    """Upload every image found under ``image_folder`` to an Azure blob container.

    The folder is walked recursively.  Files ending in .png, .jpg, .jpeg or
    .gif (case-insensitive) are uploaded with their bare file name as the blob
    name, overwriting any blob that already has that name.

    Args:
        image_folder: Root directory to scan for images.
        connect_str: Azure Storage connection string.
        container_name: Name of the target container.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.gif')
    service = BlobServiceClient.from_connection_string(connect_str)
    container = service.get_container_client(container_name)

    for current_dir, _, names in os.walk(image_folder):
        for name in names:
            if not name.lower().endswith(image_extensions):
                continue
            source_path = os.path.join(current_dir, name)
            # The blob name is the file name only; sub-directories are dropped.
            target = container.get_blob_client(name)
            with open(source_path, "rb") as stream:
                target.upload_blob(stream, overwrite=True)
            print(f"已上传图片: {name}")
|
||||
|
||||
# 4. Rewrite the image URLs inside the Markdown files
def update_md_images(md_folder, url_prefix, url_suffix):
    """Point every Markdown image link in ``md_folder`` at its uploaded blob.

    Each ``![alt](path)`` link is rewritten in place to
    ``![alt](<url_prefix><file name><url_suffix>)``.  Only the file name of
    ``path`` is kept, matching the upload step, which names blobs by bare
    file name.  Links that already start with ``http://`` or ``https://`` are
    left untouched so the function can be re-run safely.

    Args:
        md_folder: Directory containing the ``.md`` files to update.
        url_prefix: Text placed before the image file name.
        url_suffix: Text placed after the image file name (e.g. a SAS query).
    """
    image_link = re.compile(r'!\[(.*?)\]\((.*?)\)')

    def replace_image_path(match):
        alt_text, image_path = match.group(1), match.group(2)
        if image_path.startswith(("http://", "https://")):
            return match.group(0)
        # Handle both "/" and "\" separators regardless of the host OS.
        blob_name = image_path.replace("\\", "/").rsplit("/", 1)[-1]
        return f"![{alt_text}]({url_prefix}{blob_name}{url_suffix})"

    for md_file_name in os.listdir(md_folder):
        if not md_file_name.lower().endswith('.md'):
            continue

        md_file_path = os.path.join(md_folder, md_file_name)
        with open(md_file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        updated_content = image_link.sub(replace_image_path, content)

        with open(md_file_path, 'w', encoding='utf-8') as file:
            file.write(updated_content)
        print(f"已更新 Markdown 文件: {md_file_path}")
|
||||
|
||||
# Run the pipeline: split the PDF, convert each part to Markdown, upload the
# extracted images, then rewrite the Markdown image links.
for pipeline_step, step_args in (
    (split_pdf_by_large_title, (pdf_path, output_pdf_dir)),
    (generate_markdown_from_pdf, (output_pdf_dir, md_folder)),
    (upload_images_to_azure, (image_folder, azure_connect_str, azure_container_name)),
    (update_md_images, (md_folder, url_prefix, url_suffix)),
):
    pipeline_step(*step_args)

print("所有操作已完成!")
|
||||
Loading…
x
Reference in New Issue
Block a user