import os

from docx import Document
from docx.shared import Inches
from docx.table import Table
from docx.text.paragraph import Paragraph
# 可能需要导入其他模块,如处理 XML 或调用 API
# --- Configuration ---
# Directory that is scanned for source .docx files.
SOURCE_DOCS_DIR = 'files/transform/docx/source_documents'
# Template and output locations (not referenced by the code visible here).
TARGET_TEMPLATE = 'files/transform/docx/template.docx'
OUTPUT_DOC_PATH = 'files/transform/docx/generated_document.docx'

# Per-file extraction rules, keyed by source file name.
# These are example rules; adapt them to the real documents.
EXTRACTION_RULES = {
    # Extract what lies between two headings.
    'source_doc_1.docx': {
        'heading_start': 'Chapter 3',
        'heading_end': 'Chapter 4',
    },
    # Extract paragraphs that use a given style.
    'source_doc_2.docx': {
        'style_name': 'SourceHighlight',
    },
    # ... rules for further documents
}
# --- 辅助函数 (示例) ---
def should_extract_paragraph(paragraph, rules):
    """Return True if *paragraph* should be extracted on its own merits.

    Only the style-based rule is implemented here: the paragraph matches when
    ``rules`` has a ``'style_name'`` entry equal to the paragraph's style name.
    Heading-range rules (``'heading_start'`` / ``'heading_end'``) need state
    across paragraphs and are therefore not evaluated in this function.

    Args:
        paragraph: Object exposing ``style.name`` (a python-docx paragraph).
        rules: Extraction rules for the current source file.

    Returns:
        True when the style rule matches, False otherwise (the default is
        "do not extract").
    """
    # No style rule configured: nothing in this function can match.
    if 'style_name' not in rules:
        return False
    return paragraph.style.name == rules['style_name']
def extract_content_from_doc(source_path, rules):
    """Extract the content selected by *rules* from a single source document.

    The direct children of the document body are visited in order, so
    paragraphs and tables keep their relative order in the result.

    Args:
        source_path: Path of the .docx file to open.
        rules: Extraction rules for this file. ``'heading_start'`` and
            ``'heading_end'`` are text fragments; a paragraph whose style name
            starts with ``'Heading'`` and whose text contains the fragment
            opens / closes the extraction zone. Any other rule is delegated
            to ``should_extract_paragraph``.

    Returns:
        A list of dicts in document order. Paragraphs are
        ``{'type': 'paragraph', 'text': ..., 'style': ...}`` and tables are
        ``{'type': 'table', 'data': [[cell_text, ...], ...]}``. If anything
        fails, the error is printed and the elements collected so far are
        returned (an empty list when the document cannot be opened).
    """
    extracted_elements = []
    try:
        heading_start = rules.get('heading_start')
        heading_end = rules.get('heading_end')
        source_doc = Document(source_path)
        # True while we are between the start heading and the end heading.
        in_extraction_zone = False
        for element in source_doc.element.body:
            tag = element.tag
            if not isinstance(tag, str):
                # XML comments / processing instructions carry a non-string
                # tag; they hold no document content.
                continue

            # Match on '}<local-name>' so that only the exact element name is
            # accepted, not any tag that merely ends with the same letters.
            if tag.endswith('}p'):
                paragraph = Paragraph(element, source_doc)
                text_content = paragraph.text
                style_name = paragraph.style.name
                is_heading = style_name.startswith('Heading')

                if heading_start is not None and is_heading and heading_start in text_content:
                    in_extraction_zone = True
                    continue  # the start heading itself is not extracted
                if heading_end is not None and is_heading and heading_end in text_content:
                    in_extraction_zone = False
                    continue  # end heading reached: stop extracting

                if in_extraction_zone or should_extract_paragraph(paragraph, rules):
                    # Only plain text is captured. Run-level formatting
                    # (bold, italic) would require walking paragraph.runs.
                    # TODO: extract images (drawings inside the runs; the
                    #       image bytes must be resolved through the run's
                    #       relationship id to the package's image part).
                    # TODO: extract formulas.
                    extracted_elements.append({
                        'type': 'paragraph',
                        'text': text_content,
                        'style': style_name,  # source style name, for reference
                    })
            elif tag.endswith('}tbl'):
                table = Table(element, source_doc)
                # TODO: tables are currently extracted unconditionally; decide
                #       whether they should honour in_extraction_zone / rules.
                table_data = [[cell.text for cell in row.cells] for row in table.rows]
                extracted_elements.append({'type': 'table', 'data': table_data})
    except Exception as e:
        # Best effort: report the problem and keep whatever was collected.
        print(f"Error processing {source_path}: {e}")
    return extracted_elements
# --- Main flow ---
# Collect the extracted elements of every source document that has rules.
# NOTE(review): kept at module level with the original global names, since
# later code presumably consumes all_extracted_content -- confirm.
all_extracted_content = []
# os.listdir() returns entries in arbitrary order; sort them so the merged
# content is assembled in the same order on every run and platform.
source_files = sorted(f for f in os.listdir(SOURCE_DOCS_DIR) if f.endswith('.docx'))
for filename in source_files:
    source_path = os.path.join(SOURCE_DOCS_DIR, filename)
    rules = EXTRACTION_RULES.get(filename, {})  # extraction rules for this file
    if not rules:
        # Only files with defined rules are processed.
        print(f"Skipping {filename}, no rules defined.")
        continue
    print(f"Extracting from: {filename}")
    content = extract_content_from_doc(source_path, rules)
    all_extracted_content.extend(content)
print(f"Total elements extracted: {len(all_extracted_content)}")