parse_docx.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519
  1. #!/usr/bin/env python3
  2. """
  3. 解析 原文.docx 并将结构化内容插入数据库
  4. - 提取段落(含格式:字体、大小、粗体、斜体、颜色、对齐等)
  5. - 提取表格(含合并单元格)
  6. - 提取图片(转 base64)
  7. - 按文档顺序组装 blocks JSON
  8. - 插入到 node_properties.prop_json 作为附件的文档内容
  9. """
  10. import json
  11. import base64
  12. import re
  13. import sys
  14. import os
  15. from io import BytesIO
  16. from collections import OrderedDict
  17. import docx
  18. from docx import Document
  19. from docx.shared import Pt, Emu, Inches, Cm, Twips
  20. from docx.enum.text import WD_ALIGN_PARAGRAPH
  21. from docx.oxml.ns import qn
  22. from lxml import etree
  23. DOCX_PATH = os.path.join(os.path.dirname(__file__), "原文.docx")
  24. # ============================================================
  25. # 1. 图片提取
  26. # ============================================================
  27. def extract_images(doc):
  28. """提取文档中所有图片,返回 {rId: base64_data_uri}"""
  29. images = {}
  30. for rel_id, rel in doc.part.rels.items():
  31. if "image" in rel.reltype:
  32. blob = rel.target_part.blob
  33. # 判断图片类型
  34. target = rel.target_ref.lower()
  35. if target.endswith('.png'):
  36. mime = 'image/png'
  37. elif target.endswith('.jpg') or target.endswith('.jpeg'):
  38. mime = 'image/jpeg'
  39. elif target.endswith('.gif'):
  40. mime = 'image/gif'
  41. elif target.endswith('.bmp'):
  42. mime = 'image/bmp'
  43. elif target.endswith('.emf'):
  44. mime = 'image/x-emf'
  45. elif target.endswith('.wmf'):
  46. mime = 'image/x-wmf'
  47. else:
  48. mime = 'image/png'
  49. b64 = base64.b64encode(blob).decode('ascii')
  50. images[rel_id] = f"data:{mime};base64,{b64}"
  51. return images
  52. # ============================================================
  53. # 2. 段落/Run 格式提取
  54. # ============================================================
  55. def get_alignment(paragraph):
  56. """获取段落对齐方式"""
  57. align = paragraph.alignment
  58. if align is None:
  59. # 检查 pPr 中的 jc
  60. pPr = paragraph._element.find(qn('w:pPr'))
  61. if pPr is not None:
  62. jc = pPr.find(qn('w:jc'))
  63. if jc is not None:
  64. return jc.get(qn('w:val'))
  65. return None
  66. align_map = {
  67. WD_ALIGN_PARAGRAPH.LEFT: 'left',
  68. WD_ALIGN_PARAGRAPH.CENTER: 'center',
  69. WD_ALIGN_PARAGRAPH.RIGHT: 'right',
  70. WD_ALIGN_PARAGRAPH.JUSTIFY: 'justify',
  71. }
  72. return align_map.get(align, None)
  73. def get_paragraph_style_info(paragraph):
  74. """提取段落级别样式"""
  75. style_info = {}
  76. pf = paragraph.paragraph_format
  77. # 对齐
  78. alignment = get_alignment(paragraph)
  79. if alignment:
  80. style_info['alignment'] = alignment
  81. # 缩进
  82. if pf.left_indent:
  83. style_info['indentLeft'] = int(pf.left_indent) # EMU
  84. if pf.right_indent:
  85. style_info['indentRight'] = int(pf.right_indent)
  86. if pf.first_line_indent:
  87. val = int(pf.first_line_indent)
  88. if val > 0:
  89. style_info['indentFirstLine'] = val
  90. else:
  91. style_info['indentHanging'] = -val
  92. # 间距
  93. if pf.space_before:
  94. style_info['spacingBefore'] = int(pf.space_before)
  95. if pf.space_after:
  96. style_info['spacingAfter'] = int(pf.space_after)
  97. if pf.line_spacing:
  98. style_info['lineSpacing'] = float(pf.line_spacing)
  99. return style_info
  100. def get_run_format(run):
  101. """提取 Run 级别格式"""
  102. fmt = {}
  103. font = run.font
  104. if font.name:
  105. fmt['fontFamily'] = font.name
  106. if font.size:
  107. fmt['fontSize'] = font.size.pt
  108. if font.bold:
  109. fmt['bold'] = True
  110. if font.italic:
  111. fmt['italic'] = True
  112. if font.underline and font.underline is not True:
  113. fmt['underline'] = str(font.underline)
  114. elif font.underline is True:
  115. fmt['underline'] = 'single'
  116. if font.strike:
  117. fmt['strikeThrough'] = True
  118. if font.color and font.color.rgb:
  119. fmt['color'] = str(font.color.rgb)
  120. try:
  121. if font.highlight_color:
  122. fmt['highlightColor'] = str(font.highlight_color)
  123. except (ValueError, KeyError):
  124. pass # skip unsupported highlight values like 'none'
  125. # 上下标
  126. if font.superscript:
  127. fmt['verticalAlign'] = 'superscript'
  128. elif font.subscript:
  129. fmt['verticalAlign'] = 'subscript'
  130. return fmt
  131. def detect_paragraph_type(paragraph):
  132. """检测段落类型(标题、目录、正文等)"""
  133. style_name = paragraph.style.name if paragraph.style else ''
  134. if style_name.startswith('Heading') or style_name.startswith('heading'):
  135. level = re.search(r'\d+', style_name)
  136. if level:
  137. return f'heading{level.group()}'
  138. return 'heading1'
  139. if style_name.startswith('toc ') or style_name.startswith('TOC'):
  140. level = re.search(r'\d+', style_name)
  141. lvl = level.group() if level else '1'
  142. return f'toc{lvl}'
  143. if style_name.startswith('List'):
  144. return 'list_item'
  145. # 检查是否是标题样式(通过 run 格式推断)
  146. text = paragraph.text.strip()
  147. if text and paragraph.runs:
  148. first_run = paragraph.runs[0]
  149. if first_run.font.bold and first_run.font.size:
  150. size_pt = first_run.font.size.pt
  151. if size_pt >= 18:
  152. return 'heading1'
  153. elif size_pt >= 16:
  154. return 'heading2'
  155. elif size_pt >= 14:
  156. # 检查是否是章节标题(如 "1 企业概述")
  157. if re.match(r'^\d+(\.\d+)*\s', text):
  158. dots = text.split()[0].count('.')
  159. if dots == 0:
  160. return 'heading1'
  161. elif dots == 1:
  162. return 'heading2'
  163. else:
  164. return 'heading3'
  165. return 'paragraph'
  166. # ============================================================
  167. # 3. 检查段落中的内联图片
  168. # ============================================================
  169. def get_paragraph_images(paragraph, images_map):
  170. """检查段落中是否包含内联图片,返回图片列表"""
  171. inline_images = []
  172. for run in paragraph.runs:
  173. run_xml = run._element
  174. drawings = run_xml.findall(qn('w:drawing'))
  175. for drawing in drawings:
  176. # 查找 blip (图片引用)
  177. blips = drawing.findall('.//' + qn('a:blip'))
  178. for blip in blips:
  179. embed = blip.get(qn('r:embed'))
  180. if embed and embed in images_map:
  181. # 获取图片尺寸
  182. extent = drawing.find('.//' + qn('wp:extent'))
  183. width = height = None
  184. if extent is not None:
  185. cx = extent.get('cx')
  186. cy = extent.get('cy')
  187. if cx:
  188. width = int(cx) / 914400 # EMU to inches, then to px approx
  189. if cy:
  190. height = int(cy) / 914400
  191. inline_images.append({
  192. 'rId': embed,
  193. 'src': images_map[embed],
  194. 'widthInch': round(width, 2) if width else None,
  195. 'heightInch': round(height, 2) if height else None,
  196. })
  197. return inline_images
  198. # ============================================================
  199. # 4. 表格提取
  200. # ============================================================
  201. def parse_table(table):
  202. """解析表格为结构化数据"""
  203. rows_data = []
  204. for row in table.rows:
  205. cells_data = []
  206. for cell in row.cells:
  207. cell_text = cell.text.strip()
  208. # 检查合并
  209. tc = cell._tc
  210. grid_span = tc.find(qn('w:tcPr'))
  211. colspan = 1
  212. if grid_span is not None:
  213. gs = grid_span.find(qn('w:gridSpan'))
  214. if gs is not None:
  215. colspan = int(gs.get(qn('w:val'), 1))
  216. # 单元格内段落格式
  217. paras = []
  218. for p in cell.paragraphs:
  219. runs = []
  220. for r in p.runs:
  221. run_data = {'text': r.text}
  222. fmt = get_run_format(r)
  223. if fmt:
  224. run_data['format'] = fmt
  225. runs.append(run_data)
  226. if runs:
  227. para_data = {'runs': runs}
  228. align = get_alignment(p)
  229. if align:
  230. para_data['alignment'] = align
  231. paras.append(para_data)
  232. cell_data = {
  233. 'text': cell_text,
  234. 'colspan': colspan,
  235. }
  236. if paras:
  237. cell_data['paragraphs'] = paras
  238. cells_data.append(cell_data)
  239. rows_data.append(cells_data)
  240. return {
  241. 'rows': len(table.rows),
  242. 'cols': len(table.columns),
  243. 'data': rows_data,
  244. }
  245. # ============================================================
  246. # 5. 按文档 XML 顺序遍历(段落+表格交错)
  247. # ============================================================
  248. def iter_block_items(doc):
  249. """
  250. 按文档 body 中的顺序迭代段落和表格。
  251. 返回 (type, item) 元组,type 为 'paragraph' 或 'table'。
  252. """
  253. body = doc.element.body
  254. for child in body:
  255. if child.tag == qn('w:p'):
  256. yield ('paragraph', docx.text.paragraph.Paragraph(child, doc))
  257. elif child.tag == qn('w:tbl'):
  258. yield ('table', docx.table.Table(child, doc))
  259. elif child.tag == qn('w:sectPr'):
  260. pass # section properties, skip
  261. # ============================================================
  262. # 6. 主解析函数
  263. # ============================================================
  264. def parse_document(docx_path):
  265. """解析 Word 文档,返回结构化 JSON"""
  266. doc = Document(docx_path)
  267. # 提取所有图片
  268. images_map = extract_images(doc)
  269. print(f" 提取图片: {len(images_map)} 张")
  270. # 页面设置
  271. section = doc.sections[0]
  272. page_info = {
  273. 'widthMm': round(section.page_width.mm, 1) if section.page_width else 210,
  274. 'heightMm': round(section.page_height.mm, 1) if section.page_height else 297,
  275. 'marginTopMm': round(section.top_margin.mm, 1) if section.top_margin else 25.4,
  276. 'marginBottomMm': round(section.bottom_margin.mm, 1) if section.bottom_margin else 25.4,
  277. 'marginLeftMm': round(section.left_margin.mm, 1) if section.left_margin else 31.8,
  278. 'marginRightMm': round(section.right_margin.mm, 1) if section.right_margin else 31.8,
  279. }
  280. blocks = []
  281. block_id = 0
  282. for block_type, item in iter_block_items(doc):
  283. if block_type == 'paragraph':
  284. paragraph = item
  285. text = paragraph.text
  286. # 检测段落类型
  287. para_type = detect_paragraph_type(paragraph)
  288. # 段落样式
  289. style_info = get_paragraph_style_info(paragraph)
  290. # 检查内联图片
  291. inline_imgs = get_paragraph_images(paragraph, images_map)
  292. # 提取 runs
  293. runs = []
  294. for r in paragraph.runs:
  295. run_text = r.text
  296. if not run_text:
  297. continue
  298. run_data = {'text': run_text}
  299. fmt = get_run_format(r)
  300. if fmt:
  301. run_data.update(fmt)
  302. runs.append(run_data)
  303. block = {
  304. 'id': f'b{block_id}',
  305. 'type': para_type,
  306. }
  307. if runs:
  308. block['runs'] = runs
  309. if style_info:
  310. block['style'] = style_info
  311. if inline_imgs:
  312. block['images'] = inline_imgs
  313. # 即使空段落也保留(用于间距)
  314. if not runs and not inline_imgs:
  315. block['runs'] = [{'text': ''}]
  316. blocks.append(block)
  317. block_id += 1
  318. elif block_type == 'table':
  319. table_data = parse_table(item)
  320. block = {
  321. 'id': f'b{block_id}',
  322. 'type': 'table',
  323. 'table': table_data,
  324. }
  325. blocks.append(block)
  326. block_id += 1
  327. print(f" 解析完成: {len(blocks)} 个块")
  328. return {
  329. 'page': page_info,
  330. 'blocks': blocks,
  331. 'totalBlocks': len(blocks),
  332. }
  333. # ============================================================
  334. # 7. 插入数据库
  335. # ============================================================
  336. def insert_to_db(doc_json):
  337. """将解析后的文档内容插入数据库"""
  338. import psycopg2
  339. conn = psycopg2.connect(
  340. host='127.0.0.1',
  341. port=5432,
  342. dbname='lingyue_zhibao',
  343. user='postgres',
  344. password='postgres'
  345. )
  346. cur = conn.cursor()
  347. try:
  348. # 1. 创建附件节点 (原文.docx)
  349. att_node_key = 'ATT-2024-003'
  350. att_name = '原文-复审报告'
  351. # 检查是否已存在
  352. cur.execute("SELECT id FROM nodes WHERE node_key = %s AND node_type = 'ATTACHMENT'", (att_node_key,))
  353. row = cur.fetchone()
  354. if row:
  355. att_id = row[0]
  356. print(f" 附件节点已存在: id={att_id}")
  357. # 更新 doc_content 属性
  358. cur.execute("""
  359. INSERT INTO node_properties (node_id, prop_key, prop_json)
  360. VALUES (%s, 'doc_content', %s::jsonb)
  361. ON CONFLICT (node_id, prop_key)
  362. DO UPDATE SET prop_json = EXCLUDED.prop_json, updated_at = now()
  363. """, (att_id, json.dumps(doc_json, ensure_ascii=False)))
  364. else:
  365. # 获取下一个 id
  366. cur.execute("SELECT COALESCE(MAX(id), 0) + 1 FROM nodes WHERE id >= 400 AND id < 500")
  367. att_id = cur.fetchone()[0]
  368. if att_id < 402:
  369. att_id = 402
  370. cur.execute("""
  371. INSERT INTO nodes (id, node_type, node_key, name, status, created_by, created_at, updated_at)
  372. VALUES (%s, 'ATTACHMENT', %s, %s, 'active', 1, now(), now())
  373. """, (att_id, att_node_key, att_name))
  374. print(f" 创建附件节点: id={att_id}")
  375. # 插入基本属性
  376. cur.execute("""
  377. INSERT INTO node_properties (node_id, prop_key, prop_value) VALUES
  378. (%s, 'display_name', '原文.docx'),
  379. (%s, 'file_type', 'docx'),
  380. (%s, 'file_size', '3538608')
  381. """, (att_id, att_id, att_id))
  382. # 插入文档内容
  383. cur.execute("""
  384. INSERT INTO node_properties (node_id, prop_key, prop_json)
  385. VALUES (%s, 'doc_content', %s::jsonb)
  386. """, (att_id, json.dumps(doc_json, ensure_ascii=False)))
  387. # 创建边:PROJECT -> ATTACHMENT
  388. cur.execute("""
  389. INSERT INTO edges (from_node_id, to_node_id, edge_type)
  390. SELECT 10, %s, 'HAS_ATTACHMENT'
  391. WHERE NOT EXISTS (
  392. SELECT 1 FROM edges WHERE from_node_id = 10 AND to_node_id = %s AND edge_type = 'HAS_ATTACHMENT'
  393. )
  394. """, (att_id, att_id))
  395. conn.commit()
  396. print(f" 数据库插入成功")
  397. except Exception as e:
  398. conn.rollback()
  399. print(f" 数据库错误: {e}")
  400. raise
  401. finally:
  402. cur.close()
  403. conn.close()
  404. # ============================================================
  405. # Main
  406. # ============================================================
  407. if __name__ == '__main__':
  408. print("=" * 60)
  409. print("解析 原文.docx ...")
  410. print("=" * 60)
  411. doc_json = parse_document(DOCX_PATH)
  412. # 保存 JSON 到文件(调试用,不含 base64 图片)
  413. debug_json = json.loads(json.dumps(doc_json))
  414. # 统计图片大小
  415. total_img_size = 0
  416. img_count = 0
  417. for block in debug_json['blocks']:
  418. if 'images' in block:
  419. for img in block['images']:
  420. if 'src' in img:
  421. total_img_size += len(img['src'])
  422. img_count += 1
  423. print(f" 图片总大小(base64): {total_img_size / 1024 / 1024:.1f} MB ({img_count} 张)")
  424. print(f" JSON 总大小: {len(json.dumps(doc_json, ensure_ascii=False)) / 1024 / 1024:.1f} MB")
  425. print("\n插入数据库...")
  426. insert_to_db(doc_json)
  427. # 保存精简版 JSON(不含 base64)用于调试
  428. for block in debug_json['blocks']:
  429. if 'images' in block:
  430. for img in block['images']:
  431. if 'src' in img:
  432. img['src'] = img['src'][:50] + '...[truncated]'
  433. debug_path = os.path.join(os.path.dirname(__file__), "parsed_doc_debug.json")
  434. with open(debug_path, 'w', encoding='utf-8') as f:
  435. json.dump(debug_json, f, ensure_ascii=False, indent=2)
  436. print(f"\n调试 JSON 已保存: {debug_path}")
  437. print("完成!")