SmartEDT/tools/docx_to_md.py

79 lines
2.5 KiB
Python
Raw Normal View History

from __future__ import annotations
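"""Convert a .docx document to Markdown.

Approach:
- Use mammoth to convert the docx to HTML (it copes reasonably well with Word styles).
- Use markdownify to convert that HTML to Markdown.
- Optionally export the images embedded in the document to an assets directory
  and reference them from the Markdown through relative paths.
"""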
import argparse
import os
from pathlib import Path
def main() -> int:
    """Command-line entry point: convert a .docx file to Markdown.

    Positional arguments are the source ``docx_path`` and the destination
    ``md_path``. The optional ``--assets-dir`` names a directory that receives
    the images embedded in the document; without it every image is emitted
    with an empty ``src`` so image bytes are never inlined into the Markdown.

    Returns:
        0 on success.

    Raises:
        FileNotFoundError: If ``docx_path`` does not exist or is not a
            regular file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("docx_path")
    parser.add_argument("md_path")
    parser.add_argument("--assets-dir", default=None)
    args = parser.parse_args()

    docx_path = Path(args.docx_path).expanduser().resolve()
    md_path = Path(args.md_path).expanduser().resolve()
    assets_dir = Path(args.assets_dir).expanduser().resolve() if args.assets_dir else None

    # is_file() is already False for a missing path, so one check covers both
    # "does not exist" and "is not a regular file".
    if not docx_path.is_file():
        raise FileNotFoundError(str(docx_path))

    # Third-party imports stay below the input check, so a missing input file
    # is reported before these packages are loaded.
    import mammoth
    from markdownify import markdownify as md

    image_index = 0

    def _convert_image(image):
        """Write one embedded image to the assets directory.

        Returns the attribute dict for the generated ``<img>`` element. Its
        ``src`` is the image path relative to the Markdown file's directory,
        written with forward slashes.
        """
        nonlocal image_index
        if assets_dir is None:
            # Images are not being exported: return an empty src so the image
            # content is not inlined into the Markdown.
            return {"src": ""}

        assets_dir.mkdir(parents=True, exist_ok=True)
        ext = _guess_image_extension(image.content_type)
        image_index += 1
        target = assets_dir / f"image_{image_index:03d}{ext}"

        # mammoth image objects expose open(), which yields a file-like
        # object; they do not have a read() method of their own.
        with image.open() as image_bytes:
            target.write_bytes(image_bytes.read())

        try:
            rel = os.path.relpath(target, md_path.parent)
        except ValueError:
            # On Windows relpath() raises ValueError when the two paths live
            # on different drives; fall back to the absolute path.
            rel = str(target)
        return {"src": rel.replace("\\", "/")}

    # mammoth expects a binary file object rather than a path.
    with docx_path.open("rb") as docx_file:
        result = mammoth.convert_to_html(
            docx_file,
            convert_image=mammoth.images.img_element(_convert_image),
        )

    markdown = md(result.value, heading_style="ATX", bullets="-")
    md_path.parent.mkdir(parents=True, exist_ok=True)
    md_path.write_text(markdown, encoding="utf-8")
    return 0
def _guess_image_extension(content_type: str) -> str:
"""根据图片的 MIME 类型推断文件扩展名。"""
mapping = {
"image/png": ".png",
"image/jpeg": ".jpg",
"image/jpg": ".jpg",
"image/gif": ".gif",
"image/bmp": ".bmp",
"image/tiff": ".tiff",
"image/webp": ".webp",
"image/svg+xml": ".svg",
}
return mapping.get(content_type, "")
if __name__ == "__main__":
    # Run the converter only when executed as a script and hand main()'s
    # return value to the interpreter as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)