SmartEDT/tools/docx_to_md.py

79 lines
2.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from __future__ import annotations
"""将 .docx 文档转换为 Markdown。
实现思路:
- 使用 mammoth 将 docx 转为 HTML对 Word 样式有较好兼容)
- 再使用 markdownify 将 HTML 转为 Markdown
- 可选导出文档内图片到 assets 目录,并在 Markdown 中引用相对路径
"""
import argparse
import os
from pathlib import Path
def main() -> int:
    """Command-line entry point: convert a .docx file to Markdown.

    Reads ``docx_path``, ``md_path`` and the optional ``--assets-dir`` from
    the command line, converts the document to HTML with mammoth, converts
    that HTML to Markdown with markdownify, and writes the result to
    ``md_path`` as UTF-8 (parent directories are created as needed).

    When ``--assets-dir`` is given, every embedded image is written there as
    ``image_NNN<ext>`` and referenced from the Markdown through a path
    relative to the Markdown file's directory. Without it, images are emitted
    with an empty ``src``.

    Returns:
        0 on success.

    Raises:
        FileNotFoundError: if ``docx_path`` does not exist or is not a
            regular file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("docx_path")
    parser.add_argument("md_path")
    parser.add_argument("--assets-dir", default=None)
    args = parser.parse_args()

    docx_path = Path(args.docx_path).expanduser().resolve()
    md_path = Path(args.md_path).expanduser().resolve()
    assets_dir = Path(args.assets_dir).expanduser().resolve() if args.assets_dir else None

    # is_file() is already False for a missing path, so one check covers both cases.
    if not docx_path.is_file():
        raise FileNotFoundError(str(docx_path))

    # Third-party imports are deferred until the arguments have been validated.
    import mammoth
    from markdownify import markdownify as md

    image_index = 0

    def _convert_image(image):
        """Write one embedded image to the assets directory and return its ``src``."""
        nonlocal image_index
        if assets_dir is None:
            # Not exporting images: return an empty src so the image data is
            # not inlined into the Markdown.
            return {"src": ""}
        assets_dir.mkdir(parents=True, exist_ok=True)
        ext = _guess_image_extension(image.content_type)
        image_index += 1
        target = assets_dir / f"image_{image_index:03d}{ext}"
        # mammoth's Python image objects are read through ``open()``, a context
        # manager yielding a binary stream; they have no ``read()`` method.
        with image.open() as image_bytes:
            target.write_bytes(image_bytes.read())
        rel = os.path.relpath(target, md_path.parent)
        # Markdown links use forward slashes, including on Windows.
        return {"src": rel.replace("\\", "/")}

    # mammoth's documented input is a file object opened in binary mode.
    with docx_path.open("rb") as docx_file:
        result = mammoth.convert_to_html(
            docx_file,
            convert_image=mammoth.images.img_element(_convert_image),
        )

    markdown = md(result.value, heading_style="ATX", bullets="-")
    md_path.parent.mkdir(parents=True, exist_ok=True)
    md_path.write_text(markdown, encoding="utf-8")
    return 0
def _guess_image_extension(content_type: str) -> str:
"""根据图片的 MIME 类型推断文件扩展名。"""
mapping = {
"image/png": ".png",
"image/jpeg": ".jpg",
"image/jpg": ".jpg",
"image/gif": ".gif",
"image/bmp": ".bmp",
"image/tiff": ".tiff",
"image/webp": ".webp",
"image/svg+xml": ".svg",
}
return mapping.get(content_type, "")
if __name__ == "__main__":
raise SystemExit(main())