79 lines
2.5 KiB
Python
79 lines
2.5 KiB
Python
|
|
from __future__ import annotations
|
|||
|
|
|
|||
|
|
"""Convert a .docx document to Markdown.

Approach:
- Use mammoth to convert the docx to HTML (it handles Word styles well).
- Then use markdownify to convert the HTML to Markdown.
- Optionally export the images embedded in the document to an assets
  directory and reference them from the Markdown by relative path.
"""
|
|||
|
|
|
|||
|
|
import argparse
|
|||
|
|
import os
|
|||
|
|
from pathlib import Path
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main() -> int:
    """Command-line entry point: convert a .docx file to Markdown.

    Positional arguments are the input ``docx_path`` and the output
    ``md_path``. The optional ``--assets-dir`` names a directory that receives
    the images embedded in the document; without it, images are emitted with
    an empty ``src``.

    Returns:
        0 on success.

    Raises:
        FileNotFoundError: if ``docx_path`` is not an existing regular file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("docx_path")
    parser.add_argument("md_path")
    parser.add_argument("--assets-dir", default=None)
    args = parser.parse_args()

    docx_path = Path(args.docx_path).expanduser().resolve()
    md_path = Path(args.md_path).expanduser().resolve()
    assets_dir = (
        Path(args.assets_dir).expanduser().resolve() if args.assets_dir else None
    )

    # is_file() is already False for a path that does not exist, so a
    # separate exists() check is unnecessary.
    if not docx_path.is_file():
        raise FileNotFoundError(str(docx_path))

    # The third-party converters are imported only after the arguments and the
    # input path have been validated.
    import mammoth
    from markdownify import markdownify as md

    image_index = 0

    def _convert_image(image) -> dict[str, str]:
        """Write one embedded image to the assets directory.

        Returns the attributes for the generated ``<img>`` element: ``src`` is
        the image path relative to the Markdown file's directory, or an empty
        string when no assets directory was requested.
        """
        nonlocal image_index
        if assets_dir is None:
            # When images are not exported, return an empty src so the image
            # data is not inlined into the Markdown.
            return {"src": ""}

        assets_dir.mkdir(parents=True, exist_ok=True)
        ext = _guess_image_extension(image.content_type)
        image_index += 1
        target = assets_dir / f"image_{image_index:03d}{ext}"

        # mammoth's Python image objects expose open() (returning a file-like
        # object) rather than a read() method.
        with image.open() as image_bytes, target.open("wb") as out:
            out.write(image_bytes.read())

        try:
            rel = os.path.relpath(target, md_path.parent)
        except ValueError:
            # On Windows relpath() raises when the two paths live on different
            # drives; fall back to the absolute path.
            rel = str(target)
        # Markdown image references use forward slashes.
        return {"src": rel.replace("\\", "/")}

    # mammoth documents a binary file object as its input.
    with docx_path.open("rb") as docx_file:
        result = mammoth.convert_to_html(
            docx_file,
            convert_image=mammoth.images.img_element(_convert_image),
        )
    html = result.value
    markdown = md(html, heading_style="ATX", bullets="-")

    md_path.parent.mkdir(parents=True, exist_ok=True)
    md_path.write_text(markdown, encoding="utf-8")
    return 0
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _guess_image_extension(content_type: str) -> str:
|
|||
|
|
"""根据图片的 MIME 类型推断文件扩展名。"""
|
|||
|
|
mapping = {
|
|||
|
|
"image/png": ".png",
|
|||
|
|
"image/jpeg": ".jpg",
|
|||
|
|
"image/jpg": ".jpg",
|
|||
|
|
"image/gif": ".gif",
|
|||
|
|
"image/bmp": ".bmp",
|
|||
|
|
"image/tiff": ".tiff",
|
|||
|
|
"image/webp": ".webp",
|
|||
|
|
"image/svg+xml": ".svg",
|
|||
|
|
}
|
|||
|
|
return mapping.get(content_type, "")
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # Run the CLI and hand main()'s return value to the interpreter as the
    # process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|