"""Convert a .docx document to Markdown.

Approach:
- Use mammoth to convert the docx to HTML (good compatibility with Word styles).
- Use markdownify to convert that HTML to Markdown.
- Optionally export the document's embedded images to an assets directory and
  reference them from the Markdown by relative path.
"""

from __future__ import annotations

import argparse
import os
from pathlib import Path

# MIME type -> file extension used when naming exported images.
_IMAGE_EXTENSIONS = {
    "image/png": ".png",
    "image/jpeg": ".jpg",
    "image/jpg": ".jpg",
    "image/gif": ".gif",
    "image/bmp": ".bmp",
    "image/tiff": ".tiff",
    "image/webp": ".webp",
    "image/svg+xml": ".svg",
}


def main() -> int:
    """Command-line entry point: run the docx -> Markdown conversion.

    Positional arguments are the input ``docx_path`` and the output
    ``md_path``; ``--assets-dir`` optionally names the directory that
    receives the images embedded in the document.

    Returns:
        0 on success (used as the process exit status).

    Raises:
        FileNotFoundError: if ``docx_path`` does not point to an existing file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("docx_path")
    parser.add_argument("md_path")
    parser.add_argument("--assets-dir", default=None)
    args = parser.parse_args()

    docx_path = Path(args.docx_path).expanduser().resolve()
    md_path = Path(args.md_path).expanduser().resolve()
    assets_dir = (
        Path(args.assets_dir).expanduser().resolve() if args.assets_dir else None
    )

    # is_file() is already False for a missing path, so one check is enough.
    if not docx_path.is_file():
        raise FileNotFoundError(str(docx_path))

    # Imported lazily so that argument and path errors are reported even when
    # the third-party converters are not installed.
    import mammoth
    from markdownify import markdownify as md

    image_index = 0

    def _convert_image(image):
        """Write one embedded image to the assets dir; return its <img> attributes."""
        nonlocal image_index

        if assets_dir is None:
            # Images are not exported: return an empty src so the image data
            # is not inlined into the Markdown.
            return {"src": ""}

        assets_dir.mkdir(parents=True, exist_ok=True)
        ext = _guess_image_extension(image.content_type)
        image_index += 1
        target = assets_dir / f"image_{image_index:03d}{ext}"

        # mammoth's Python image objects expose open(), which yields a
        # file-like object; they have no read() method of their own.
        with image.open() as image_bytes:
            target.write_bytes(image_bytes.read())

        try:
            rel = os.path.relpath(target, md_path.parent)
        except ValueError:
            # On Windows relpath() fails when the two paths are on different
            # drives; fall back to the absolute location.
            return {"src": target.as_posix()}
        # Markdown links use forward slashes regardless of platform.
        return {"src": rel.replace("\\", "/")}

    # mammoth expects a binary file object rather than a path.
    with docx_path.open("rb") as docx_file:
        result = mammoth.convert_to_html(
            docx_file,
            convert_image=mammoth.images.img_element(_convert_image),
        )

    markdown = md(result.value, heading_style="ATX", bullets="-")

    md_path.parent.mkdir(parents=True, exist_ok=True)
    md_path.write_text(markdown, encoding="utf-8")
    return 0


def _guess_image_extension(content_type: str | None) -> str:
    """Infer a file extension (with leading dot) from an image MIME type.

    The lookup ignores case, surrounding whitespace and any ``;``-separated
    parameters. Returns an empty string when the type is missing or unknown.
    """
    if not content_type:
        return ""
    normalized = content_type.split(";", 1)[0].strip().lower()
    return _IMAGE_EXTENSIONS.get(normalized, "")


if __name__ == "__main__":
    raise SystemExit(main())