excel-toolkit/scripts/translate_excel.py

403 lines
12 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Excel 文件中文→英文翻译工具
使用 Google Gemini Flash Lite API 进行翻译
流程:
1. 提取:收集所有中文内容及其位置映射
2. 翻译:批量翻译所有中文内容
3. 应用:将翻译结果写入新 Excel 文件
"""
from __future__ import annotations
import argparse
import json
import re
import sys
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
import sys
sys.path.insert(0, str(Path(__file__).parent))
from load_env import load_env
load_env() # 加载 .env 文件
try:
import google.generativeai as genai # type: ignore
except ImportError as exc:
raise RuntimeError(
"缺少依赖 google-generativeai请先安装uv pip install google-generativeai"
) from exc
try:
from openpyxl import load_workbook, Workbook # type: ignore
except ImportError as exc:
raise RuntimeError(
"缺少依赖 openpyxl请先安装uv pip install openpyxl"
) from exc
try:
import pandas as pd # type: ignore
except ImportError as exc:
raise RuntimeError(
"缺少依赖 pandas请先安装uv pip install pandas"
) from exc
@dataclass
class CellPosition:
"""单元格位置信息"""
sheet: str
row: int
col: int
col_letter: str = field(init=False)
def __post_init__(self):
self.col_letter = self._index_to_letter(self.col)
@staticmethod
def _index_to_letter(col: int) -> str:
result = ""
while col > 0:
col -= 1
result = chr(65 + (col % 26)) + result
col //= 26
return result
@property
def cell_ref(self) -> str:
return f"{self.col_letter}{self.row}"
@property
def full_ref(self) -> str:
return f"{self.sheet}!{self.cell_ref}"
@dataclass
class TranslationEntry:
"""翻译条目"""
position: CellPosition
original: str
translated: str = ""
def detect_chinese(text: str) -> bool:
if not text or not isinstance(text, str):
return False
return bool(re.search(r"[\u4e00-\u9fff]", text))
def get_api_key() -> str:
import os
api_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
if api_key:
return api_key
raise ValueError(
"未找到 Gemini API 密钥。请设置环境变量 GEMINI_API_KEY 或 GOOGLE_API_KEY。"
"获取 API Key: https://aistudio.google.com/app/apikey"
)
def get_model_name() -> str:
import os
return os.getenv("GEMINI_MODEL", "gemini-2.0-flash")
def extract_chinese_content(
input_path: Path,
columns: list[str] | None = None,
sheet_name: str | None = None,
) -> tuple[list[TranslationEntry], dict[str, list[str]]]:
wb = load_workbook(input_path)
entries: list[TranslationEntry] = []
sheet_headers: dict[str, list[str]] = {}
if sheet_name:
sheet_names = [sheet_name] if sheet_name in wb.sheetnames else []
if not sheet_names:
raise ValueError(f"找不到 Sheet: {sheet_name}")
else:
sheet_names = wb.sheetnames
print(f"📄 文件:{input_path}")
print(f"📊 Sheet 列表:{sheet_names}")
for sn in sheet_names:
ws = wb[sn]
print(f"\n处理 Sheet: {sn}")
headers = []
for col in range(1, ws.max_column + 1):
cell_value = ws.cell(row=1, column=col).value
headers.append(str(cell_value) if cell_value else f"{col}")
sheet_headers[sn] = headers
print(f"表头:{headers}")
if columns:
col_indices = []
for col_name in columns:
if col_name in headers:
col_indices.append(headers.index(col_name) + 1)
else:
print(f"⚠️ 警告:列 '{col_name}' 不存在")
if not col_indices:
col_indices = list(range(1, ws.max_column + 1))
else:
col_indices = list(range(1, ws.max_column + 1))
print(f"要翻译的列索引:{col_indices}")
count = 0
for row in range(2, ws.max_row + 1):
for col in col_indices:
cell = ws.cell(row=row, column=col)
value = cell.value
if value and isinstance(value, str) and detect_chinese(value):
pos = CellPosition(sheet=sn, row=row, col=col)
entry = TranslationEntry(position=pos, original=value)
entries.append(entry)
count += 1
print(f" ✓ 发现 {count} 个中文单元格")
wb.close()
return entries, sheet_headers
def translate_entries(
entries: list[TranslationEntry],
model_name: str | None = None,
api_key: str | None = None,
) -> list[TranslationEntry]:
if model_name is None:
model_name = get_model_name()
if not entries:
print("✓ 没有需要翻译的内容")
return entries
if not api_key:
api_key = get_api_key()
originals = [entry.original for entry in entries]
print(f"\n🌐 正在翻译 {len(originals)} 个中文内容...")
print(f" 模型:{model_name}")
genai.configure(api_key=api_key)
prompt = f"""你是一个专业的翻译助手。请将以下中文文本翻译成英文。
要求:
1. 保持专业术语准确
2. 人名使用拼音(如:张三 → Zhang San
3. 公司名、产品名保持原名或标准英文名
4. 邮箱、数字、URL 等非中文内容保持不变
5. 只返回翻译结果,不要额外解释
输入文本JSON 数组格式):
{json.dumps(originals, ensure_ascii=False)}
请以 JSON 数组格式返回翻译结果,保持相同顺序。"""
try:
model = genai.GenerativeModel(model_name)
response = model.generate_content(prompt)
result_text = response.text.strip()
if result_text.startswith("```json"):
result_text = result_text[7:]
if result_text.endswith("```"):
result_text = result_text[:-3]
result_text = result_text.strip()
translations = json.loads(result_text)
if len(translations) != len(originals):
print(f"⚠️ 警告:翻译返回 {len(translations)} 条,期望 {len(originals)}")
while len(translations) < len(originals):
translations.append(originals[len(translations)])
for i, entry in enumerate(entries):
entry.translated = translations[i] if i < len(translations) else entry.original
print(f"✅ 翻译完成!")
translated_count = sum(1 for e in entries if e.translated != e.original)
print(f" 成功翻译:{translated_count}/{len(entries)}")
except Exception as e:
print(f"❌ 翻译失败:{e}")
print(f" 保留原文")
for entry in entries:
entry.translated = entry.original
return entries
def apply_translations(
input_path: Path,
output_path: Path,
entries: list[TranslationEntry],
sheet_headers: dict[str, list[str]],
) -> Path:
print(f"\n💾 保存翻译结果到:{output_path}")
wb = load_workbook(input_path)
applied_count = 0
for entry in entries:
if entry.translated and entry.translated != entry.original:
ws = wb[entry.position.sheet]
ws.cell(row=entry.position.row, column=entry.position.col).value = entry.translated
applied_count += 1
wb.save(output_path)
wb.close()
print(f"✅ 完成!共更新 {applied_count} 个单元格")
print(f"\n📋 翻译预览(前 10 条):")
for i, entry in enumerate(entries[:10]):
if entry.translated != entry.original:
print(f" {entry.position.full_ref}: {entry.original}{entry.translated}")
if len(entries) > 10:
print(f" ... 还有 {len(entries) - 10}")
return output_path
def translate_excel_file(
input_path: Path,
output_path: Path | None = None,
columns: list[str] | None = None,
sheet_name: str | None = None,
model_name: str | None = None,
api_key: str | None = None,
dry_run: bool = False,
) -> Path:
if model_name is None:
model_name = get_model_name()
if not output_path:
output_path = input_path.parent / f"{input_path.stem}_en{input_path.suffix}"
print("=" * 60)
print("Excel 中文→英文翻译工具")
print("=" * 60)
print("\n【阶段 1/3】提取中文内容...")
entries, sheet_headers = extract_chinese_content(
input_path=input_path,
columns=columns,
sheet_name=sheet_name,
)
if not entries:
print("\n✓ 没有发现中文内容,无需翻译")
return input_path
print(f"\n共发现 {len(entries)} 个中文单元格")
if dry_run:
print(f"\n[预览模式] 将翻译以下内容:")
for i, entry in enumerate(entries[:20]):
print(f" {entry.position.full_ref}: {entry.original}")
if len(entries) > 20:
print(f" ... 还有 {len(entries) - 20}")
return input_path
print("\n【阶段 2/3】批量翻译...")
entries = translate_entries(
entries=entries,
model_name=model_name,
api_key=api_key,
)
print("\n【阶段 3/3】应用翻译结果...")
output_path = apply_translations(
input_path=input_path,
output_path=output_path,
entries=entries,
sheet_headers=sheet_headers,
)
print("\n" + "=" * 60)
print("✅ 翻译完成!")
print("=" * 60)
return output_path
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Excel 文件中文→英文翻译工具",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
示例:
# 翻译整个文件
python translate_excel.py --file data.xlsx
# 预览模式
python translate_excel.py --file data.xlsx --dry-run
# 指定列翻译
python translate_excel.py --file data.xlsx --columns "姓名,地址"
# 指定 Sheet 翻译
python translate_excel.py --file data.xlsx --sheet "Sheet1"
环境变量:
GEMINI_API_KEY: Gemini API 密钥
GOOGLE_API_KEY: Google API 密钥(备选)
获取 API Key: https://aistudio.google.com/app/apikey
""",
)
parser.add_argument("--file", "-f", type=Path, required=True, help="输入 Excel 文件路径")
parser.add_argument("--output", "-o", type=Path, help="输出文件路径")
parser.add_argument("--columns", "-c", type=str, help="要翻译的列名(逗号分隔)")
parser.add_argument("--sheet", "-s", type=str, help="要翻译的 Sheet 名称")
parser.add_argument("--model", "-m", type=str, default="gemini-2.0-flash", help="Gemini 模型名称")
parser.add_argument("--api-key", "-k", type=str, help="Gemini API 密钥")
parser.add_argument("--dry-run", action="store_true", help="预览模式")
return parser.parse_args()
def main() -> int:
args = parse_args()
if not args.file.exists():
print(f"❌ 文件不存在:{args.file}", file=sys.stderr)
return 1
try:
columns = None
if args.columns:
columns = [c.strip() for c in args.columns.split(",")]
translate_excel_file(
input_path=args.file,
output_path=args.output,
columns=columns,
sheet_name=args.sheet,
model_name=args.model,
api_key=args.api_key,
dry_run=args.dry_run,
)
return 0
except Exception as e:
print(f"❌ 错误:{e}", file=sys.stderr)
import traceback
traceback.print_exc()
return 1
if __name__ == "__main__":
sys.exit(main())