excel-toolkit/scripts/translate_excel.py

403 lines
12 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Excel 文件中文英文翻译工具
使用 Google Gemini Flash Lite API 进行翻译
流程
1. 提取收集所有中文内容及其位置映射
2. 翻译批量翻译所有中文内容
3. 应用将翻译结果写入新 Excel 文件
"""
from __future__ import annotations
import argparse
import json
import re
import sys
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
import sys
sys.path.insert(0, str(Path(__file__).parent))
from load_env import load_env
load_env() # 加载 .env 文件
try:
import google.generativeai as genai # type: ignore
except ImportError as exc:
raise RuntimeError(
"缺少依赖 google-generativeai请先安装uv pip install google-generativeai"
) from exc
try:
from openpyxl import load_workbook, Workbook # type: ignore
except ImportError as exc:
raise RuntimeError(
"缺少依赖 openpyxl请先安装uv pip install openpyxl"
) from exc
try:
import pandas as pd # type: ignore
except ImportError as exc:
raise RuntimeError(
"缺少依赖 pandas请先安装uv pip install pandas"
) from exc
@dataclass
class CellPosition:
"""单元格位置信息"""
sheet: str
row: int
col: int
col_letter: str = field(init=False)
def __post_init__(self):
self.col_letter = self._index_to_letter(self.col)
@staticmethod
def _index_to_letter(col: int) -> str:
result = ""
while col > 0:
col -= 1
result = chr(65 + (col % 26)) + result
col //= 26
return result
@property
def cell_ref(self) -> str:
return f"{self.col_letter}{self.row}"
@property
def full_ref(self) -> str:
return f"{self.sheet}!{self.cell_ref}"
@dataclass
class TranslationEntry:
"""翻译条目"""
position: CellPosition
original: str
translated: str = ""
def detect_chinese(text: str) -> bool:
if not text or not isinstance(text, str):
return False
return bool(re.search(r"[\u4e00-\u9fff]", text))
def get_api_key() -> str:
import os
api_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
if api_key:
return api_key
raise ValueError(
"未找到 Gemini API 密钥。请设置环境变量 GEMINI_API_KEY 或 GOOGLE_API_KEY。"
"获取 API Key: https://aistudio.google.com/app/apikey"
)
def get_model_name() -> str:
import os
return os.getenv("GEMINI_MODEL", "gemini-2.0-flash-lite")
def extract_chinese_content(
input_path: Path,
columns: list[str] | None = None,
sheet_name: str | None = None,
) -> tuple[list[TranslationEntry], dict[str, list[str]]]:
wb = load_workbook(input_path)
entries: list[TranslationEntry] = []
sheet_headers: dict[str, list[str]] = {}
if sheet_name:
sheet_names = [sheet_name] if sheet_name in wb.sheetnames else []
if not sheet_names:
raise ValueError(f"找不到 Sheet: {sheet_name}")
else:
sheet_names = wb.sheetnames
print(f"📄 文件:{input_path}")
print(f"📊 Sheet 列表:{sheet_names}")
for sn in sheet_names:
ws = wb[sn]
print(f"\n处理 Sheet: {sn}")
headers = []
for col in range(1, ws.max_column + 1):
cell_value = ws.cell(row=1, column=col).value
headers.append(str(cell_value) if cell_value else f"{col}")
sheet_headers[sn] = headers
print(f"表头:{headers}")
if columns:
col_indices = []
for col_name in columns:
if col_name in headers:
col_indices.append(headers.index(col_name) + 1)
else:
print(f"⚠️ 警告:列 '{col_name}' 不存在")
if not col_indices:
col_indices = list(range(1, ws.max_column + 1))
else:
col_indices = list(range(1, ws.max_column + 1))
print(f"要翻译的列索引:{col_indices}")
count = 0
for row in range(2, ws.max_row + 1):
for col in col_indices:
cell = ws.cell(row=row, column=col)
value = cell.value
if value and isinstance(value, str) and detect_chinese(value):
pos = CellPosition(sheet=sn, row=row, col=col)
entry = TranslationEntry(position=pos, original=value)
entries.append(entry)
count += 1
print(f" ✓ 发现 {count} 个中文单元格")
wb.close()
return entries, sheet_headers
def translate_entries(
entries: list[TranslationEntry],
model_name: str | None = None,
api_key: str | None = None,
) -> list[TranslationEntry]:
if model_name is None:
model_name = get_model_name()
if not entries:
print("✓ 没有需要翻译的内容")
return entries
if not api_key:
api_key = get_api_key()
originals = [entry.original for entry in entries]
print(f"\n🌐 正在翻译 {len(originals)} 个中文内容...")
print(f" 模型:{model_name}")
genai.configure(api_key=api_key)
prompt = f"""你是一个专业的翻译助手。请将以下中文文本翻译成英文。
要求
1. 保持专业术语准确
2. 人名使用拼音张三 Zhang San
3. 公司名产品名保持原名或标准英文名
4. 邮箱数字URL 等非中文内容保持不变
5. 只返回翻译结果不要额外解释
输入文本JSON 数组格式
{json.dumps(originals, ensure_ascii=False)}
请以 JSON 数组格式返回翻译结果保持相同顺序"""
try:
model = genai.GenerativeModel(model_name)
response = model.generate_content(prompt)
result_text = response.text.strip()
if result_text.startswith("```json"):
result_text = result_text[7:]
if result_text.endswith("```"):
result_text = result_text[:-3]
result_text = result_text.strip()
translations = json.loads(result_text)
if len(translations) != len(originals):
print(f"⚠️ 警告:翻译返回 {len(translations)} 条,期望 {len(originals)}")
while len(translations) < len(originals):
translations.append(originals[len(translations)])
for i, entry in enumerate(entries):
entry.translated = translations[i] if i < len(translations) else entry.original
print(f"✅ 翻译完成!")
translated_count = sum(1 for e in entries if e.translated != e.original)
print(f" 成功翻译:{translated_count}/{len(entries)}")
except Exception as e:
print(f"❌ 翻译失败:{e}")
print(f" 保留原文")
for entry in entries:
entry.translated = entry.original
return entries
def apply_translations(
input_path: Path,
output_path: Path,
entries: list[TranslationEntry],
sheet_headers: dict[str, list[str]],
) -> Path:
print(f"\n💾 保存翻译结果到:{output_path}")
wb = load_workbook(input_path)
applied_count = 0
for entry in entries:
if entry.translated and entry.translated != entry.original:
ws = wb[entry.position.sheet]
ws.cell(row=entry.position.row, column=entry.position.col).value = entry.translated
applied_count += 1
wb.save(output_path)
wb.close()
print(f"✅ 完成!共更新 {applied_count} 个单元格")
print(f"\n📋 翻译预览(前 10 条):")
for i, entry in enumerate(entries[:10]):
if entry.translated != entry.original:
print(f" {entry.position.full_ref}: {entry.original}{entry.translated}")
if len(entries) > 10:
print(f" ... 还有 {len(entries) - 10}")
return output_path
def translate_excel_file(
input_path: Path,
output_path: Path | None = None,
columns: list[str] | None = None,
sheet_name: str | None = None,
model_name: str | None = None,
api_key: str | None = None,
dry_run: bool = False,
) -> Path:
if model_name is None:
model_name = get_model_name()
if not output_path:
output_path = input_path.parent / f"{input_path.stem}_en{input_path.suffix}"
print("=" * 60)
print("Excel 中文→英文翻译工具")
print("=" * 60)
print("\n【阶段 1/3】提取中文内容...")
entries, sheet_headers = extract_chinese_content(
input_path=input_path,
columns=columns,
sheet_name=sheet_name,
)
if not entries:
print("\n✓ 没有发现中文内容,无需翻译")
return input_path
print(f"\n共发现 {len(entries)} 个中文单元格")
if dry_run:
print(f"\n[预览模式] 将翻译以下内容:")
for i, entry in enumerate(entries[:20]):
print(f" {entry.position.full_ref}: {entry.original}")
if len(entries) > 20:
print(f" ... 还有 {len(entries) - 20}")
return input_path
print("\n【阶段 2/3】批量翻译...")
entries = translate_entries(
entries=entries,
model_name=model_name,
api_key=api_key,
)
print("\n【阶段 3/3】应用翻译结果...")
output_path = apply_translations(
input_path=input_path,
output_path=output_path,
entries=entries,
sheet_headers=sheet_headers,
)
print("\n" + "=" * 60)
print("✅ 翻译完成!")
print("=" * 60)
return output_path
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Excel 文件中文→英文翻译工具",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
示例:
# 翻译整个文件
python translate_excel.py --file data.xlsx
# 预览模式
python translate_excel.py --file data.xlsx --dry-run
# 指定列翻译
python translate_excel.py --file data.xlsx --columns "姓名,地址"
# 指定 Sheet 翻译
python translate_excel.py --file data.xlsx --sheet "Sheet1"
环境变量:
GEMINI_API_KEY: Gemini API 密钥
GOOGLE_API_KEY: Google API 密钥备选
获取 API Key: https://aistudio.google.com/app/apikey
""",
)
parser.add_argument("--file", "-f", type=Path, required=True, help="输入 Excel 文件路径")
parser.add_argument("--output", "-o", type=Path, help="输出文件路径")
parser.add_argument("--columns", "-c", type=str, help="要翻译的列名(逗号分隔)")
parser.add_argument("--sheet", "-s", type=str, help="要翻译的 Sheet 名称")
parser.add_argument("--model", "-m", type=str, default="gemini-2.0-flash-lite", help="Gemini 模型名称")
parser.add_argument("--api-key", "-k", type=str, help="Gemini API 密钥")
parser.add_argument("--dry-run", action="store_true", help="预览模式")
return parser.parse_args()
def main() -> int:
args = parse_args()
if not args.file.exists():
print(f"❌ 文件不存在:{args.file}", file=sys.stderr)
return 1
try:
columns = None
if args.columns:
columns = [c.strip() for c in args.columns.split(",")]
translate_excel_file(
input_path=args.file,
output_path=args.output,
columns=columns,
sheet_name=args.sheet,
model_name=args.model,
api_key=args.api_key,
dry_run=args.dry_run,
)
return 0
except Exception as e:
print(f"❌ 错误:{e}", file=sys.stderr)
import traceback
traceback.print_exc()
return 1
if __name__ == "__main__":
sys.exit(main())