excel-toolkit/scripts/translate_excel.py

360 lines
11 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Excel 文件中文→英文翻译工具
使用 Google Gemini Flash Lite API 进行翻译
"""
from __future__ import annotations
import argparse
import json
import math
import re
import sys
from pathlib import Path
from typing import Any
# The ``google.genai`` module imported here is published on PyPI as
# ``google-genai``. ``google-generativeai`` is a different SDK (imported as
# ``google.generativeai``), so the install hint must name ``google-genai``.
try:
    from google import genai
    from google.genai import types  # type: ignore
except ImportError as exc:
    raise RuntimeError(
        "缺少依赖 google-genai请先安装uv pip install google-genai"
    ) from exc
try:
from openpyxl import load_workbook, Workbook # type: ignore
except ImportError as exc:
raise RuntimeError(
"缺少依赖 openpyxl请先安装uv pip install openpyxl"
) from exc
try:
import pandas as pd # type: ignore
except ImportError as exc:
raise RuntimeError(
"缺少依赖 pandas请先安装uv pip install pandas"
) from exc
def detect_chinese(text: str) -> bool:
    """Report whether ``text`` contains at least one Chinese character.

    Only code points in the range U+4E00..U+9FFF are recognised. Falsy
    values (``None``, ``""``) and values that are not ``str`` give ``False``.
    """
    if not text:
        return False
    if not isinstance(text, str):
        return False
    found = re.search(r"[\u4e00-\u9fff]", text)
    return found is not None
def format_cell_value(value: Any) -> Any:
    """Normalise a cell value while keeping its original type.

    ``None`` and float NaN are both mapped to ``None``; every other value
    is returned unchanged.
    """
    is_nan = isinstance(value, float) and math.isnan(value)
    return None if (value is None or is_nan) else value
def get_api_key() -> str:
    """Look up the Gemini API key in the process environment.

    ``GEMINI_API_KEY`` is consulted first and ``GOOGLE_API_KEY`` second; the
    first non-empty value found is returned.

    Raises:
        ValueError: If neither variable holds a non-empty value.
    """
    import os

    for env_name in ("GEMINI_API_KEY", "GOOGLE_API_KEY"):
        candidate = os.getenv(env_name)
        if candidate:
            return candidate
    raise ValueError(
        "未找到 Gemini API 密钥。请设置环境变量 GEMINI_API_KEY 或 GOOGLE_API_KEY。"
        "获取 API Key: https://aistudio.google.com/app/apikey"
    )
def translate_batch(
    texts: list[str],
    model_name: str = "gemini-2.0-flash-lite",
    api_key: str | None = None,
) -> dict[str, str]:
    """Translate Chinese strings to English with a single Gemini request.

    Blank entries, non-string entries and entries without Chinese characters
    are ignored. Duplicate texts are sent to the model only once.

    Args:
        texts: Candidate strings to translate.
        model_name: Name of the Gemini model to call.
        api_key: API key; when omitted it is read via ``get_api_key()``.

    Returns:
        A mapping from each original Chinese string to its translation.
        The mapping is empty when nothing needs translating. If the request
        or the parsing of the reply fails, every string maps to itself. A
        string also maps to itself when its translation is missing, blank,
        or not a string.

    Raises:
        ValueError: Propagated from ``get_api_key()`` when no key is available.
    """
    if not texts:
        return {}

    # dict.fromkeys de-duplicates while keeping first-seen order, so each
    # distinct string is sent (and billed) once and positions stay stable.
    pending = list(
        dict.fromkeys(
            t
            for t in texts
            if isinstance(t, str) and t.strip() and detect_chinese(t)
        )
    )
    if not pending:
        return {}

    if not api_key:
        api_key = get_api_key()
    client = genai.Client(api_key=api_key)

    payload = json.dumps(pending, ensure_ascii=False)
    prompt = f"""你是一个专业的翻译助手。请将以下中文文本翻译成英文。
要求:
1. 保持专业术语准确
2. 人名使用拼音(如:张三 → Zhang San
3. 公司名、产品名保持原名或标准英文名
4. 邮箱、数字等非中文内容保持不变
5. 只返回翻译结果,不要额外解释
输入文本JSON 数组格式):
{payload}
请以 JSON 数组格式返回翻译结果,保持相同顺序。"""

    try:
        response = client.models.generate_content(
            model=model_name,
            contents=prompt,
            config=types.GenerateContentConfig(
                temperature=0.3,
                top_p=0.8,
            ),
        )

        # The reply text can be missing; treat that as a failed translation.
        result_text = (response.text or "").strip()
        if not result_text:
            raise ValueError("模型返回内容为空")

        # Strip an optional Markdown code fence: "```", "```json", "```JSON", ...
        result_text = re.sub(r"^```[A-Za-z]*", "", result_text, count=1)
        if result_text.endswith("```"):
            result_text = result_text[:-3]
        result_text = result_text.strip()

        translations = json.loads(result_text)
        # Positional mapping below only makes sense for a JSON array.
        if not isinstance(translations, list):
            raise ValueError("模型返回的不是 JSON 数组")
    except Exception as e:
        print(f"⚠️ 翻译失败:{e}", file=sys.stderr)
        # On failure keep every original text unchanged.
        return {text: text for text in pending}

    if len(translations) != len(pending):
        # The pairing is positional, so a count mismatch may misalign results.
        print(
            f"⚠️ 翻译结果数量不匹配:期望 {len(pending)},实际 {len(translations)}",
            file=sys.stderr,
        )

    result: dict[str, str] = {}
    for idx, original in enumerate(pending):
        candidate = translations[idx] if idx < len(translations) else None
        # Accept only non-blank strings; anything else keeps the original text.
        if isinstance(candidate, str) and candidate.strip():
            result[original] = candidate
        else:
            result[original] = original
    return result
def translate_excel_file(
    input_path: Path,
    output_path: Path | None = None,
    columns: list[str] | None = None,
    sheet_name: str | None = None,
    model_name: str = "gemini-2.0-flash-lite",
    api_key: str | None = None,
    dry_run: bool = False,
) -> Path:
    """Translate the Chinese text cells of an Excel workbook into English.

    Row 1 of every processed sheet is treated as the header row and is never
    translated. String cells from row 2 onward that contain Chinese characters
    are collected per sheet and sent to ``translate_batch`` in one call.

    Args:
        input_path: Workbook to read.
        output_path: Where to save the result; defaults to
            ``<stem>_en<suffix>`` next to the input file.
        columns: Header names of the columns to translate. Names missing from
            a sheet are reported; if none match, all columns of that sheet
            are processed.
        sheet_name: Restrict processing to this sheet; all sheets otherwise.
        model_name: Gemini model name passed to ``translate_batch``.
        api_key: API key passed to ``translate_batch``.
        dry_run: Only list what would be translated; nothing is saved.

    Returns:
        The output path, or ``input_path`` in dry-run mode.

    Raises:
        ValueError: If ``sheet_name`` is given but not present in the workbook.
    """
    if not output_path:
        output_path = input_path.parent / f"{input_path.stem}_en{input_path.suffix}"

    wb = load_workbook(input_path)

    # Decide which sheets to process.
    if sheet_name:
        if sheet_name not in wb.sheetnames:
            raise ValueError(f"找不到 Sheet: {sheet_name}")
        sheet_names = [sheet_name]
    else:
        sheet_names = wb.sheetnames

    print(f"📄 文件:{input_path}")
    print(f"📊 Sheet 列表:{sheet_names}")

    total_cells = 0  # Chinese cells discovered across all sheets
    translated_cells = 0  # cells whose content was actually replaced

    for sn in sheet_names:
        ws = wb[sn]
        print(f"\n处理 Sheet: {sn}")

        all_columns = list(range(1, ws.max_column + 1))

        # Header names; empty headers fall back to the 1-based column number.
        # Compare against None/"" explicitly so a header such as 0 is kept.
        headers = []
        for col in all_columns:
            cell_value = ws.cell(row=1, column=col).value
            if cell_value is None or cell_value == "":
                headers.append(f"{col}")
            else:
                headers.append(str(cell_value))
        print(f"表头:{headers}")

        # Resolve the requested column names to 1-based column indices.
        if columns:
            col_indices = []
            for col_name in columns:
                if col_name in headers:
                    col_indices.append(headers.index(col_name) + 1)
                else:
                    print(f"⚠️ 警告:列 '{col_name}' 不存在")
            if not col_indices:
                # None of the requested columns exist here: process all columns.
                col_indices = all_columns
        else:
            col_indices = all_columns
        print(f"要翻译的列索引:{col_indices}")

        # Collect every Chinese string cell below the header row.
        texts_to_translate = []
        cell_positions = []  # (row, col), parallel to texts_to_translate
        for row in range(2, ws.max_row + 1):
            for col in col_indices:
                value = ws.cell(row=row, column=col).value
                if value and isinstance(value, str) and detect_chinese(value):
                    texts_to_translate.append(value)
                    cell_positions.append((row, col))
                    total_cells += 1

        if not texts_to_translate:
            print(" ✓ 没有需要翻译的中文内容")
            continue
        print(f" 发现 {len(texts_to_translate)} 个中文单元格")

        if dry_run:
            print(f" [预览模式] 将翻译以下内容:")
            # Show at most the first 10 entries.
            for position, text in list(zip(cell_positions, texts_to_translate))[:10]:
                print(f" {position}: {text}")
            if len(texts_to_translate) > 10:
                print(f" ... 还有 {len(texts_to_translate) - 10}")
            continue

        print(f" 正在翻译...")
        translations = translate_batch(texts_to_translate, model_name, api_key)

        # Write results back. A cell counts as translated only when its text
        # really changed; translate_batch maps a text to itself on failure.
        sheet_translated = 0
        for (row, col), original in zip(cell_positions, texts_to_translate):
            translated = translations.get(original, original)
            # Never overwrite a cell with blank or non-string model output.
            if not isinstance(translated, str) or not translated.strip():
                translated = original
            if translated != original:
                ws.cell(row=row, column=col).value = translated
                sheet_translated += 1
        translated_cells += sheet_translated
        print(f" ✓ 完成翻译 {sheet_translated} 个单元格")

    if dry_run:
        print(f"\n[预览模式] 共发现 {total_cells} 个中文单元格需要翻译")
        return input_path

    wb.save(output_path)
    print(f"\n✅ 翻译完成!输出文件:{output_path}")
    print(f"📊 统计:共处理 {total_cells} 个单元格,翻译 {translated_cells} 个中文内容")
    return output_path
def parse_args() -> argparse.Namespace:
    """Build the command-line parser and parse the process arguments.

    Returns:
        Namespace with ``file``, ``output``, ``columns``, ``sheet``,
        ``model``, ``api_key`` and ``dry_run`` attributes.
    """
    usage_examples = """
示例:
# 翻译整个文件
python translate_excel.py --file data.xlsx
# 预览模式
python translate_excel.py --file data.xlsx --dry-run
# 指定列翻译
python translate_excel.py --file data.xlsx --columns "姓名,地址"
# 指定 Sheet 翻译
python translate_excel.py --file data.xlsx --sheet "Sheet1"
# 指定输出文件
python translate_excel.py --file data.xlsx --output translated.xlsx
# 指定 API Key
python translate_excel.py --file data.xlsx --api-key YOUR_API_KEY
环境变量:
GEMINI_API_KEY: Gemini API 密钥
GOOGLE_API_KEY: Google API 密钥(备选)
获取 API Key: https://aistudio.google.com/app/apikey
"""
    cli = argparse.ArgumentParser(
        description="Excel 文件中文→英文翻译工具",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # (flags, keyword arguments) in the order they appear in --help.
    option_table = [
        (("--file", "-f"), {"type": Path, "required": True, "help": "输入 Excel 文件路径"}),
        (("--output", "-o"), {"type": Path, "help": "输出文件路径(默认:{原文件名}_en.xlsx"}),
        (("--columns", "-c"), {"type": str, "help": "要翻译的列名(逗号分隔)"}),
        (("--sheet", "-s"), {"type": str, "help": "要翻译的 Sheet 名称"}),
        (
            ("--model", "-m"),
            {"type": str, "default": "gemini-2.0-flash-lite", "help": "Gemini 模型名称"},
        ),
        (("--api-key", "-k"), {"type": str, "help": "Gemini API 密钥"}),
        (("--dry-run",), {"action": "store_true", "help": "预览模式,不实际生成文件"}),
    ]
    for flags, options in option_table:
        cli.add_argument(*flags, **options)

    return cli.parse_args()
def main() -> int:
    """Run the command-line tool.

    Parses the arguments, checks that the input file exists and runs
    ``translate_excel_file``. Any exception raised during translation is
    reported on stderr instead of propagating.

    Returns:
        ``0`` on success, ``1`` if the input file is missing or an error
        occurred.
    """
    args = parse_args()

    if not args.file.exists():
        print(f"❌ 文件不存在:{args.file}", file=sys.stderr)
        return 1

    try:
        columns = None
        if args.columns:
            # Drop empty names produced by stray commas ("a,,b", "a,") so they
            # do not trigger a "column '' does not exist" warning per sheet.
            stripped = (part.strip() for part in args.columns.split(","))
            columns = [name for name in stripped if name] or None

        translate_excel_file(
            input_path=args.file,
            output_path=args.output,
            columns=columns,
            sheet_name=args.sheet,
            model_name=args.model,
            api_key=args.api_key,
            dry_run=args.dry_run,
        )
        return 0
    except Exception as e:
        # Top-level boundary: turn any failure into a message and exit code.
        print(f"❌ 错误:{e}", file=sys.stderr)
        return 1
# Script entry point: the integer returned by main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())