excel-toolkit/scripts/translate_excel.py

529 lines
17 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from __future__ import annotations
import argparse
import json
import math
import re
import sys
from pathlib import Path
from typing import Any
try:
import google.generativeai as genai # type: ignore
except ImportError as exc:
raise RuntimeError(
"缺少依赖 google-generativeai请先安装pip install google-generativeai"
) from exc
try:
from openpyxl import load_workbook, Workbook # type: ignore
except ImportError as exc:
raise RuntimeError(
"缺少依赖 openpyxl请先安装pip install openpyxl"
) from exc
try:
import pandas as pd # type: ignore
except ImportError as exc:
raise RuntimeError(
"缺少依赖 pandas请先安装pip install pandas"
) from exc
def detect_chinese(text: str) -> bool:
    """Report whether ``text`` contains at least one Chinese character.

    Only code points in the range U+4E00..U+9FFF are recognised. Falsy
    values and non-string values yield ``False``.
    """
    is_usable = bool(text) and isinstance(text, str)
    if is_usable:
        return re.search(r"[\u4e00-\u9fff]", text) is not None
    return False
def format_cell_value(value: Any) -> Any:
    """Normalise a cell value while keeping its original type.

    ``None`` and float NaN are both mapped to ``None``; every other value is
    returned unchanged.
    """
    is_nan = isinstance(value, float) and math.isnan(value)
    return None if (value is None or is_nan) else value
def get_api_key() -> str:
    """Resolve the Gemini API key from the environment and configure the SDK.

    ``GEMINI_API_KEY`` is consulted first, then ``GOOGLE_API_KEY``. When a
    key is found it is passed to ``genai.configure(api_key=...)`` so that a
    caller which ignores the return value still ends up with a configured
    client.

    Returns:
        The API key that was found.

    Raises:
        ValueError: If neither environment variable holds a non-empty value.
    """
    import os

    api_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError(
            "未找到 Gemini API 密钥。请设置环境变量 GEMINI_API_KEY 或 GOOGLE_API_KEY"
            "或使用 --api-key 参数提供。"
        )
    # The earlier version read the key back from genai.configure()'s return
    # value, which is not a key, and never passed the environment key on.
    genai.configure(api_key=api_key)
    return api_key
def _parse_numbered_translations(
    result_text: str, source_texts: list[str]
) -> dict[str, str]:
    """Map each source text to its translation from a "N. translation" reply."""
    result_map: dict[str, str] = {}
    for raw_line in result_text.splitlines():
        match = re.match(r"^(\d+)\.\s*(.+)$", raw_line.strip())
        if not match:
            # Preambles, blank lines and other unnumbered output carry no
            # reliable position, so they are not attributed to any text.
            continue
        index = int(match.group(1)) - 1  # reply numbering is 1-based
        # The lower bound matters: a "0." line would otherwise index from
        # the end of the list and overwrite the last text's translation.
        if 0 <= index < len(source_texts):
            result_map[source_texts[index]] = match.group(2)

    # A single-item request may be answered without a number; in that case
    # the whole reply is taken as the translation.
    whole_reply = result_text.strip()
    if not result_map and len(source_texts) == 1 and whole_reply:
        result_map[source_texts[0]] = whole_reply
    return result_map


def translate_batch(
    texts: list[str],
    model_name: str = "gemini-2.0-flash-lite",
    api_key: str | None = None,
) -> dict[str, str]:
    """Translate a list of texts in one model request.

    Empty and whitespace-only entries are dropped. The remaining texts are
    numbered, embedded in a single prompt, and the numbered reply lines are
    matched back to the texts by their number.

    Args:
        texts: Texts to translate.
        model_name: Name of the generative model to use.
        api_key: API key; when omitted, ``get_api_key()`` supplies one.

    Returns:
        A mapping from original text to translated text. Texts for which no
        reply line could be matched are absent from the mapping.

    Raises:
        ValueError: If no API key is given and ``get_api_key()`` finds none.
        RuntimeError: If the model cannot be created or the request fails.
    """
    if not texts:
        return {}
    source_texts = [text for text in texts if text and text.strip()]
    if not source_texts:
        return {}

    # Configure explicitly with whichever key is available so the result of
    # get_api_key() is actually used.
    genai.configure(api_key=api_key or get_api_key())

    try:
        model = genai.GenerativeModel(model_name)
    except Exception as exc:
        raise RuntimeError(f"无法加载模型 {model_name}: {exc}") from exc

    # All texts go into one request; each line is prefixed with its 1-based
    # number so the reply can be matched back.
    text_list = "\n".join(
        f"{number}. {text}" for number, text in enumerate(source_texts, start=1)
    )
    prompt = f"""请将以下中文内容翻译成英文。保持专业、准确的语言风格。
注意:
1. 只翻译中文部分,保持原有的专有名词(如人名、地名、品牌名)不变
2. 保留数字、日期、时间等格式不变
3. 技术术语使用标准英文翻译
4. 每条翻译结果单独一行,格式为:序号. 译文
待翻译内容:
{text_list}
请按顺序输出翻译结果:"""

    try:
        response = model.generate_content(prompt)
        result_text = response.text
    except Exception as exc:
        raise RuntimeError(f"翻译请求失败: {exc}") from exc

    return _parse_numbered_translations(result_text, source_texts)
def translate_excel_file(
    input_path: Path,
    output_path: Path,
    columns: list[str] | None = None,
    sheet_name: str | None = None,
    model_name: str = "gemini-2.0-flash-lite",
    api_key: str | None = None,
    dry_run: bool = False,
) -> dict[str, Any]:
    """Translate Chinese text cells of an Excel workbook.

    Row 1 of every processed sheet is treated as the header row. For each
    selected column, cells from row 2 onward that hold text containing
    Chinese characters are sent to ``translate_batch`` (one call per sheet)
    and overwritten with the returned translation.

    Args:
        input_path: Workbook to read.
        output_path: Where the translated workbook is saved.
        columns: Header names of the columns to translate; ``None`` selects
            every column.
        sheet_name: Restrict processing to this sheet; ``None`` processes
            all sheets.
        model_name: Model name forwarded to ``translate_batch``.
        api_key: API key forwarded to ``translate_batch``.
        dry_run: When true, the workbook is not saved.

    Returns:
        A statistics dict with overall counters (``total_cells``,
        ``chinese_cells``, ``translated_cells``, ``skipped_cells``) and a
        ``sheets`` mapping with per-sheet counters and selected columns.

    Raises:
        ValueError: If ``sheet_name`` is given but not present.
    """
    wb = load_workbook(input_path)

    if sheet_name:
        if sheet_name not in wb.sheetnames:
            raise ValueError(f"工作表不存在: {sheet_name}")
        sheets_to_translate = [sheet_name]
    else:
        sheets_to_translate = wb.sheetnames

    stats: dict[str, Any] = {
        "sheets": {},
        "total_cells": 0,
        "translated_cells": 0,
        "chinese_cells": 0,
        "skipped_cells": 0,
    }
    header_row = 1  # the first row is assumed to hold the column headers

    # A distinct loop name keeps the ``sheet_name`` parameter intact.
    for current_sheet in sheets_to_translate:
        ws = wb[current_sheet]
        sheet_stats: dict[str, Any] = {
            "total": 0,
            "chinese": 0,
            "translated": 0,
            "skipped": 0,
            "columns": [],
        }

        # Select the columns whose header matches the request.
        target_columns: list[int] = []
        for col in range(1, ws.max_column + 1):
            cell_value = ws.cell(row=header_row, column=col).value
            header = str(cell_value).strip() if cell_value else f"Column{col}"
            if columns is None or header in columns:
                target_columns.append(col)
                sheet_stats["columns"].append(header)

        if not target_columns:
            stats["sheets"][current_sheet] = sheet_stats
            continue

        # Collect the texts to translate together with their coordinates.
        texts_to_translate: list[str] = []
        cell_positions: list[tuple[int, int]] = []  # (row, col)
        for row in range(header_row + 1, ws.max_row + 1):
            for col in target_columns:
                cell = ws.cell(row=row, column=col)
                value = cell.value
                # Empty cells and numeric/boolean values are never translated.
                if value is None or isinstance(value, (int, float, bool)):
                    sheet_stats["skipped"] += 1
                    continue
                # Formula cells are left alone: replacing one with translated
                # text would destroy the formula.
                if cell.data_type == "f":
                    sheet_stats["skipped"] += 1
                    continue
                text = str(value).strip()
                if not text:
                    sheet_stats["skipped"] += 1
                    continue
                sheet_stats["total"] += 1
                if detect_chinese(text):
                    sheet_stats["chinese"] += 1
                    texts_to_translate.append(text)
                    cell_positions.append((row, col))
                else:
                    sheet_stats["skipped"] += 1

        # NOTE(review): the translation request is made even when dry_run is
        # set; dry_run only suppresses the save below.
        if texts_to_translate:
            print(f"翻译工作表 '{current_sheet}' 中的 {len(texts_to_translate)} 个单元格...")
            translation_map = translate_batch(texts_to_translate, model_name, api_key)
            for (row, col), original_text in zip(cell_positions, texts_to_translate):
                translated = translation_map.get(original_text)
                if translated:
                    ws.cell(row=row, column=col, value=translated)
                    sheet_stats["translated"] += 1
                else:
                    sheet_stats["skipped"] += 1

        stats["sheets"][current_sheet] = sheet_stats
        stats["total_cells"] += sheet_stats["total"]
        stats["chinese_cells"] += sheet_stats["chinese"]
        stats["translated_cells"] += sheet_stats["translated"]
        stats["skipped_cells"] += sheet_stats["skipped"]

    if not dry_run:
        # Create the output directory on demand.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        wb.save(output_path)
        print(f"已保存翻译结果到: {output_path}")
    else:
        print("预览模式:未生成文件")
    return stats
def _is_numeric_text(text: str) -> bool:
    """Return True when ``text`` can be parsed by ``float()``."""
    try:
        float(text)
    except ValueError:
        return False
    return True


def translate_csv_file(
    input_path: Path,
    output_path: Path,
    columns: list[str] | None = None,
    model_name: str = "gemini-2.0-flash-lite",
    api_key: str | None = None,
    dry_run: bool = False,
) -> dict[str, Any]:
    """Translate Chinese text cells of a CSV file.

    The file is read with each of the encodings ``utf-8-sig``, ``utf-8`` and
    ``gb18030`` until one succeeds. Every cell is loaded as text so that
    cells which are not translated are written back as they were read,
    rather than after numeric or missing-value inference. Cells in the
    selected columns that contain Chinese characters are translated with a
    single ``translate_batch`` call. The result is written as ``utf-8-sig``.

    Args:
        input_path: CSV file to read.
        output_path: Where the translated CSV is written.
        columns: Column names to translate; ``None`` or empty selects all
            columns. Unknown names are reported and ignored.
        model_name: Model name forwarded to ``translate_batch``.
        api_key: API key forwarded to ``translate_batch``.
        dry_run: When true, no output file is written.

    Returns:
        A statistics dict with overall counters and a ``sheets`` mapping
        holding a single ``"main"`` entry with the same counters and the
        selected column names.

    Raises:
        ValueError: If the file cannot be read with any of the encodings, or
            if none of the requested columns exists.
    """
    last_error: Exception | None = None
    df = None
    for enc in ("utf-8-sig", "utf-8", "gb18030"):
        try:
            # dtype=str + keep_default_na=False: keep every cell as the text
            # found in the file (no int/float/NaN conversion).
            df = pd.read_csv(input_path, encoding=enc, dtype=str, keep_default_na=False)
            break
        except Exception as exc:  # any failure moves on to the next encoding
            last_error = exc
    if df is None:
        raise ValueError(f"无法读取 CSV 文件: {last_error}") from last_error

    # Resolve the columns to translate.
    target_columns: list[str] = []
    if columns:
        for col in columns:
            if col in df.columns:
                target_columns.append(col)
            else:
                print(f"警告: 列 '{col}' 不存在,已跳过")
    else:
        target_columns = df.columns.tolist()
    if not target_columns:
        raise ValueError("没有可翻译的列")

    main_stats: dict[str, Any] = {
        "total": 0,
        "chinese": 0,
        "translated": 0,
        "skipped": 0,
        "columns": target_columns,
    }

    # Collect the texts to translate with their (row position, column name).
    texts_to_translate: list[str] = []
    cell_positions: list[tuple[int, str]] = []
    for col in target_columns:
        for row_pos, value in enumerate(df[col]):
            if pd.isna(value) or value == "":
                main_stats["skipped"] += 1
                continue
            text = str(value).strip()
            # Numbers arrive as text now, so they are recognised by parsing.
            if not text or _is_numeric_text(text):
                main_stats["skipped"] += 1
                continue
            main_stats["total"] += 1
            if detect_chinese(text):
                main_stats["chinese"] += 1
                texts_to_translate.append(text)
                cell_positions.append((row_pos, col))
            else:
                main_stats["skipped"] += 1

    # NOTE(review): the translation request is made even when dry_run is
    # set; dry_run only suppresses the write below.
    if texts_to_translate:
        print(f"翻译 {len(texts_to_translate)} 个单元格...")
        translation_map = translate_batch(texts_to_translate, model_name, api_key)
        column_positions = {col: df.columns.get_loc(col) for col in target_columns}
        for (row_pos, col), original_text in zip(cell_positions, texts_to_translate):
            translated = translation_map.get(original_text)
            if translated:
                # Rows were enumerated by position, so write by position too.
                df.iat[row_pos, column_positions[col]] = translated
                main_stats["translated"] += 1
            else:
                main_stats["skipped"] += 1

    if not dry_run:
        output_path.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(output_path, index=False, encoding="utf-8-sig")
        print(f"已保存翻译结果到: {output_path}")
    else:
        print("预览模式:未生成文件")

    # The overall counters mirror the single "main" entry.
    return {
        "sheets": {"main": main_stats},
        "total_cells": main_stats["total"],
        "translated_cells": main_stats["translated"],
        "chinese_cells": main_stats["chinese"],
        "skipped_cells": main_stats["skipped"],
    }
def print_stats(stats: dict[str, Any], input_path: Path) -> None:
    """Print the overall and per-sheet translation counters to stdout.

    Args:
        stats: Statistics dict with the overall counters and a ``sheets``
            mapping of per-sheet counters.
        input_path: Source file; only its file name is shown in the title.
    """
    print(f"\n翻译统计 - {input_path.name}")
    print("=" * 60)

    overall_rows = (
        ("总单元格数", "total_cells"),
        ("包含中文", "chinese_cells"),
        ("已翻译", "translated_cells"),
        ("跳过", "skipped_cells"),
    )
    for label, key in overall_rows:
        print(f"{label}: {stats[key]}")

    for name, counters in stats["sheets"].items():
        print(f"\n工作表: {name}")
        # An empty column list is shown as "全部".
        column_names = counters["columns"]
        shown_columns = ", ".join(column_names) if column_names else "全部"
        print(f" 翻译列: {shown_columns}")
        total = counters["total"]
        chinese = counters["chinese"]
        translated = counters["translated"]
        print(f" 总数: {total}, 中文: {chinese}, 已翻译: {translated}")
def parse_args() -> argparse.Namespace:
    """Build the command-line parser and parse ``sys.argv``.

    Returns:
        Namespace with ``file``, ``output``, ``columns``, ``sheet``,
        ``model``, ``api_key`` and ``dry_run``.
    """
    # One (flag, add_argument keyword arguments) entry per option.
    option_table: list[tuple[str, dict[str, Any]]] = [
        ("--file", {"required": True, "help": "输入文件路径"}),
        ("--output", {"help": "输出文件路径(默认:{原文件名}_en.{扩展名}"}),
        (
            "--columns",
            {"help": "指定要翻译的列名,多个列用逗号分隔,例如:'姓名,地址,备注'"},
        ),
        ("--sheet", {"help": "指定工作表名称(仅 Excel 文件)"}),
        (
            "--model",
            {
                "default": "gemini-2.0-flash-lite",
                "help": "使用的 Gemini 模型默认gemini-2.0-flash-lite",
            },
        ),
        ("--api-key", {"help": "Gemini API 密钥(也可通过环境变量 GEMINI_API_KEY 设置)"}),
        (
            "--dry-run",
            {
                "action": "store_true",
                "help": "预览模式:统计需要翻译的内容但不生成文件",
            },
        ),
    ]

    cli = argparse.ArgumentParser(
        description="翻译 Excel (.xlsx) 或 CSV 文件中的中文内容为英文"
    )
    for flag, options in option_table:
        cli.add_argument(flag, **options)
    return cli.parse_args()
def main() -> int:
    """Run the command-line workflow and return the process exit code.

    Validates the input path, derives the output path and column list,
    dispatches to the Excel or CSV translator by file suffix, and prints the
    statistics.

    Returns:
        ``0`` on success, ``130`` when interrupted from the keyboard, ``1``
        on any other error (the message is written to stderr).
    """
    args = parse_args()
    input_path = Path(args.file).expanduser()
    try:
        # Validate the input file.
        if not input_path.exists():
            raise FileNotFoundError(f"文件不存在: {input_path}")
        if not input_path.is_file():
            raise ValueError(f"路径不是文件: {input_path}")

        # Explicit output path, or "<stem>_en<suffix>" beside the input.
        if args.output:
            output_path = Path(args.output).expanduser()
        else:
            output_path = input_path.with_name(f"{input_path.stem}_en{input_path.suffix}")

        # Split the comma-separated column list, dropping blank entries.
        columns: list[str] | None = None
        if args.columns:
            columns = [name for name in map(str.strip, args.columns.split(",")) if name]

        print(f"输入文件: {input_path}")
        print(f"输出文件: {output_path}")
        if columns:
            print(f"翻译列: {', '.join(columns)}")
        if args.sheet:
            print(f"工作表: {args.sheet}")

        # Dispatch on the file suffix.
        suffix = input_path.suffix.lower()
        if suffix not in (".xlsx", ".csv"):
            raise ValueError(f"不支持的文件类型: {suffix},仅支持 .xlsx 和 .csv")
        if suffix == ".csv" and args.sheet:
            raise ValueError("CSV 文件不支持 --sheet 参数")

        shared_options = {
            "input_path": input_path,
            "output_path": output_path,
            "columns": columns,
            "model_name": args.model,
            "api_key": args.api_key,
            "dry_run": args.dry_run,
        }
        if suffix == ".xlsx":
            stats = translate_excel_file(sheet_name=args.sheet, **shared_options)
        else:
            stats = translate_csv_file(**shared_options)

        print_stats(stats, input_path)
        return 0
    except KeyboardInterrupt:
        print("\n已取消。", file=sys.stderr)
        return 130
    except Exception as exc:
        print(f"错误: {exc}", file=sys.stderr)
        return 1
# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())