excel-toolkit/scripts/replace_cells.py

250 lines
7.6 KiB
Python
Executable File

#!/usr/bin/env python3
"""Replace cell contents in Excel and CSV files."""
from __future__ import annotations
import argparse
import re
import sys
from pathlib import Path
from typing import Any
try:
import pandas as pd
except ImportError:
pd = None
SUPPORTED_SUFFIXES = {".xlsx", ".csv"}
def build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
description="Replace cell content in Excel or CSV files."
)
parser.add_argument("--input", required=True, help="Input Excel/CSV file path")
parser.add_argument("--search", required=True, help="Text or pattern to search for")
parser.add_argument("--replace", required=True, help="Replacement text")
parser.add_argument(
"--output",
help="Output file path. Defaults to overwriting the input file.",
)
parser.add_argument(
"--sheet",
help="Sheet name for Excel files. Defaults to the first sheet.",
)
parser.add_argument(
"--regex",
action="store_true",
help="Treat --search as a regular expression.",
)
parser.add_argument(
"--column",
help="Specific column name to update. Defaults to all columns.",
)
parser.add_argument(
"--case",
action="store_true",
help="Use case-sensitive matching. Default is case-insensitive.",
)
return parser
def fail(message: str, exit_code: int = 1) -> int:
print(f"Error: {message}", file=sys.stderr)
return exit_code
def require_dependencies(file_type: str) -> None:
if pd is None:
package_hint = "pandas openpyxl" if file_type == "excel" else "pandas"
raise RuntimeError(
f"Missing required dependencies. Install with: pip install {package_hint}"
)
def compile_pattern(search: str, use_regex: bool, case_sensitive: bool) -> re.Pattern[str]:
pattern_text = search if use_regex else re.escape(search)
flags = 0 if case_sensitive else re.IGNORECASE
try:
return re.compile(pattern_text, flags)
except re.error as exc:
raise ValueError(f"Invalid regular expression: {exc}") from exc
def replace_value(
value: Any,
pattern: re.Pattern[str],
replacement: str,
) -> tuple[Any, int]:
if value is None:
return value, 0
if not isinstance(value, str):
return value, 0
new_value, count = pattern.subn(replacement, value)
if count == 0:
return value, 0
return new_value, count
def target_columns(dataframe: "pd.DataFrame", column_name: str | None) -> list[str]:
if column_name is None:
return list(dataframe.columns)
if column_name not in dataframe.columns:
available = ", ".join(str(col) for col in dataframe.columns)
raise KeyError(
f"Column '{column_name}' not found. Available columns: {available or '(none)'}"
)
return [column_name]
def replace_in_dataframe(
dataframe: "pd.DataFrame",
pattern: re.Pattern[str],
replacement: str,
column_name: str | None,
) -> int:
replacements = 0
for column in target_columns(dataframe, column_name):
new_values = []
for value in dataframe[column].tolist():
new_value, count = replace_value(value, pattern, replacement)
new_values.append(new_value)
replacements += count
dataframe[column] = new_values
return replacements
def read_csv(input_path: Path) -> "pd.DataFrame":
return pd.read_csv(input_path, dtype=object, keep_default_na=False, encoding="utf-8-sig")
def write_csv(dataframe: "pd.DataFrame", output_path: Path) -> None:
dataframe.to_csv(output_path, index=False, encoding="utf-8-sig")
def load_excel_sheets(input_path: Path) -> dict[str, "pd.DataFrame"]:
with pd.ExcelFile(input_path) as workbook:
return {
sheet_name: pd.read_excel(
workbook,
sheet_name=sheet_name,
dtype=object,
keep_default_na=False,
)
for sheet_name in workbook.sheet_names
}
def resolve_sheet_name(sheet_names: list[str], requested_sheet: str | None) -> str:
if not sheet_names:
raise ValueError("The Excel file does not contain any sheets.")
if requested_sheet is None:
return sheet_names[0]
if requested_sheet not in sheet_names:
available = ", ".join(sheet_names)
raise KeyError(
f"Sheet '{requested_sheet}' not found. Available sheets: {available}"
)
return requested_sheet
def write_excel(sheets: dict[str, "pd.DataFrame"], output_path: Path) -> None:
with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
for sheet_name, dataframe in sheets.items():
dataframe.to_excel(writer, sheet_name=sheet_name, index=False)
def process_csv(
input_path: Path,
output_path: Path,
pattern: re.Pattern[str],
replacement: str,
column_name: str | None,
) -> int:
require_dependencies("csv")
dataframe = read_csv(input_path)
replacements = replace_in_dataframe(dataframe, pattern, replacement, column_name)
write_csv(dataframe, output_path)
return replacements
def process_excel(
input_path: Path,
output_path: Path,
sheet_name: str | None,
pattern: re.Pattern[str],
replacement: str,
column_name: str | None,
) -> tuple[int, str]:
require_dependencies("excel")
sheets = load_excel_sheets(input_path)
selected_sheet = resolve_sheet_name(list(sheets.keys()), sheet_name)
replacements = replace_in_dataframe(
sheets[selected_sheet], pattern, replacement, column_name
)
write_excel(sheets, output_path)
return replacements, selected_sheet
def main() -> int:
args = build_parser().parse_args()
input_path = Path(args.input).expanduser()
output_path = Path(args.output).expanduser() if args.output else input_path
if not input_path.exists():
return fail(f"Input file not found: {input_path}")
if not input_path.is_file():
return fail(f"Input path is not a file: {input_path}")
if input_path.suffix.lower() not in SUPPORTED_SUFFIXES:
return fail(
"Unsupported file type. Only .xlsx and .csv files are supported."
)
try:
output_path.parent.mkdir(parents=True, exist_ok=True)
except OSError as exc:
return fail(f"Unable to create output directory: {exc}")
try:
pattern = compile_pattern(args.search, args.regex, args.case)
if input_path.suffix.lower() == ".csv":
replacements = process_csv(
input_path=input_path,
output_path=output_path,
pattern=pattern,
replacement=args.replace,
column_name=args.column,
)
print(
f"Completed. Replacements made: {replacements}. Output: {output_path}"
)
return 0
replacements, selected_sheet = process_excel(
input_path=input_path,
output_path=output_path,
sheet_name=args.sheet,
pattern=pattern,
replacement=args.replace,
column_name=args.column,
)
print(
"Completed. "
f"Sheet: {selected_sheet}. Replacements made: {replacements}. "
f"Output: {output_path}"
)
return 0
except (ValueError, KeyError, RuntimeError) as exc:
return fail(str(exc))
except PermissionError as exc:
return fail(f"Permission denied: {exc}")
except Exception as exc:
return fail(f"Unexpected failure while processing the file: {exc}")
if __name__ == "__main__":
sys.exit(main())