250 lines
7.6 KiB
Python
Executable File
250 lines
7.6 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""Replace cell contents in Excel and CSV files."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
try:
|
|
import pandas as pd
|
|
except ImportError:
|
|
pd = None
|
|
|
|
|
|
SUPPORTED_SUFFIXES = {".xlsx", ".csv"}
|
|
|
|
|
|
def build_parser() -> argparse.ArgumentParser:
|
|
parser = argparse.ArgumentParser(
|
|
description="Replace cell content in Excel or CSV files."
|
|
)
|
|
parser.add_argument("--input", required=True, help="Input Excel/CSV file path")
|
|
parser.add_argument("--search", required=True, help="Text or pattern to search for")
|
|
parser.add_argument("--replace", required=True, help="Replacement text")
|
|
parser.add_argument(
|
|
"--output",
|
|
help="Output file path. Defaults to overwriting the input file.",
|
|
)
|
|
parser.add_argument(
|
|
"--sheet",
|
|
help="Sheet name for Excel files. Defaults to the first sheet.",
|
|
)
|
|
parser.add_argument(
|
|
"--regex",
|
|
action="store_true",
|
|
help="Treat --search as a regular expression.",
|
|
)
|
|
parser.add_argument(
|
|
"--column",
|
|
help="Specific column name to update. Defaults to all columns.",
|
|
)
|
|
parser.add_argument(
|
|
"--case",
|
|
action="store_true",
|
|
help="Use case-sensitive matching. Default is case-insensitive.",
|
|
)
|
|
return parser
|
|
|
|
|
|
def fail(message: str, exit_code: int = 1) -> int:
|
|
print(f"Error: {message}", file=sys.stderr)
|
|
return exit_code
|
|
|
|
|
|
def require_dependencies(file_type: str) -> None:
|
|
if pd is None:
|
|
package_hint = "pandas openpyxl" if file_type == "excel" else "pandas"
|
|
raise RuntimeError(
|
|
f"Missing required dependencies. Install with: pip install {package_hint}"
|
|
)
|
|
|
|
|
|
def compile_pattern(search: str, use_regex: bool, case_sensitive: bool) -> re.Pattern[str]:
|
|
pattern_text = search if use_regex else re.escape(search)
|
|
flags = 0 if case_sensitive else re.IGNORECASE
|
|
try:
|
|
return re.compile(pattern_text, flags)
|
|
except re.error as exc:
|
|
raise ValueError(f"Invalid regular expression: {exc}") from exc
|
|
|
|
|
|
def replace_value(
|
|
value: Any,
|
|
pattern: re.Pattern[str],
|
|
replacement: str,
|
|
) -> tuple[Any, int]:
|
|
if value is None:
|
|
return value, 0
|
|
if not isinstance(value, str):
|
|
return value, 0
|
|
|
|
new_value, count = pattern.subn(replacement, value)
|
|
if count == 0:
|
|
return value, 0
|
|
return new_value, count
|
|
|
|
|
|
def target_columns(dataframe: "pd.DataFrame", column_name: str | None) -> list[str]:
|
|
if column_name is None:
|
|
return list(dataframe.columns)
|
|
if column_name not in dataframe.columns:
|
|
available = ", ".join(str(col) for col in dataframe.columns)
|
|
raise KeyError(
|
|
f"Column '{column_name}' not found. Available columns: {available or '(none)'}"
|
|
)
|
|
return [column_name]
|
|
|
|
|
|
def replace_in_dataframe(
|
|
dataframe: "pd.DataFrame",
|
|
pattern: re.Pattern[str],
|
|
replacement: str,
|
|
column_name: str | None,
|
|
) -> int:
|
|
replacements = 0
|
|
for column in target_columns(dataframe, column_name):
|
|
new_values = []
|
|
for value in dataframe[column].tolist():
|
|
new_value, count = replace_value(value, pattern, replacement)
|
|
new_values.append(new_value)
|
|
replacements += count
|
|
dataframe[column] = new_values
|
|
return replacements
|
|
|
|
|
|
def read_csv(input_path: Path) -> "pd.DataFrame":
|
|
return pd.read_csv(input_path, dtype=object, keep_default_na=False, encoding="utf-8-sig")
|
|
|
|
|
|
def write_csv(dataframe: "pd.DataFrame", output_path: Path) -> None:
|
|
dataframe.to_csv(output_path, index=False, encoding="utf-8-sig")
|
|
|
|
|
|
def load_excel_sheets(input_path: Path) -> dict[str, "pd.DataFrame"]:
|
|
with pd.ExcelFile(input_path) as workbook:
|
|
return {
|
|
sheet_name: pd.read_excel(
|
|
workbook,
|
|
sheet_name=sheet_name,
|
|
dtype=object,
|
|
keep_default_na=False,
|
|
)
|
|
for sheet_name in workbook.sheet_names
|
|
}
|
|
|
|
|
|
def resolve_sheet_name(sheet_names: list[str], requested_sheet: str | None) -> str:
|
|
if not sheet_names:
|
|
raise ValueError("The Excel file does not contain any sheets.")
|
|
if requested_sheet is None:
|
|
return sheet_names[0]
|
|
if requested_sheet not in sheet_names:
|
|
available = ", ".join(sheet_names)
|
|
raise KeyError(
|
|
f"Sheet '{requested_sheet}' not found. Available sheets: {available}"
|
|
)
|
|
return requested_sheet
|
|
|
|
|
|
def write_excel(sheets: dict[str, "pd.DataFrame"], output_path: Path) -> None:
|
|
with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
|
|
for sheet_name, dataframe in sheets.items():
|
|
dataframe.to_excel(writer, sheet_name=sheet_name, index=False)
|
|
|
|
|
|
def process_csv(
|
|
input_path: Path,
|
|
output_path: Path,
|
|
pattern: re.Pattern[str],
|
|
replacement: str,
|
|
column_name: str | None,
|
|
) -> int:
|
|
require_dependencies("csv")
|
|
dataframe = read_csv(input_path)
|
|
replacements = replace_in_dataframe(dataframe, pattern, replacement, column_name)
|
|
write_csv(dataframe, output_path)
|
|
return replacements
|
|
|
|
|
|
def process_excel(
|
|
input_path: Path,
|
|
output_path: Path,
|
|
sheet_name: str | None,
|
|
pattern: re.Pattern[str],
|
|
replacement: str,
|
|
column_name: str | None,
|
|
) -> tuple[int, str]:
|
|
require_dependencies("excel")
|
|
sheets = load_excel_sheets(input_path)
|
|
selected_sheet = resolve_sheet_name(list(sheets.keys()), sheet_name)
|
|
replacements = replace_in_dataframe(
|
|
sheets[selected_sheet], pattern, replacement, column_name
|
|
)
|
|
write_excel(sheets, output_path)
|
|
return replacements, selected_sheet
|
|
|
|
|
|
def main() -> int:
|
|
args = build_parser().parse_args()
|
|
|
|
input_path = Path(args.input).expanduser()
|
|
output_path = Path(args.output).expanduser() if args.output else input_path
|
|
|
|
if not input_path.exists():
|
|
return fail(f"Input file not found: {input_path}")
|
|
if not input_path.is_file():
|
|
return fail(f"Input path is not a file: {input_path}")
|
|
if input_path.suffix.lower() not in SUPPORTED_SUFFIXES:
|
|
return fail(
|
|
"Unsupported file type. Only .xlsx and .csv files are supported."
|
|
)
|
|
|
|
try:
|
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
except OSError as exc:
|
|
return fail(f"Unable to create output directory: {exc}")
|
|
|
|
try:
|
|
pattern = compile_pattern(args.search, args.regex, args.case)
|
|
if input_path.suffix.lower() == ".csv":
|
|
replacements = process_csv(
|
|
input_path=input_path,
|
|
output_path=output_path,
|
|
pattern=pattern,
|
|
replacement=args.replace,
|
|
column_name=args.column,
|
|
)
|
|
print(
|
|
f"Completed. Replacements made: {replacements}. Output: {output_path}"
|
|
)
|
|
return 0
|
|
|
|
replacements, selected_sheet = process_excel(
|
|
input_path=input_path,
|
|
output_path=output_path,
|
|
sheet_name=args.sheet,
|
|
pattern=pattern,
|
|
replacement=args.replace,
|
|
column_name=args.column,
|
|
)
|
|
print(
|
|
"Completed. "
|
|
f"Sheet: {selected_sheet}. Replacements made: {replacements}. "
|
|
f"Output: {output_path}"
|
|
)
|
|
return 0
|
|
except (ValueError, KeyError, RuntimeError) as exc:
|
|
return fail(str(exc))
|
|
except PermissionError as exc:
|
|
return fail(f"Permission denied: {exc}")
|
|
except Exception as exc:
|
|
return fail(f"Unexpected failure while processing the file: {exc}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
sys.exit(main())
|