#!/usr/bin/env python3
"""
数据清洗模板
去除空值、格式化数据类型
"""

import pandas as pd
import sys
from pathlib import Path

def main():
    """Clean an Excel sheet and write the result to a new Excel file.

    Steps, in order: optionally drop rows containing missing values,
    optionally fill the remaining missing values, optionally strip
    surrounding whitespace from text cells, optionally convert date-like
    text columns to datetimes, and always drop duplicate rows. Progress
    messages and the final column dtypes are printed to stdout.

    NOTE(review): the quoted placeholder tokens below and the doubled curly
    braces inside the f-strings suggest this file is a template rendered
    with str.format-style substitution before it runs -- confirm against
    the renderer. For that reason this function deliberately contains no
    single curly braces outside the placeholders themselves.
    """
    # Parameters (placeholders are expected to be substituted before running)
    file_path = "{file}"
    output_path = "{output}"

    # Cleaning options
    drop_na = "{drop_na}" == "true"  # drop rows that contain missing values
    fill_na_value = "{fill_na_value}"  # value used to fill missing cells (empty string disables filling)
    strip_whitespace = "{strip_whitespace}" == "true"  # strip whitespace at both ends of strings
    standardize_date = "{standardize_date}" == "true"  # convert date-like text columns to datetimes

    # Load the input workbook
    df = pd.read_excel(file_path)

    print(f"原始数据: {{len(df)}} 行, {{len(df.columns)}} 列")

    # Drop rows with missing values
    if drop_na:
        original_rows = len(df)
        df = df.dropna()
        print(f"删除空值: 移除 {{original_rows - len(df)}} 行")

    # Fill missing values
    if fill_na_value:
        df = df.fillna(fill_na_value)
        print(f"填充空值: 使用 '{{fill_na_value}}'")

    # Strip whitespace at both ends of string cells
    if strip_whitespace:
        for col in df.select_dtypes(include=['object']).columns:
            # Object columns may mix text with numbers or other values. The
            # .str accessor would replace every non-string cell with NaN, so
            # strip only real strings and pass all other values through.
            df[col] = df[col].map(
                lambda cell: cell.strip() if isinstance(cell, str) else cell
            )
        print(f"去除空格: 已处理所有字符串列")

    # Convert date-like text columns to datetimes
    if standardize_date:
        for col in df.select_dtypes(include=['object']).columns:
            # errors='ignore' is deprecated in pandas, so parse strictly and
            # keep the original column whenever parsing fails.
            try:
                converted = pd.to_datetime(df[col])
            except (ValueError, TypeError, OverflowError):
                continue
            df[col] = converted
        print(f"标准化日期: 已尝试转换所有日期列")

    # Remove duplicate rows
    original_rows = len(df)
    df = df.drop_duplicates()
    if original_rows != len(df):
        print(f"去重: 移除 {{original_rows - len(df)}} 行重复数据")

    # Save the result
    df.to_excel(output_path, index=False)

    print(f"\n清洗完成")
    print(f"输出文件: {{output_path}}")
    print(f"最终数据: {{len(df)}} 行, {{len(df.columns)}} 列")

    # Show the resulting column dtypes
    print(f"\n数据类型:")
    for col, dtype in df.dtypes.items():
        print(f"  {{col}}: {{dtype}}")

# Run the cleaning pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()