excel-toolkit/script_templates/data_clean.py

#!/usr/bin/env python3
"""
数据清洗模板
去除空值、格式化数据类型
"""

import pandas as pd
import sys
from pathlib import Path

def main():
    """Clean one spreadsheet and write the cleaned copy to a new file.

    The quoted placeholder strings at the top of the function are template
    parameters that are filled in before the script is executed.
    NOTE(review): the doubled braces in the f-strings suggest str.format-style
    substitution, so avoid single braces anywhere in this function, comments
    included - confirm against the code that renders this template.

    Steps, in order:
      1. Load the input (CSV when the path ends in .csv, otherwise Excel).
      2. Optionally drop every row that contains a missing value.
      3. Optionally fill the remaining missing values with a fixed value.
      4. Optionally strip leading/trailing whitespace from string cells.
      5. Optionally convert object columns that parse cleanly to datetimes.
      6. Always drop fully duplicated rows.
      7. Write the result (CSV or Excel, chosen by the output extension)
         and print a summary including the final column dtypes.

    Errors raised by pandas while reading or writing the files are not
    caught here and propagate to the caller.
    """
    # Template parameters.
    file_path = "{file}"
    output_path = "{output}"

    # Cleaning options.
    drop_na = "{drop_na}" == "true"  # drop rows containing any missing value
    fill_na_value = "{fill_na_value}"  # fill value; an empty string disables filling
    strip_whitespace = "{strip_whitespace}" == "true"  # trim string cells
    standardize_date = "{standardize_date}" == "true"  # parse date-like columns

    # Load the input. read_excel cannot open CSV files, so dispatch on the
    # file extension; anything that is not .csv keeps the Excel reader.
    if Path(file_path).suffix.lower() == ".csv":
        df = pd.read_csv(file_path)
    else:
        df = pd.read_excel(file_path)

    print(f"原始数据: {{len(df)}} 行, {{len(df.columns)}} 列")

    # Drop rows with missing values.
    if drop_na:
        original_rows = len(df)
        df = df.dropna()
        print(f"删除空值: 移除 {{original_rows - len(df)}} 行")

    # Fill missing values.
    if fill_na_value:
        df = df.fillna(fill_na_value)
        print(f"填充空值: 使用 '{{fill_na_value}}'")

    # Strip surrounding whitespace from string cells.
    if strip_whitespace:
        for col in df.select_dtypes(include=['object']).columns:
            # Series.str.strip() would replace every non-string cell in a
            # mixed object column with NaN, so strip value by value and pass
            # everything that is not a str through unchanged.
            df[col] = df[col].map(
                lambda cell: cell.strip() if isinstance(cell, str) else cell
            )
        print("去除空格: 已处理所有字符串列")

    # Standardize dates: best effort, a column is only replaced when every
    # value in it parses.
    if standardize_date:
        for col in df.select_dtypes(include=['object']).columns:
            try:
                parsed = pd.to_datetime(df[col])
            except (ValueError, TypeError, OverflowError):
                # Not a date column; keep the original values.
                continue
            df[col] = parsed
        print("标准化日期: 已尝试转换所有日期列")

    # Remove duplicated rows.
    original_rows = len(df)
    df = df.drop_duplicates()
    if original_rows != len(df):
        print(f"去重: 移除 {{original_rows - len(df)}} 行重复数据")

    # Save the result, mirroring the extension-based dispatch used for input.
    if Path(output_path).suffix.lower() == ".csv":
        df.to_csv(output_path, index=False)
    else:
        df.to_excel(output_path, index=False)

    print("\n清洗完成")
    print(f"输出文件: {{output_path}}")
    print(f"最终数据: {{len(df)}} 行, {{len(df.columns)}} 列")

    # Show the resulting column dtypes.
    print("\n数据类型:")
    for col, dtype in df.dtypes.items():
        print(f"  {{col}}: {{dtype}}")

# Run the cleaning pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Initial commit: excel-toolkit skill

Excel 文件智能处理工具：
- 基础操作：读取/合并/筛选/替换/批量处理
- 自扩展能力：根据自然语言自动生成脚本
- 预置模板：货币转换/数据透视/数据清洗/列计算
- 支持 .xlsx 和 .csv 格式

2026-03-11 04:20:00 +00:00

			`#!/usr/bin/env python3`
			`"""`
			`数据清洗模板`
			`去除空值、格式化数据类型`
			`"""`

			`import pandas as pd`
			`import sys`
			`from pathlib import Path`

			`def main():`
			`# 参数配置`
			`file_path = "{file}"`
			`output_path = "{output}"`

			`# 清洗选项`
			`drop_na = "{drop_na}" == "true" # 删除包含空值的行`
			`fill_na_value = "{fill_na_value}" # 填充空值的值（空字符串表示不填充）`
			`strip_whitespace = "{strip_whitespace}" == "true" # 去除字符串两端的空格`
			`standardize_date = "{standardize_date}" == "true" # 标准化日期格式`

			`# 读取文件`
			`df = pd.read_excel(file_path)`

			`print(f"原始数据: {{len(df)}} 行, {{len(df.columns)}} 列")`

			`# 删除空值`
			`if drop_na:`
			`original_rows = len(df)`
			`df = df.dropna()`
			`print(f"删除空值: 移除 {{original_rows - len(df)}} 行")`

			`# 填充空值`
			`if fill_na_value:`
			`df = df.fillna(fill_na_value)`
			`print(f"填充空值: 使用 '{{fill_na_value}}'")`

			`# 去除字符串两端的空格`
			`if strip_whitespace:`
			`for col in df.select_dtypes(include=['object']).columns:`
			`df[col] = df[col].str.strip()`
			`print(f"去除空格: 已处理所有字符串列")`

			`# 标准化日期格式`
			`if standardize_date:`
			`for col in df.select_dtypes(include=['object']).columns:`
			`try:`
			`df[col] = pd.to_datetime(df[col], errors='ignore')`
			`except:`
			`pass`
			`print(f"标准化日期: 已尝试转换所有日期列")`

			`# 去重`
			`original_rows = len(df)`
			`df = df.drop_duplicates()`
			`if original_rows != len(df):`
			`print(f"去重: 移除 {{original_rows - len(df)}} 行重复数据")`

			`# 保存结果`
			`df.to_excel(output_path, index=False)`

			`print(f"\n清洗完成")`
			`print(f"输出文件: {{output_path}}")`
			`print(f"最终数据: {{len(df)}} 行, {{len(df.columns)}} 列")`

			`# 显示数据类型`
			`print(f"\n数据类型:")`
			`for col, dtype in df.dtypes.items():`
			`print(f" {{col}}: {{dtype}}")`

			`if __name__ == "__main__":`
			`main()`