73 lines
2.1 KiB
Python
73 lines
2.1 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
数据清洗模板
|
||
|
|
去除空值、格式化数据类型
|
||
|
|
"""
|
||
|
|
|
||
|
|
import pandas as pd
|
||
|
|
import sys
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
def main():
    """Clean an Excel sheet and write the result to a new Excel file.

    Steps, in order: optionally drop rows that contain missing values,
    optionally fill the remaining missing values, optionally strip
    surrounding whitespace from string cells, optionally convert date-like
    text columns to datetimes, then always drop duplicate rows. A summary of
    each step and the final column dtypes is printed to stdout.

    NOTE(review): the quoted placeholders below and the doubled braces in the
    f-strings suggest this file is rendered by a str.format-style template
    step before it is executed - confirm, and keep both conventions when
    editing (any new literal brace would need to be doubled as well).

    Raises:
        Whatever ``pd.read_excel`` / ``DataFrame.to_excel`` raise for an
        unreadable input file or an unwritable output path.
    """
    # Parameters
    file_path = "{file}"
    output_path = "{output}"

    # Cleaning options
    drop_na = "{drop_na}" == "true"  # drop rows that contain missing values
    fill_na_value = "{fill_na_value}"  # fill value; empty string means "do not fill"
    strip_whitespace = "{strip_whitespace}" == "true"  # trim both ends of string cells
    standardize_date = "{standardize_date}" == "true"  # normalize date columns

    # Load the input file
    df = pd.read_excel(file_path)

    print(f"原始数据: {{len(df)}} 行, {{len(df.columns)}} 列")

    # Drop rows with missing values
    if drop_na:
        original_rows = len(df)
        df = df.dropna()
        print(f"删除空值: 移除 {{original_rows - len(df)}} 行")

    # Fill missing values
    if fill_na_value:
        df = df.fillna(fill_na_value)
        print(f"填充空值: 使用 '{{fill_na_value}}'")

    # Strip whitespace around string cells
    if strip_whitespace:
        for col in df.select_dtypes(include=['object']).columns:
            # Strip cell by cell rather than with .str.strip(): the .str
            # accessor replaces every non-string cell of a mixed object
            # column with NaN (silent data loss) and raises AttributeError on
            # object columns that contain no strings at all.
            df[col] = df[col].map(
                lambda cell: cell.strip() if isinstance(cell, str) else cell
            )
        print(f"去除空格: 已处理所有字符串列")

    # Standardize date columns
    if standardize_date:
        for col in df.select_dtypes(include=['object']).columns:
            # errors='ignore' is deprecated in pandas; the documented
            # replacement is to call to_datetime normally and catch the
            # parsing failure explicitly.
            try:
                converted = pd.to_datetime(df[col])
            except (ValueError, TypeError, OverflowError):
                # Not parseable as dates: leave the column untouched.
                continue
            df[col] = converted
        print(f"标准化日期: 已尝试转换所有日期列")

    # Remove duplicate rows
    original_rows = len(df)
    df = df.drop_duplicates()
    if original_rows != len(df):
        print(f"去重: 移除 {{original_rows - len(df)}} 行重复数据")

    # Save the result
    df.to_excel(output_path, index=False)

    print(f"\n清洗完成")
    print(f"输出文件: {{output_path}}")
    print(f"最终数据: {{len(df)}} 行, {{len(df.columns)}} 列")

    # Show the resulting column dtypes
    print(f"\n数据类型:")
    for col, dtype in df.dtypes.items():
        print(f"  {{col}}: {{dtype}}")
|
||
|
|
|
||
|
|
# Script entry point: run the cleaning pipeline only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    main()
|