Add initial report generator
raport_convert.py (new file, 432 lines)
@@ -0,0 +1,432 @@
#!/usr/bin/env python3
import pandas as pd
import re
import argparse
import os
import sys
from pathlib import Path
from typing import Optional, List


class PGEDataProcessor:
    """
    Class for processing PGE energy meter reading CSV reports.
    Converts wide-format hourly data into time-aggregated outputs.
    """

    HOUR_COLUMNS = [f"H{str(i).zfill(2)}" for i in range(1, 25)]

    def __init__(self, input_file: str, separator: str = ";", encoding: str = "utf-8"):
        """
        Initialize the PGE data processor.

        Args:
            input_file: Path to the input CSV file
            separator: CSV separator (default: ';')
            encoding: File encoding (default: 'utf-8', use 'cp1250' for Polish chars)
        """
        self.input_file = Path(input_file)
        self.separator = separator
        self.encoding = encoding
        self.raw_data: Optional[pd.DataFrame] = None
        self.processed_data: Optional[pd.DataFrame] = None
        self.energy_types: Optional[List[str]] = None

        if not self.input_file.exists():
            raise FileNotFoundError(f"Input file not found: {input_file}")

    @staticmethod
    def fix_value_string(val) -> str:
        """
        Fix malformed kWh strings like ',123', '-,123', '-,' etc.
        Returns a clean string ready for float conversion.
        """
        if pd.isna(val):
            return "0"

        s = str(val).strip()

        # Remove spaces, unexpected symbols
        s = s.replace(" ", "")

        # Convert comma decimal separator to dot
        s = s.replace(",", ".")

        # Case 1: starts with '.' → missing leading zero
        # '.123' => '0.123'
        if re.match(r"^\.\d+", s):
            s = "0" + s

        # Case 2: negative and missing zero
        # '-.123' => '-0.123'
        if re.match(r"^-\.\d+", s):
            s = s.replace("-.", "-0.")

        # Case 3: leftover sign/separator characters with no digits
        # (after the comma→dot conversion above, inputs like '-,' arrive
        # here as '-.') — treat them all as a malformed zero
        if s in {"", "-", ".", "-."}:
            return "0"

        return s
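    # Illustrative round-trips through fix_value_string (inputs are made-up
    # malformed values of the kinds listed in the docstring):
    #   fix_value_string(",123")   -> "0.123"
    #   fix_value_string("-,123")  -> "-0.123"
    #   fix_value_string("-,")     -> "0"
    #   fix_value_string("1,5")    -> "1.5"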

    def load_data(self) -> pd.DataFrame:
        """
        Load and validate the input CSV data.

        Returns:
            Raw DataFrame from the CSV file
        """
        print(f"Loading data from: {self.input_file}")

        try:
            self.raw_data = pd.read_csv(
                self.input_file,
                sep=self.separator,
                encoding=self.encoding
            )
        except UnicodeDecodeError:
            print(f"Warning: {self.encoding} encoding failed, trying cp1250...")
            self.raw_data = pd.read_csv(
                self.input_file,
                sep=self.separator,
                encoding="cp1250"
            )
            self.encoding = "cp1250"

        # Validate required columns
        missing_hours = [c for c in self.HOUR_COLUMNS if c not in self.raw_data.columns]
        if missing_hours:
            raise ValueError(f"Missing hour columns in input: {missing_hours}")

        required_cols = ["DataOdczytu", "Kierunek"]
        missing_required = [c for c in required_cols if c not in self.raw_data.columns]
        if missing_required:
            raise ValueError(f"Missing required columns: {missing_required}")

        print(f"Loaded {len(self.raw_data)} rows with {len(self.raw_data.columns)} columns")

        # Detect unique energy types from Kierunek column
        self.energy_types = sorted(self.raw_data["Kierunek"].unique().tolist())
        print(f"Detected energy types: {self.energy_types}")

        return self.raw_data

    def process_data(self) -> pd.DataFrame:
        """
        Process the raw data: fix malformed values, convert to long format, add timestamps.

        Returns:
            Processed DataFrame in long format with timestamps
        """
        if self.raw_data is None:
            self.load_data()

        print("Processing data...")
        df = self.raw_data.copy()

        # Fix malformed values in hour columns
        for col in self.HOUR_COLUMNS:
            df[col] = df[col].apply(self.fix_value_string)
            df[col] = df[col].astype(float)

        # Convert to long format: one row per day + type + hour
        df_long = df.melt(
            id_vars=["DataOdczytu", "Kierunek"],
            value_vars=self.HOUR_COLUMNS,
            var_name="HourCol",
            value_name="Value_kWh"
        )

        # Extract hour number (1–24)
        df_long["Hour"] = df_long["HourCol"].str[1:].astype(int)

        # Convert date from YYYYMMDD format
        df_long["Date"] = pd.to_datetime(
            df_long["DataOdczytu"].astype(str),
            format="%Y%m%d"
        )

        # Create full timestamp: H01 → 00:00, H24 → 23:00
        df_long["Timestamp"] = df_long["Date"] + pd.to_timedelta(
            df_long["Hour"] - 1, unit="h"
        )
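        # Worked example (hypothetical values): the wide row
        #   DataOdczytu=20240101, Kierunek=X, H01=0.5, ..., H24=0.2
        # is now 24 long rows:
        #   Timestamp=2024-01-01 00:00, Value_kWh=0.5
        #   ...
        #   Timestamp=2024-01-01 23:00, Value_kWh=0.2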

        self.processed_data = df_long
        print(f"Processed data: {len(df_long)} rows")
        return df_long

    def generate_hourly_data(self) -> pd.DataFrame:
        """
        Generate hourly aggregated data with energy types as columns.

        Returns:
            DataFrame with hourly data (one row per hour, columns for energy types)
        """
        if self.processed_data is None:
            self.process_data()

        print("Generating hourly data...")

        # Pivot to wide format with timestamp index
        df_hourly = self.processed_data.pivot_table(
            index="Timestamp",
            columns="Kierunek",
            values="Value_kWh",
            aggfunc="sum"
        ).sort_index()

        # Reset index and format timestamp
        df_hourly_out = df_hourly.reset_index()
        df_hourly_out["Timestamp"] = df_hourly_out["Timestamp"].dt.strftime("%Y-%m-%d %H:%M")

        # Ensure consistent column ordering
        available_types = [t for t in self.energy_types if t in df_hourly_out.columns]
        columns = ["Timestamp"] + available_types
        df_hourly_out = df_hourly_out[columns]

        return df_hourly_out
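    # Example hourly output written with the default ';' separator (the
    # energy-type column names below are hypothetical placeholders):
    #   Timestamp;TypeA;TypeB
    #   2024-01-01 00:00;0.500;0.000
    #   2024-01-01 01:00;0.425;0.010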

    def generate_daily_data(self) -> pd.DataFrame:
        """
        Generate daily aggregated data.

        Returns:
            DataFrame with daily totals
        """
        if self.processed_data is None:
            self.process_data()

        print("Generating daily data...")

        # Group by date and energy type, sum the values
        df_daily = (
            self.processed_data
            .groupby(["Date", "Kierunek"])["Value_kWh"]
            .sum()
            .reset_index()
        )

        # Format date and pivot
        df_daily["Date_str"] = df_daily["Date"].dt.strftime("%Y-%m-%d")

        df_daily_pivot = df_daily.pivot_table(
            index="Date_str",
            columns="Kierunek",
            values="Value_kWh",
            aggfunc="sum"
        ).sort_index()

        df_daily_out = df_daily_pivot.reset_index().rename(columns={"Date_str": "Date"})

        # Ensure consistent column ordering
        available_types = [t for t in self.energy_types if t in df_daily_out.columns]
        columns = ["Date"] + available_types
        df_daily_out = df_daily_out[columns]

        return df_daily_out

    def generate_weekly_data(self) -> pd.DataFrame:
        """
        Generate weekly aggregated data (weeks starting Monday).

        Returns:
            DataFrame with weekly totals
        """
        if self.processed_data is None:
            self.process_data()

        print("Generating weekly data...")

        # Set timestamp as index for resampling
        df_ts = self.processed_data.set_index("Timestamp")

        # Resample by week starting Monday
        df_weekly = (
            df_ts
            .groupby("Kierunek")
            .resample("W-MON", label="left", closed="left")["Value_kWh"]
            .sum()
            .reset_index()
        )
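        # Note on the resample rule: 'W-MON' anchors bins on Mondays, and
        # label='left'/closed='left' makes each bin cover [Monday, next
        # Monday) while being labeled with the Monday it starts on.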

        # Format week start date
        df_weekly["WeekStart"] = df_weekly["Timestamp"].dt.strftime("%Y-%m-%d")

        # Pivot to wide format
        df_weekly_pivot = df_weekly.pivot_table(
            index="WeekStart",
            columns="Kierunek",
            values="Value_kWh",
            aggfunc="sum"
        ).sort_index()

        df_weekly_out = df_weekly_pivot.reset_index()

        # Ensure consistent column ordering
        available_types = [t for t in self.energy_types if t in df_weekly_out.columns]
        columns = ["WeekStart"] + available_types
        df_weekly_out = df_weekly_out[columns]

        return df_weekly_out

    def generate_monthly_data(self) -> pd.DataFrame:
        """
        Generate monthly aggregated data.

        Returns:
            DataFrame with monthly totals
        """
        if self.processed_data is None:
            self.process_data()

        print("Generating monthly data...")

        # Set timestamp as index for resampling
        df_ts = self.processed_data.set_index("Timestamp")

        # Resample by month start
        df_monthly = (
            df_ts
            .groupby("Kierunek")
            .resample("MS")["Value_kWh"]  # MS = Month Start
            .sum()
            .reset_index()
        )

        # Format month
        df_monthly["Month"] = df_monthly["Timestamp"].dt.strftime("%Y-%m")

        # Pivot to wide format
        df_monthly_pivot = df_monthly.pivot_table(
            index="Month",
            columns="Kierunek",
            values="Value_kWh",
            aggfunc="sum"
        ).sort_index()

        df_monthly_out = df_monthly_pivot.reset_index()

        # Ensure consistent column ordering
        available_types = [t for t in self.energy_types if t in df_monthly_out.columns]
        columns = ["Month"] + available_types
        df_monthly_out = df_monthly_out[columns]

        return df_monthly_out

    def save_data(self, data: pd.DataFrame, output_file: str) -> None:
        """
        Save DataFrame to CSV with consistent formatting.

        Args:
            data: DataFrame to save
            output_file: Output file path
        """
        output_path = Path(output_file)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        data.to_csv(
            output_path,
            sep=self.separator,
            encoding=self.encoding,
            index=False,
            float_format="%.3f"
        )
        print(f"Saved: {output_path}")

    def get_output_filename(self, suffix: str) -> str:
        """
        Generate output filename based on input filename and suffix.

        Args:
            suffix: Suffix to append (e.g., '-hourly', '-daily')

        Returns:
            Output filename
        """
        stem = self.input_file.stem
        extension = self.input_file.suffix
        return str(self.input_file.parent / f"{stem}{suffix}{extension}")
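    # e.g. with input '/data/odczyty.csv' (an illustrative path),
    # get_output_filename("-daily") returns '/data/odczyty-daily.csv'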


def main():
    parser = argparse.ArgumentParser(
        description="Convert PGE energy meter CSV reports to time-aggregated formats"
    )
    parser.add_argument(
        "input_file",
        help="Path to the input PGE CSV file"
    )
    parser.add_argument(
        "--separator", "-s",
        default=";",
        help="CSV separator (default: ';')"
    )
    parser.add_argument(
        "--encoding", "-e",
        default="utf-8",
        help="File encoding (default: 'utf-8', use 'cp1250' for Polish characters)"
    )
    parser.add_argument(
        "--output-dir", "-o",
        help="Output directory (default: same as input file)"
    )

    args = parser.parse_args()

    try:
        # Initialize processor
        processor = PGEDataProcessor(
            input_file=args.input_file,
            separator=args.separator,
            encoding=args.encoding
        )

        # Process data
        processor.load_data()
        processor.process_data()

        # Generate and save hourly data
        hourly_data = processor.generate_hourly_data()
        hourly_output = processor.get_output_filename("-hourly")
        if args.output_dir:
            hourly_output = os.path.join(args.output_dir, os.path.basename(hourly_output))
        processor.save_data(hourly_data, hourly_output)

        # Generate and save daily data
        daily_data = processor.generate_daily_data()
        daily_output = processor.get_output_filename("-daily")
        if args.output_dir:
            daily_output = os.path.join(args.output_dir, os.path.basename(daily_output))
        processor.save_data(daily_data, daily_output)

        # Generate and save weekly data
        weekly_data = processor.generate_weekly_data()
        weekly_output = processor.get_output_filename("-weekly")
        if args.output_dir:
            weekly_output = os.path.join(args.output_dir, os.path.basename(weekly_output))
        processor.save_data(weekly_data, weekly_output)

        # Generate and save monthly data
        monthly_data = processor.generate_monthly_data()
        monthly_output = processor.get_output_filename("-monthly")
        if args.output_dir:
            monthly_output = os.path.join(args.output_dir, os.path.basename(monthly_output))
        processor.save_data(monthly_data, monthly_output)

        print("\nConversion completed successfully!")

    except Exception as e:
        print(f"Error: {e}")
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())
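
# Example invocation (the input file name is illustrative):
#   python raport_convert.py odczyty.csv --encoding cp1250 -o out/
# This writes odczyty-hourly.csv, odczyty-daily.csv, odczyty-weekly.csv
# and odczyty-monthly.csv into out/.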