Add initial report generator
raport_convert.py (new file, 432 lines)
@@ -0,0 +1,432 @@
#!/usr/bin/env python3
import pandas as pd
import re
import argparse
import os
import sys
from pathlib import Path
from typing import Optional, List


class PGEDataProcessor:
    """
    Class for processing PGE energy meter reading CSV reports.
    Converts wide-format hourly data into time-aggregated outputs.
    """

    HOUR_COLUMNS = [f"H{str(i).zfill(2)}" for i in range(1, 25)]

    def __init__(self, input_file: str, separator: str = ";", encoding: str = "utf-8"):
        """
        Initialize the PGE data processor.

        Args:
            input_file: Path to the input CSV file
            separator: CSV separator (default: ';')
            encoding: File encoding (default: 'utf-8', use 'cp1250' for Polish chars)
        """
        self.input_file = Path(input_file)
        self.separator = separator
        self.encoding = encoding
        self.raw_data: Optional[pd.DataFrame] = None
        self.processed_data: Optional[pd.DataFrame] = None
        self.energy_types: Optional[List[str]] = None

        if not self.input_file.exists():
            raise FileNotFoundError(f"Input file not found: {input_file}")

    @staticmethod
    def fix_value_string(val) -> str:
        """
        Fix malformed kWh strings like ',123', '-,123', '-,' etc.
        Returns a clean string ready for float conversion.
        """
        if pd.isna(val):
            return "0"

        s = str(val).strip()

        # Remove spaces, unexpected symbols
        s = s.replace(" ", "")

        # Convert comma decimal separator to dot
        s = s.replace(",", ".")

        # Case 1: starts with '.' → missing leading zero
        # '.123' => '0.123'
        if re.match(r"^\.\d+", s):
            s = "0" + s

        # Case 2: negative and missing zero
        # '-.123' => '-0.123'
        if re.match(r"^-\.\d+", s):
            s = s.replace("-.", "-0.")

        # Case 3: leftover sign/separator characters with no digits
        # (after the comma→dot conversion above, inputs like '-,' arrive
        # here as '-.') — treat them all as a malformed zero
        if s in {"", "-", ".", "-."}:
            return "0"

        return s
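    # Illustrative round-trips through fix_value_string (inputs are made-up
    # malformed values of the kinds listed in the docstring):
    #   fix_value_string(",123")   -> "0.123"
    #   fix_value_string("-,123")  -> "-0.123"
    #   fix_value_string("-,")     -> "0"
    #   fix_value_string("1,5")    -> "1.5"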

    def load_data(self) -> pd.DataFrame:
        """
        Load and validate the input CSV data.

        Returns:
            Raw DataFrame from the CSV file
        """
        print(f"Loading data from: {self.input_file}")

        try:
            self.raw_data = pd.read_csv(
                self.input_file,
                sep=self.separator,
                encoding=self.encoding
            )
        except UnicodeDecodeError:
            print(f"Warning: {self.encoding} encoding failed, trying cp1250...")
            self.raw_data = pd.read_csv(
                self.input_file,
                sep=self.separator,
                encoding="cp1250"
            )
            self.encoding = "cp1250"

        # Validate required columns
        missing_hours = [c for c in self.HOUR_COLUMNS if c not in self.raw_data.columns]
        if missing_hours:
            raise ValueError(f"Missing hour columns in input: {missing_hours}")

        required_cols = ["DataOdczytu", "Kierunek"]
        missing_required = [c for c in required_cols if c not in self.raw_data.columns]
        if missing_required:
            raise ValueError(f"Missing required columns: {missing_required}")

        print(f"Loaded {len(self.raw_data)} rows with {len(self.raw_data.columns)} columns")

        # Detect unique energy types from Kierunek column
        self.energy_types = sorted(self.raw_data["Kierunek"].unique().tolist())
        print(f"Detected energy types: {self.energy_types}")

        return self.raw_data

    def process_data(self) -> pd.DataFrame:
        """
        Process the raw data: fix malformed values, convert to long format, add timestamps.

        Returns:
            Processed DataFrame in long format with timestamps
        """
        if self.raw_data is None:
            self.load_data()

        print("Processing data...")
        df = self.raw_data.copy()

        # Fix malformed values in hour columns
        for col in self.HOUR_COLUMNS:
            df[col] = df[col].apply(self.fix_value_string)
            df[col] = df[col].astype(float)

        # Convert to long format: one row per day + type + hour
        df_long = df.melt(
            id_vars=["DataOdczytu", "Kierunek"],
            value_vars=self.HOUR_COLUMNS,
            var_name="HourCol",
            value_name="Value_kWh"
        )

        # Extract hour number (1–24)
        df_long["Hour"] = df_long["HourCol"].str[1:].astype(int)

        # Convert date from YYYYMMDD format
        df_long["Date"] = pd.to_datetime(
            df_long["DataOdczytu"].astype(str),
            format="%Y%m%d"
        )

        # Create full timestamp: H01 → 00:00, H24 → 23:00
        df_long["Timestamp"] = df_long["Date"] + pd.to_timedelta(
            df_long["Hour"] - 1, unit="h"
        )
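        # Worked example (hypothetical values): the wide row
        #   DataOdczytu=20240101, Kierunek=X, H01=0.5, ..., H24=0.2
        # is now 24 long rows:
        #   Timestamp=2024-01-01 00:00, Value_kWh=0.5
        #   ...
        #   Timestamp=2024-01-01 23:00, Value_kWh=0.2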

        self.processed_data = df_long
        print(f"Processed data: {len(df_long)} rows")
        return df_long

    def generate_hourly_data(self) -> pd.DataFrame:
        """
        Generate hourly aggregated data with energy types as columns.

        Returns:
            DataFrame with hourly data (one row per hour, columns for energy types)
        """
        if self.processed_data is None:
            self.process_data()

        print("Generating hourly data...")

        # Pivot to wide format with timestamp index
        df_hourly = self.processed_data.pivot_table(
            index="Timestamp",
            columns="Kierunek",
            values="Value_kWh",
            aggfunc="sum"
        ).sort_index()

        # Reset index and format timestamp
        df_hourly_out = df_hourly.reset_index()
        df_hourly_out["Timestamp"] = df_hourly_out["Timestamp"].dt.strftime("%Y-%m-%d %H:%M")

        # Ensure consistent column ordering
        available_types = [t for t in self.energy_types if t in df_hourly_out.columns]
        columns = ["Timestamp"] + available_types
        df_hourly_out = df_hourly_out[columns]

        return df_hourly_out
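    # Example hourly output written with the default ';' separator (the
    # energy-type column names below are hypothetical placeholders):
    #   Timestamp;TypeA;TypeB
    #   2024-01-01 00:00;0.500;0.000
    #   2024-01-01 01:00;0.425;0.010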

    def generate_daily_data(self) -> pd.DataFrame:
        """
        Generate daily aggregated data.

        Returns:
            DataFrame with daily totals
        """
        if self.processed_data is None:
            self.process_data()

        print("Generating daily data...")

        # Group by date and energy type, sum the values
        df_daily = (
            self.processed_data
            .groupby(["Date", "Kierunek"])["Value_kWh"]
            .sum()
            .reset_index()
        )

        # Format date and pivot
        df_daily["Date_str"] = df_daily["Date"].dt.strftime("%Y-%m-%d")

        df_daily_pivot = df_daily.pivot_table(
            index="Date_str",
            columns="Kierunek",
            values="Value_kWh",
            aggfunc="sum"
        ).sort_index()

        df_daily_out = df_daily_pivot.reset_index().rename(columns={"Date_str": "Date"})

        # Ensure consistent column ordering
        available_types = [t for t in self.energy_types if t in df_daily_out.columns]
        columns = ["Date"] + available_types
        df_daily_out = df_daily_out[columns]

        return df_daily_out

    def generate_weekly_data(self) -> pd.DataFrame:
        """
        Generate weekly aggregated data (weeks starting Monday).

        Returns:
            DataFrame with weekly totals
        """
        if self.processed_data is None:
            self.process_data()

        print("Generating weekly data...")

        # Set timestamp as index for resampling
        df_ts = self.processed_data.set_index("Timestamp")

        # Resample by week starting Monday
        df_weekly = (
            df_ts
            .groupby("Kierunek")
            .resample("W-MON", label="left", closed="left")["Value_kWh"]
            .sum()
            .reset_index()
        )
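        # Note on the resample rule: 'W-MON' anchors bins on Mondays, and
        # label='left'/closed='left' makes each bin cover [Monday, next
        # Monday) while being labeled with the Monday it starts on.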

        # Format week start date
        df_weekly["WeekStart"] = df_weekly["Timestamp"].dt.strftime("%Y-%m-%d")

        # Pivot to wide format
        df_weekly_pivot = df_weekly.pivot_table(
            index="WeekStart",
            columns="Kierunek",
            values="Value_kWh",
            aggfunc="sum"
        ).sort_index()

        df_weekly_out = df_weekly_pivot.reset_index()

        # Ensure consistent column ordering
        available_types = [t for t in self.energy_types if t in df_weekly_out.columns]
        columns = ["WeekStart"] + available_types
        df_weekly_out = df_weekly_out[columns]

        return df_weekly_out

    def generate_monthly_data(self) -> pd.DataFrame:
        """
        Generate monthly aggregated data.

        Returns:
            DataFrame with monthly totals
        """
        if self.processed_data is None:
            self.process_data()

        print("Generating monthly data...")

        # Set timestamp as index for resampling
        df_ts = self.processed_data.set_index("Timestamp")

        # Resample by month start
        df_monthly = (
            df_ts
            .groupby("Kierunek")
            .resample("MS")["Value_kWh"]  # MS = Month Start
            .sum()
            .reset_index()
        )

        # Format month
        df_monthly["Month"] = df_monthly["Timestamp"].dt.strftime("%Y-%m")

        # Pivot to wide format
        df_monthly_pivot = df_monthly.pivot_table(
            index="Month",
            columns="Kierunek",
            values="Value_kWh",
            aggfunc="sum"
        ).sort_index()

        df_monthly_out = df_monthly_pivot.reset_index()

        # Ensure consistent column ordering
        available_types = [t for t in self.energy_types if t in df_monthly_out.columns]
        columns = ["Month"] + available_types
        df_monthly_out = df_monthly_out[columns]

        return df_monthly_out

    def save_data(self, data: pd.DataFrame, output_file: str) -> None:
        """
        Save DataFrame to CSV with consistent formatting.

        Args:
            data: DataFrame to save
            output_file: Output file path
        """
        output_path = Path(output_file)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        data.to_csv(
            output_path,
            sep=self.separator,
            encoding=self.encoding,
            index=False,
            float_format="%.3f"
        )
        print(f"Saved: {output_path}")

    def get_output_filename(self, suffix: str) -> str:
        """
        Generate output filename based on input filename and suffix.

        Args:
            suffix: Suffix to append (e.g., '-hourly', '-daily')

        Returns:
            Output filename
        """
        stem = self.input_file.stem
        extension = self.input_file.suffix
        return str(self.input_file.parent / f"{stem}{suffix}{extension}")
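    # e.g. with input '/data/odczyty.csv' (an illustrative path),
    # get_output_filename("-daily") returns '/data/odczyty-daily.csv'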


def main():
    parser = argparse.ArgumentParser(
        description="Convert PGE energy meter CSV reports to time-aggregated formats"
    )
    parser.add_argument(
        "input_file",
        help="Path to the input PGE CSV file"
    )
    parser.add_argument(
        "--separator", "-s",
        default=";",
        help="CSV separator (default: ';')"
    )
    parser.add_argument(
        "--encoding", "-e",
        default="utf-8",
        help="File encoding (default: 'utf-8', use 'cp1250' for Polish characters)"
    )
    parser.add_argument(
        "--output-dir", "-o",
        help="Output directory (default: same as input file)"
    )

    args = parser.parse_args()

    try:
        # Initialize processor
        processor = PGEDataProcessor(
            input_file=args.input_file,
            separator=args.separator,
            encoding=args.encoding
        )

        # Process data
        processor.load_data()
        processor.process_data()

        # Generate and save hourly data
        hourly_data = processor.generate_hourly_data()
        hourly_output = processor.get_output_filename("-hourly")
        if args.output_dir:
            hourly_output = os.path.join(args.output_dir, os.path.basename(hourly_output))
        processor.save_data(hourly_data, hourly_output)

        # Generate and save daily data
        daily_data = processor.generate_daily_data()
        daily_output = processor.get_output_filename("-daily")
        if args.output_dir:
            daily_output = os.path.join(args.output_dir, os.path.basename(daily_output))
        processor.save_data(daily_data, daily_output)

        # Generate and save weekly data
        weekly_data = processor.generate_weekly_data()
        weekly_output = processor.get_output_filename("-weekly")
        if args.output_dir:
            weekly_output = os.path.join(args.output_dir, os.path.basename(weekly_output))
        processor.save_data(weekly_data, weekly_output)

        # Generate and save monthly data
        monthly_data = processor.generate_monthly_data()
        monthly_output = processor.get_output_filename("-monthly")
        if args.output_dir:
            monthly_output = os.path.join(args.output_dir, os.path.basename(monthly_output))
        processor.save_data(monthly_data, monthly_output)

        print("\nConversion completed successfully!")

    except Exception as e:
        print(f"Error: {e}")
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())
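
# Example invocation (the input file name is illustrative):
#   python raport_convert.py odczyty.csv --encoding cp1250 -o out/
# This writes odczyty-hourly.csv, odczyty-daily.csv, odczyty-weekly.csv
# and odczyty-monthly.csv into out/.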