# pge-raport-generator/visualize_data.py

#!/usr/bin/env python3
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import argparse
import os
from pathlib import Path
from typing import Optional, List, Dict
import warnings

# Ignore warnings of category UserWarning that originate from modules whose
# name matches "matplotlib"; other categories and modules are not affected.
warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")

class PGEDataVisualizer:
    """
    Class for visualizing PGE energy meter data from converted CSV reports.
    Creates various charts for different time aggregations.
    """

    def __init__(self, base_filename: str, data_dir: str = ".", output_dir: str = "charts",
                 buy_price: float = 1.4, sell_price: float = 0.2):
        """
        Store the configuration, create the output directory and apply plot styling.

        Args:
            base_filename: Common prefix of the CSV files; a suffix such as
                '-hourly' followed by '.csv' is appended when a file is loaded
            data_dir: Directory the CSV files are read from
            output_dir: Directory the charts are written to; it is created here,
                together with any missing parent directories
            buy_price: Price per kWh applied to 'Czynna pobrana' when costs
                are calculated (default: 1.4)
            sell_price: Price per kWh applied to 'Czynna oddana' when costs
                are calculated (default: 0.2)
        """
        self.base_filename = base_filename
        self.data_dir, self.output_dir = Path(data_dir), Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.buy_price, self.sell_price = buy_price, sell_price

        # One fixed colour per energy column so every chart uses the same one.
        self.colors = dict([
            ('Czynna zbilansowana', '#2E86AB'),  # blue   - net energy
            ('Czynna oddana', '#A23B72'),        # purple - energy delivered
            ('Czynna pobrana', '#F18F01'),       # orange - energy consumed
        ])

        # Global look: matplotlib style sheet first, then the seaborn palette,
        # then the font sizes (order kept, each step writes to rcParams).
        plt.style.use('seaborn-v0_8-darkgrid')
        sns.set_palette("husl")
        self.setup_locale()

    def setup_locale(self):
        """
        Set the global matplotlib font sizes used by the charts.

        Despite the name, no locale setting is touched: only font-size entries
        of ``plt.rcParams`` are written.
        """
        plt.rcParams.update({
            'font.size': 10,
            'axes.titlesize': 14,
            'axes.labelsize': 12,
            'xtick.labelsize': 10,
            'ytick.labelsize': 10,
            'legend.fontsize': 10,
            'figure.titlesize': 16,
        })

    def load_data(self, suffix: str) -> Optional[pd.DataFrame]:
        """
        Read the semicolon-separated CSV file of one aggregation level.

        The path is ``<data_dir>/<base_filename><suffix>.csv``.

        Args:
            suffix: File suffix (e.g., '-hourly', '-daily')

        Returns:
            The parsed DataFrame, or None when the file does not exist or
            reading it fails (a message is printed in either case)
        """
        csv_path = self.data_dir / f"{self.base_filename}{suffix}.csv"

        if not csv_path.exists():
            print(f"Warning: File not found: {csv_path}")
            return None

        try:
            frame = pd.read_csv(csv_path, sep=';')
            print(f"Loaded {suffix} data: {len(frame)} rows")
        except Exception as err:
            # Best effort: a file that cannot be read is reported and skipped.
            print(f"Error loading {csv_path}: {err}")
            return None
        return frame

    def parse_timestamps(self, df: pd.DataFrame, time_col: str) -> pd.DataFrame:
        """
        Parse timestamp column to datetime.

        Args:
            df: DataFrame with time column
            time_col: Name of time column

        Returns:
            DataFrame with parsed datetime column
        """
        df = df.copy()

        if time_col == 'Timestamp':
            df[time_col] = pd.to_datetime(df[time_col])
        elif time_col == 'Date':
            df[time_col] = pd.to_datetime(df[time_col])
        elif time_col == 'WeekStart':
            df[time_col] = pd.to_datetime(df[time_col])
        elif time_col == 'Month':
            df[time_col] = pd.to_datetime(df[time_col] + '-01')

        return df

    def calculate_costs(self, df: pd.DataFrame, energy_cols: List[str]) -> pd.DataFrame:
        """
        Calculate energy costs based on buy/sell prices.

        Args:
            df: DataFrame with energy columns
            energy_cols: List of energy column names

        Returns:
            DataFrame with added cost columns
        """
        df = df.copy()

        if 'Czynna pobrana' in energy_cols:
            df['Koszt_pobrana'] = df['Czynna pobrana'] * self.buy_price

        if 'Czynna oddana' in energy_cols:
            df['Przychód_oddana'] = df['Czynna oddana'] * self.sell_price

        # Net cost (positive = cost, negative = profit)
        if 'Koszt_pobrana' in df.columns and 'Przychód_oddana' in df.columns:
            df['Koszt_netto'] = df['Koszt_pobrana'] - df['Przychód_oddana']
        elif 'Koszt_pobrana' in df.columns:
            df['Koszt_netto'] = df['Koszt_pobrana']
        elif 'Przychód_oddana' in df.columns:
            df['Koszt_netto'] = -df['Przychód_oddana']

        return df

    def plot_time_series(self, df: pd.DataFrame, time_col: str, title: str,
                        filename: str, figsize: tuple = (15, 8)):
        """
        Create time series plot for energy data.

        Args:
            df: DataFrame with time and energy columns
            time_col: Name of time column
            title: Plot title
            filename: Output filename
            figsize: Figure size
        """
        if df is None or df.empty:
            return

        df = self.parse_timestamps(df, time_col)

        fig, ax = plt.subplots(figsize=figsize)

        energy_cols = [col for col in df.columns if col != time_col]

        for col in energy_cols:
            if col in self.colors:
                color = self.colors[col]
            else:
                color = None
            ax.plot(df[time_col], df[col], label=col, linewidth=2, color=color, marker='o', markersize=3)

        ax.set_title(title, fontweight='bold', pad=20)
        ax.set_xlabel('Czas', fontweight='bold')
        ax.set_ylabel('Energia [kWh]', fontweight='bold')
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        ax.grid(True, alpha=0.3)

        # Format x-axis based on data type
        if time_col in ['Timestamp', 'Date']:
            ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
            ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
            plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)
        elif time_col == 'WeekStart':
            ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
            ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
            plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)

        plt.tight_layout()
        output_path = self.output_dir / f"{filename}.png"
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved chart: {output_path}")

    def plot_stacked_area(self, df: pd.DataFrame, time_col: str, title: str,
                         filename: str, figsize: tuple = (15, 8)):
        """
        Create stacked area chart for energy data.
        """
        if df is None or df.empty:
            return

        df = self.parse_timestamps(df, time_col)

        fig, ax = plt.subplots(figsize=figsize)

        energy_cols = [col for col in df.columns if col != time_col]

        # Prepare data for stacking (only positive values)
        positive_data = df[energy_cols].clip(lower=0)

        colors_list = [self.colors.get(col, None) for col in energy_cols]

        ax.stackplot(df[time_col], *[positive_data[col] for col in energy_cols],
                    labels=energy_cols, colors=colors_list, alpha=0.7)

        ax.set_title(title, fontweight='bold', pad=20)
        ax.set_xlabel('Czas', fontweight='bold')
        ax.set_ylabel('Energia [kWh]', fontweight='bold')
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        ax.grid(True, alpha=0.3)

        # Format x-axis
        if time_col in ['Timestamp', 'Date', 'WeekStart']:
            ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
            ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
            plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)

        plt.tight_layout()
        output_path = self.output_dir / f"{filename}.png"
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved chart: {output_path}")

    def plot_monthly_hourly_averages(self, df_hourly: pd.DataFrame):
        """
        Create monthly hourly average charts for each month.
        """
        if df_hourly is None or df_hourly.empty:
            return

        df = self.parse_timestamps(df_hourly, 'Timestamp')

        # Add hour and month columns
        df['Hour'] = df['Timestamp'].dt.hour
        df['Month'] = df['Timestamp'].dt.to_period('M')

        energy_cols = [col for col in df.columns if col not in ['Timestamp', 'Hour', 'Month']]

        # Get unique months
        months = sorted(df['Month'].unique())

        # Create subplots for each month
        n_months = len(months)
        cols = 3  # 3 charts per row
        rows = (n_months + cols - 1) // cols

        fig, axes = plt.subplots(rows, cols, figsize=(20, 6 * rows))
        if rows == 1:
            axes = axes.reshape(1, -1)
        elif n_months == 1:
            axes = axes.reshape(1, 1)

        for i, month in enumerate(months):
            row = i // cols
            col = i % cols
            ax = axes[row, col] if rows > 1 else axes[col]

            # Filter data for this month
            month_data = df[df['Month'] == month]

            # Calculate hourly averages
            hourly_avg = month_data.groupby('Hour')[energy_cols].mean()

            # Plot each energy type
            for energy_col in energy_cols:
                color = self.colors.get(energy_col, None)
                ax.plot(hourly_avg.index, hourly_avg[energy_col],
                       label=energy_col, linewidth=2.5, color=color, marker='o', markersize=4)

            ax.set_title(f'{month} - Średnie godzinowe zużycie energii', fontweight='bold')
            ax.set_xlabel('Godzina')
            ax.set_ylabel('Energia [kWh]')
            ax.grid(True, alpha=0.3)
            ax.legend(fontsize=8)
            ax.set_xticks(range(0, 24, 2))

            # Add zero line for reference
            ax.axhline(y=0, color='red', linestyle='--', alpha=0.5)

        # Hide unused subplots
        for i in range(n_months, rows * cols):
            row = i // cols
            col = i % cols
            axes[row, col].set_visible(False)

        plt.suptitle('Miesięczne profile godzinowe zużycia energii', fontsize=16, fontweight='bold')
        plt.tight_layout()

        output_path = self.output_dir / "monthly_hourly_profiles.png"
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved chart: {output_path}")

    def plot_cost_analysis(self, df: pd.DataFrame, time_col: str, title_suffix: str):
        """
        Create cost analysis charts.

        Args:
            df: DataFrame with energy data
            time_col: Time column name
            title_suffix: Suffix for the title (e.g., 'miesięczne', 'dzienne')
        """
        if df is None or df.empty:
            return

        df = self.parse_timestamps(df, time_col)

        energy_cols = [col for col in df.columns if col != time_col and not col.startswith('Koszt') and not col.startswith('Przychód')]
        df_with_costs = self.calculate_costs(df, energy_cols)

        # Create cost analysis chart
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 12))

        # Energy costs breakdown
        if 'Koszt_pobrana' in df_with_costs.columns:
            ax1.bar(df_with_costs[time_col], df_with_costs['Koszt_pobrana'],
                   label=f'Koszt energii pobranej ({self.buy_price} zł/kWh)',
                   color='#F18F01', alpha=0.7)

        if 'Przychód_oddana' in df_with_costs.columns:
            ax1.bar(df_with_costs[time_col], -df_with_costs['Przychód_oddana'],
                   label=f'Przychód z energii oddanej ({self.sell_price} zł/kWh)',
                   color='#A23B72', alpha=0.7)

        ax1.set_title(f'Analiza kosztów energii - {title_suffix}', fontweight='bold', pad=20)
        ax1.set_xlabel('Okres', fontweight='bold')
        ax1.set_ylabel('Kwota [zł]', fontweight='bold')
        ax1.legend()
        ax1.grid(True, alpha=0.3)
        ax1.axhline(y=0, color='black', linestyle='-', alpha=0.3)

        # Net cost trend
        if 'Koszt_netto' in df_with_costs.columns:
            colors = ['red' if x > 0 else 'green' for x in df_with_costs['Koszt_netto']]
            ax2.bar(df_with_costs[time_col], df_with_costs['Koszt_netto'],
                   color=colors, alpha=0.7)
            ax2.axhline(y=0, color='black', linestyle='-', alpha=0.5)
            ax2.set_title(f'Koszt netto energii - {title_suffix}', fontweight='bold', pad=20)
            ax2.set_xlabel('Okres', fontweight='bold')
            ax2.set_ylabel('Koszt netto [zł]', fontweight='bold')
            ax2.grid(True, alpha=0.3)

            # Add summary text
            total_cost = df_with_costs['Koszt_netto'].sum()
            color = 'red' if total_cost > 0 else 'green'
            ax2.text(0.02, 0.98, f'Łączny koszt: {total_cost:.2f} zł',
                    transform=ax2.transAxes, fontsize=12, fontweight='bold',
                    verticalalignment='top', color=color,
                    bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

        # Format x-axis
        if time_col in ['Timestamp', 'Date', 'WeekStart']:
            for ax in [ax1, ax2]:
                ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
                if len(df_with_costs) > 10:
                    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
                plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)

        plt.tight_layout()

        filename = f"cost_analysis_{title_suffix.replace(' ', '_')}"
        output_path = self.output_dir / f"{filename}.png"
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved chart: {output_path}")

    def plot_monthly_comparison(self, df: pd.DataFrame):
        """
        Create monthly comparison charts.
        """
        if df is None or df.empty:
            return

        df = self.parse_timestamps(df, 'Month')

        # Monthly totals comparison
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 12))

        energy_cols = [col for col in df.columns if col != 'Month']

        # Bar chart
        x_pos = range(len(df))
        width = 0.25

        for i, col in enumerate(energy_cols):
            color = self.colors.get(col, None)
            ax1.bar([x + i * width for x in x_pos], df[col], width,
                   label=col, color=color, alpha=0.8)

        ax1.set_title('Miesięczne zużycie energii - porównanie', fontweight='bold', pad=20)
        ax1.set_xlabel('Miesiąc', fontweight='bold')
        ax1.set_ylabel('Energia [kWh]', fontweight='bold')
        ax1.set_xticks([x + width for x in x_pos])
        ax1.set_xticklabels(df['Month'].dt.strftime('%Y-%m'), rotation=45)
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        # Net energy (zbilansowana) focus
        if 'Czynna zbilansowana' in energy_cols:
            color = self.colors.get('Czynna zbilansowana', 'blue')
            ax2.plot(df['Month'], df['Czynna zbilansowana'],
                    marker='o', linewidth=3, markersize=8, color=color)
            ax2.axhline(y=0, color='red', linestyle='--', alpha=0.7)
            ax2.set_title('Energia zbilansowana - trend miesięczny', fontweight='bold', pad=20)
            ax2.set_xlabel('Miesiąc', fontweight='bold')
            ax2.set_ylabel('Energia zbilansowana [kWh]', fontweight='bold')
            ax2.grid(True, alpha=0.3)
            ax2.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
            plt.setp(ax2.xaxis.get_majorticklabels(), rotation=45)

        plt.tight_layout()
        output_path = self.output_dir / "monthly_comparison.png"
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved chart: {output_path}")

    def generate_all_charts(self):
        """
        Load every aggregation level and render the charts defined for it.

        The hourly, daily, weekly and monthly CSV files are loaded first, in
        that order; a level whose file could not be loaded is skipped.
        Progress information is printed to standard output.
        """
        print(f"Generating charts for: {self.base_filename}")
        print(f"Output directory: {self.output_dir}")
        print(f"Energy prices: Buy={self.buy_price} zł/kWh, Sell={self.sell_price} zł/kWh")

        # All files are read up front, before any chart is drawn.
        frames = {level: self.load_data(f"-{level}")
                  for level in ('hourly', 'daily', 'weekly', 'monthly')}

        hourly = frames['hourly']
        if hourly is not None:
            # The time series shows at most the first 30 days of hourly rows;
            # the monthly profiles below use the complete data.
            sample_rows = 24 * 30
            if len(hourly) > sample_rows:
                preview = hourly.head(sample_rows)
            else:
                preview = hourly
            self.plot_time_series(
                preview, 'Timestamp',
                'Zużycie energii - dane godzinowe (30 dni)', 'hourly_timeseries')
            self.plot_monthly_hourly_averages(hourly)

        daily = frames['daily']
        if daily is not None:
            self.plot_time_series(
                daily, 'Date',
                'Zużycie energii - dane dzienne', 'daily_timeseries')
            self.plot_stacked_area(
                daily, 'Date',
                'Zużycie energii - obszar skumulowany (dzienne)', 'daily_stacked')
            self.plot_cost_analysis(daily, 'Date', 'dzienne')

        weekly = frames['weekly']
        if weekly is not None:
            self.plot_time_series(
                weekly, 'WeekStart',
                'Zużycie energii - dane tygodniowe', 'weekly_timeseries')

        monthly = frames['monthly']
        if monthly is not None:
            self.plot_time_series(
                monthly, 'Month',
                'Zużycie energii - dane miesięczne', 'monthly_timeseries')
            self.plot_monthly_comparison(monthly)
            self.plot_cost_analysis(monthly, 'Month', 'miesięczne')

        print("\nChart generation completed!")


def main():
    """
    Parse the command line, build a PGEDataVisualizer and generate all charts.

    Returns:
        0 on success, 1 when creating the visualizer or generating the charts
        raised an exception (the error is printed). Invalid arguments are
        handled by argparse itself.
    """
    parser = argparse.ArgumentParser(
        description="Generate visualizations for PGE energy data"
    )
    parser.add_argument(
        "base_filename",
        help="Base filename without suffix (e.g., '202512011130_590543580100122940_46f7640c-07d2-4e64-9ba7-20b8c2eeca7d')"
    )

    # Directory options (plain strings).
    for long_flag, short_flag, fallback, description in (
        ("--data-dir", "-d", ".",
         "Directory containing CSV files (default: current directory)"),
        ("--output-dir", "-o", "charts",
         "Output directory for charts (default: 'charts')"),
    ):
        parser.add_argument(long_flag, short_flag, default=fallback, help=description)

    # Price options (floats).
    for long_flag, short_flag, fallback, description in (
        ("--buy-price", "-b", 1.4,
         "Price per kWh for energy consumed from grid (default: 1.4 PLN)"),
        ("--sell-price", "-s", 0.2,
         "Price per kWh for energy delivered to grid (default: 0.2 PLN)"),
    ):
        parser.add_argument(long_flag, short_flag, type=float,
                            default=fallback, help=description)

    args = parser.parse_args()

    try:
        # The argparse destinations are named exactly like the constructor's
        # parameters, so the namespace can be passed on as keywords.
        PGEDataVisualizer(**vars(args)).generate_all_charts()
    except Exception as e:
        print(f"Error: {e}")
        return 1
    else:
        return 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status. SystemExit is
    # raised directly because the `exit` helper is installed by the `site`
    # module for interactive use and is not available under `python -S`.
    raise SystemExit(main())