Bli med i Kassalapp-fellesskapet på Discord

Kassalapp Data Analytics Guide

En komplett guide for dataanalyse av norske matvarepriser ved bruk av Kassalapp API.

Innholdsfortegnelse

  1. Kom i gang
  2. Autentisering
  3. Datainnsamling
  4. Databehandling
  5. Prisanalyse
  6. Visualisering
  7. Butikksammenligning
  8. Tidsserieanalyse
  9. Maskinlæring
  10. Rapporter
  11. Dashboard
  12. Praktiske eksempler

Kom i gang

Python Environment Setup

Opprett et virtuelt miljø og installer nødvendige pakker:

bash
# Create an isolated virtual environment for the guide.
python -m venv .venv
# Linux/Mac
source .venv/bin/activate
# Windows
# .venv\Scripts\activate

# Core analysis stack plus the packages that later sections import or require:
# statsmodels (seasonal_decompose and plotly's LOWESS trendline),
# prophet (forecasting) and openpyxl (the pandas ExcelWriter engine).
pip install pandas==2.3.2 plotly==5.24.1 scikit-learn jupyter requests seaborn matplotlib streamlit dash numpy scipy statsmodels prophet openpyxl

Jupyter Notebook Setup

Start Jupyter Notebook for interaktiv analyse:

bash
# Launch the Jupyter Notebook server.
jupyter notebook

Grunnleggende imports

python
import pandas as pd
import numpy as np
import requests
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as sp
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')

# Configure pandas display options: show all columns, at most 100 rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

Autentisering

API Key Management

python
class KassalappAPI:
    """Small HTTP client for the Kassalapp REST API.

    A single ``requests.Session`` carrying the bearer token is reused for
    every request.

    Args:
        api_key: Token sent as ``Authorization: Bearer <api_key>``.
        timeout: Seconds to wait for each request before ``requests`` raises
            a timeout error. Without a timeout a stalled connection would
            block forever.
    """

    def __init__(self, api_key: str, timeout: float = 30.0):
        self.base_url = "https://kassal.app/api/v1"
        self.timeout = timeout
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
            "Accept": "application/json"
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)

    def get(self, endpoint: str, params: dict = None) -> dict:
        """Send a GET request and return the decoded JSON body.

        Args:
            endpoint: Path relative to ``base_url``; a leading slash is ignored.
            params: Optional query-string parameters.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        url = f"{self.base_url}/{endpoint.lstrip('/')}"
        response = self.session.get(url, params=params, timeout=self.timeout)
        response.raise_for_status()
        return response.json()

    def get_paginated(self, endpoint: str, params: dict = None) -> list:
        """Fetch every page of a paginated endpoint and return all records.

        The last page is detected from ``last_page`` (top level or nested
        under ``meta``) or, when neither is present, from a non-empty
        ``links.next``.
        NOTE(review): the ``meta``/``links`` layout is assumed to follow the
        common Laravel resource envelope - confirm against the API docs.

        Args:
            endpoint: Path relative to ``base_url``.
            params: Optional query parameters; ``page`` is set per request
                and the caller's dict is never mutated.

        Returns:
            A flat list with the records of all pages.
        """
        all_data = []
        page = 1

        while True:
            current_params = dict(params or {})
            current_params['page'] = page

            payload = self.get(endpoint, current_params)

            # Un-enveloped response: a bare list or a single object.
            if not isinstance(payload, dict) or 'data' not in payload:
                all_data.extend(payload if isinstance(payload, list) else [payload])
                break

            items = payload['data']
            if not isinstance(items, list):
                # A single resource wrapped in "data": extending would add its keys.
                all_data.append(items)
                break
            all_data.extend(items)

            # An empty page means there is nothing more to read; this also
            # prevents an endless loop if the page counters are inconsistent.
            if not items:
                break

            meta = payload.get('meta')
            meta = meta if isinstance(meta, dict) else {}
            links = payload.get('links')
            links = links if isinstance(links, dict) else {}

            last_page = payload.get('last_page', meta.get('last_page'))
            if last_page is not None:
                if page >= last_page:
                    break
            elif not links.get('next'):
                # No page count and no "next" link: treat this as the only page.
                break

            page += 1

        return all_data

# Initialize the API client (replace the placeholder string with a real key)
api = KassalappAPI("din_api_nøkkel_her")

Sikker Key Management

python
import os
from pathlib import Path

def get_api_key():
    """Return the Kassalapp API key from the environment or a key file.

    Lookup order:
      1. the ``KASSALAPP_API_KEY`` environment variable,
      2. the file ``~/.kassalapp_key``.

    Surrounding whitespace is stripped from both sources, and an empty value
    is treated as "not configured" so that a blank variable or file does not
    produce an empty bearer token.

    Raises:
        ValueError: If neither source yields a non-empty key.
    """
    # From the environment variable
    env_key = os.environ.get('KASSALAPP_API_KEY', '').strip()
    if env_key:
        return env_key

    # From the key file (is_file() also rejects a directory with that name)
    key_file = Path.home() / '.kassalapp_key'
    if key_file.is_file():
        file_key = key_file.read_text(encoding='utf-8').strip()
        if file_key:
            return file_key

    raise ValueError("API nøkkel ikke funnet. Sett KASSALAPP_API_KEY miljøvariabel eller lag ~/.kassalapp_key fil")

# Resolve the key and build the module-level client that the data helpers use
api_key = get_api_key()
api = KassalappAPI(api_key)

Datainnsamling

Produktdata

python
def get_products_df(category_id: int = None, limit: int = None) -> pd.DataFrame:
    """Fetch products through the module-level ``api`` client as a DataFrame.

    Args:
        category_id: Optional category filter; sent only when truthy.
        limit: Optional ``limit`` query parameter; sent only when truthy.

    Returns:
        The flattened (``json_normalize``) product records, with
        ``created_at`` / ``updated_at`` parsed to datetimes when present.
    """
    optional_filters = {'category_id': category_id, 'limit': limit}
    query = {key: value for key, value in optional_filters.items() if value}

    frame = pd.json_normalize(api.get_paginated('products', query))

    # Parse timestamp columns
    for timestamp_column in ('created_at', 'updated_at'):
        if timestamp_column in frame.columns:
            frame[timestamp_column] = pd.to_datetime(frame[timestamp_column])

    return frame

# Fetch all products
products_df = get_products_df()
print(f"Hentet {len(products_df)} produkter")
print(products_df.head())

Prisdata

python
def get_prices_df(product_id: int = None, store_id: int = None, 
                  date_from: str = None, date_to: str = None) -> pd.DataFrame:
    """Fetch price records through the module-level ``api`` client.

    Args:
        product_id: Optional product filter; sent only when truthy.
        store_id: Optional store filter; sent only when truthy.
        date_from: Optional lower date bound, passed through as a string.
        date_to: Optional upper date bound, passed through as a string.

    Returns:
        The flattened price records. When present, ``price`` is converted to
        a numeric dtype, ``date`` / ``created_at`` are parsed to datetimes,
        and the rows are sorted by ``date``. An empty frame is returned as-is.
    """
    params = {}
    if product_id:
        params['product_id'] = product_id
    if store_id:
        params['store_id'] = store_id
    if date_from:
        params['date_from'] = date_from
    if date_to:
        params['date_to'] = date_to

    prices = api.get_paginated('prices', params)
    df = pd.json_normalize(prices)

    if df.empty:
        return df

    # Convert columns only when the response actually contains them, the same
    # way the product and store loaders do, instead of raising KeyError.
    if 'price' in df.columns:
        df['price'] = pd.to_numeric(df['price'])
    for col in ('date', 'created_at'):
        if col in df.columns:
            df[col] = pd.to_datetime(df[col])

    # Sort chronologically
    if 'date' in df.columns:
        df = df.sort_values('date')

    return df

# Example: fetch price data for the last 30 days
date_from = (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')
prices_df = get_prices_df(date_from=date_from)
print(f"Hentet {len(prices_df)} priser")

Butikkdata

python
def get_stores_df() -> pd.DataFrame:
    """Fetch stores through the module-level ``api`` client as a DataFrame.

    Returns:
        The flattened store records. ``latitude`` / ``longitude`` are coerced
        to numbers when present; unparsable values become NaN.
    """
    frame = pd.json_normalize(api.get_paginated('stores'))

    if frame.empty:
        return frame

    # Coerce coordinates to numeric
    for coordinate in ('latitude', 'longitude'):
        if coordinate in frame.columns:
            frame[coordinate] = pd.to_numeric(frame[coordinate], errors='coerce')

    return frame

# Fetch all stores and report how many were returned
stores_df = get_stores_df()
print(f"Hentet {len(stores_df)} butikker")

Kombinerte datasett

python
def get_complete_dataset(days_back: int = 30) -> pd.DataFrame:
    """Build one DataFrame joining prices with product and store attributes.

    Prices from the last ``days_back`` days are left-joined to products on
    ``product_id`` and to stores on ``store_id``. Columns from the lookup
    tables that collide with existing ones receive a ``_product`` / ``_store``
    suffix and are dropped afterwards.

    The store's ``name`` is renamed to ``store_name`` before joining. It would
    otherwise collide with the product ``name``, get the ``_store`` suffix and
    be dropped, leaving no store name in the result.

    Args:
        days_back: Number of days of price history to request.

    Returns:
        The joined dataset, or an empty DataFrame when no prices were found.
    """
    # Fetch all datasets
    products = get_products_df()
    stores = get_stores_df()

    date_from = (datetime.now() - timedelta(days=days_back)).strftime('%Y-%m-%d')
    prices = get_prices_df(date_from=date_from)

    if prices.empty:
        print("Ingen prisdata funnet")
        return pd.DataFrame()

    # Keep the store name under its own column name so it survives the join.
    if 'name' in stores.columns and 'store_name' not in stores.columns:
        stores = stores.rename(columns={'name': 'store_name'})

    # Join the datasets; skip a lookup table with no join key (e.g. it is
    # empty) instead of failing with KeyError.
    df = prices
    if 'id' in products.columns:
        df = df.merge(products, left_on='product_id', right_on='id',
                      how='left', suffixes=('', '_product'))
    if 'id' in stores.columns:
        df = df.merge(stores, left_on='store_id', right_on='id',
                      how='left', suffixes=('', '_store'))

    # Drop the suffixed duplicates created by the joins
    df = df.drop(columns=[col for col in df.columns
                          if col.endswith(('_product', '_store'))])

    return df

# Fetch the complete dataset for the last 90 days
complete_df = get_complete_dataset(days_back=90)
print(f"Komplett datasett: {len(complete_df)} rader, {len(complete_df.columns)} kolonner")

Databehandling

DataFrame Operasjoner

python
class DataProcessor:
    """Chainable cleaning and feature-engineering steps for a price DataFrame.

    Each step works on a private copy of the input and returns ``self``, so
    calls can be chained and finished with ``get_df()``.
    """

    def __init__(self, df: pd.DataFrame):
        self.df = df.copy()

    def clean_data(self) -> 'DataProcessor':
        """Drop duplicates, fill numeric gaps, and remove price outliers.

        Missing values in numeric columns other than ``price`` are filled
        with 0. Rows without a price are dropped instead: filling them with 0
        would create fake 0 NOK observations that distort every price
        statistic. Prices outside 1.5 * IQR of the quartiles are removed.
        """
        # Remove duplicate rows
        self.df = self.df.drop_duplicates()

        # Handle missing values (everything numeric except the price itself)
        numeric_columns = self.df.select_dtypes(include=[np.number]).columns
        fill_columns = [col for col in numeric_columns if col != 'price']
        if fill_columns:
            self.df[fill_columns] = self.df[fill_columns].fillna(0)

        # Remove outliers (IQR method)
        if 'price' in self.df.columns:
            self.df = self.df.dropna(subset=['price'])
            q1 = self.df['price'].quantile(0.25)
            q3 = self.df['price'].quantile(0.75)
            iqr = q3 - q1
            lower = q1 - 1.5 * iqr
            upper = q3 + 1.5 * iqr
            self.df = self.df[(self.df['price'] >= lower) & (self.df['price'] <= upper)]

        return self

    def add_price_features(self) -> 'DataProcessor':
        """Add unit prices and a coarse price category.

        ``price_per_kg`` / ``price_per_liter`` are added when ``weight`` /
        ``volume`` exist (assumes grams and millilitres because of the
        division by 1000 - TODO confirm against the API). A zero weight or
        volume yields NaN rather than an infinite unit price.
        """
        if 'price' not in self.df.columns:
            return self

        # Unit prices; zero denominators are masked to NaN to avoid inf.
        if 'weight' in self.df.columns:
            kilograms = self.df['weight'].where(self.df['weight'] != 0) / 1000
            self.df['price_per_kg'] = self.df['price'] / kilograms

        if 'volume' in self.df.columns:
            liters = self.df['volume'].where(self.df['volume'] != 0) / 1000
            self.df['price_per_liter'] = self.df['price'] / liters

        # Price bands; include_lowest keeps a price of exactly 0 in the first
        # band instead of leaving it uncategorized.
        self.df['price_category'] = pd.cut(self.df['price'],
                                           bins=[0, 20, 50, 100, float('inf')],
                                           labels=['Lav', 'Medium', 'Høy', 'Meget høy'],
                                           include_lowest=True)

        return self

    def add_time_features(self, date_column: str = 'date') -> 'DataProcessor':
        """Add calendar features derived from a datetime column.

        Adds ``year``, ``month``, ISO ``week``, ``day_of_week`` (Monday = 0),
        ``is_weekend`` and ``season``. Does nothing when ``date_column`` is
        absent. Rows with a missing date get a missing season.
        """
        if date_column not in self.df.columns:
            return self

        dates = self.df[date_column].dt
        self.df['year'] = dates.year
        self.df['month'] = dates.month
        self.df['week'] = dates.isocalendar().week
        self.df['day_of_week'] = dates.dayofweek
        self.df['is_weekend'] = self.df['day_of_week'].isin([5, 6])

        # Season lookup by calendar month
        season_by_month = {
            12: 'Vinter', 1: 'Vinter', 2: 'Vinter',
            3: 'Vår', 4: 'Vår', 5: 'Vår',
            6: 'Sommer', 7: 'Sommer', 8: 'Sommer',
            9: 'Høst', 10: 'Høst', 11: 'Høst',
        }
        self.df['season'] = self.df['month'].map(season_by_month)

        return self

    def get_df(self) -> pd.DataFrame:
        """Return the processed DataFrame."""
        return self.df

# Run the DataProcessor pipeline: clean, then add price and time features
processor = DataProcessor(complete_df)
processed_df = (processor
                .clean_data()
                .add_price_features()
                .add_time_features()
                .get_df())

print("Prosessert data:")
print(processed_df.info())

Aggregering og gruppering

python
def create_summary_stats(df: pd.DataFrame) -> dict:
    """Collect summary statistics for the columns that are present.

    Returns:
        A dict that may contain ``price_stats`` (mean, median, std, min, max
        of ``price`` plus the row count), ``store_stats`` and
        ``category_stats`` (value counts of ``store_name`` /
        ``category_name``). Keys for missing columns are omitted.
    """
    summary = {}
    available = df.columns

    if 'price' in available:
        prices = df['price']
        summary['price_stats'] = {
            'mean': prices.mean(),
            'median': prices.median(),
            'std': prices.std(),
            'min': prices.min(),
            'max': prices.max(),
            'count': len(df),
        }

    # Frequency tables for the categorical columns
    for column, key in (('store_name', 'store_stats'),
                        ('category_name', 'category_stats')):
        if column in available:
            summary[key] = df[column].value_counts().to_dict()

    return summary

# Build the summary
summary = create_summary_stats(processed_df)
print("Prisstatistikk:", summary.get('price_stats', {}))

Prisanalyse

Prisutvikling over tid

python
def analyze_price_trends(df: pd.DataFrame, product_name: str = None) -> pd.DataFrame:
    """Aggregate prices per date and derive change and trend columns.

    Args:
        df: Price rows with ``date`` and ``price`` (and ``name`` when
            filtering).
        product_name: Optional case-insensitive substring to match in ``name``.

    Returns:
        One row per date with ``avg_price``, ``median_price``, ``min_price``,
        ``max_price``, ``count``, the relative and absolute change of the
        average versus the previous row, and a 7-row rolling mean.
    """
    if product_name:
        matches = df['name'].str.contains(product_name, case=False, na=False)
        df = df[matches]

    # Per-day aggregates
    per_day = df.groupby('date')['price'].agg(['mean', 'median', 'min', 'max', 'count'])
    daily_prices = per_day.reset_index().rename(columns={
        'mean': 'avg_price',
        'median': 'median_price',
        'min': 'min_price',
        'max': 'max_price',
    })

    average = daily_prices['avg_price']

    # Row-over-row changes
    daily_prices['price_change'] = average.pct_change()
    daily_prices['price_change_abs'] = average.diff()

    # Smoothed trend (moving average over 7 rows)
    daily_prices['avg_price_ma7'] = average.rolling(window=7).mean()

    return daily_prices

# Analyze milk prices
milk_trends = analyze_price_trends(processed_df, "melk")
print("Melkpris trends:")
print(milk_trends.tail())

Butikksammenligning

python
def compare_stores(df: pd.DataFrame, category: str = None) -> pd.DataFrame:
    """Compare price levels between stores.

    Args:
        df: Rows with ``store_name``, ``store_chain``, ``price`` and ``name``
            (plus ``category_name`` when filtering).
        category: Optional case-insensitive substring to match in
            ``category_name``.

    Returns:
        One row per (store, chain) with average and median price, number of
        price rows, number of distinct product names, and ``price_index``
        (100 = the store with the lowest average), sorted by average price.
    """
    if category:
        in_category = df['category_name'].str.contains(category, case=False, na=False)
        df = df[in_category]

    per_store = (
        df.groupby(['store_name', 'store_chain'])
        .agg(
            avg_price=('price', 'mean'),
            median_price=('price', 'median'),
            price_count=('price', 'count'),
            unique_products=('name', 'nunique'),  # number of distinct products
        )
        .reset_index()
    )

    # Price index relative to the cheapest store
    cheapest_average = per_store['avg_price'].min()
    per_store['price_index'] = (per_store['avg_price'] / cheapest_average) * 100

    # Cheapest stores first
    return per_store.sort_values('avg_price')

# Compare stores
store_comparison = compare_stores(processed_df)
print("Butikksammenligning:")
print(store_comparison.head(10))

Sesonganalyse

python
def seasonal_analysis(df: pd.DataFrame) -> pd.DataFrame:
    """Summarize prices per (season, month) and index them to the overall mean.

    Args:
        df: Rows with ``season``, ``month`` and ``price``.

    Returns:
        One row per (season, month) with ``avg_price``, ``price_std``,
        ``count`` and ``seasonal_index`` (100 = overall average price).
    """
    seasonal_stats = (
        df.groupby(['season', 'month'])
        .agg(
            avg_price=('price', 'mean'),
            price_std=('price', 'std'),
            count=('price', 'count'),
        )
        .reset_index()
    )

    # Seasonal index relative to the mean of all rows
    baseline = df['price'].mean()
    seasonal_stats['seasonal_index'] = (seasonal_stats['avg_price'] / baseline) * 100

    return seasonal_stats

# Run the seasonal analysis and print the result
seasonal_data = seasonal_analysis(processed_df)
print("Sesonganalyse:")
print(seasonal_data)

Visualisering

Prisutvikling grafer

python
def plot_price_trends(df: pd.DataFrame, title: str = "Prisutvikling"):
    """Build a two-row Plotly figure of a price trend.

    The top row shows the average price and its 7-row moving average as
    lines; the bottom row shows the relative change in percent as bars
    (green when the change is >= 0, red otherwise, including missing values).

    Args:
        df: Frame with ``date``, ``avg_price``, ``avg_price_ma7`` and
            ``price_change`` columns.
        title: Figure title.

    Returns:
        The Plotly figure (not shown).
    """
    fig = make_subplots(
        rows=2, cols=1,
        subplot_titles=('Prisutvikling', 'Prisendringer (%)'),
        vertical_spacing=0.1
    )

    dates = df['date']

    # Price lines in the top panel
    line_specs = (
        ('avg_price', 'Gjennomsnitt', 'blue'),
        ('avg_price_ma7', '7-dagers glidende snitt', 'red'),
    )
    for column, label, color in line_specs:
        line = go.Scatter(x=dates, y=df[column], name=label, line={'color': color})
        fig.add_trace(line, row=1, col=1)

    # Price changes in the bottom panel, colored by sign
    bar_colors = []
    for change in df['price_change']:
        bar_colors.append('green' if change >= 0 else 'red')
    change_bars = go.Bar(x=dates, y=df['price_change'] * 100,
                         name='Prisendring (%)', marker_color=bar_colors)
    fig.add_trace(change_bars, row=2, col=1)

    fig.update_layout(title=title, height=600, showlegend=True)

    fig.update_xaxes(title_text="Dato")
    fig.update_yaxes(title_text="Pris (NOK)", row=1, col=1)
    fig.update_yaxes(title_text="Endring (%)", row=2, col=1)

    return fig

# Plot the milk price trend (skipped when there is no data)
if not milk_trends.empty:
    fig = plot_price_trends(milk_trends, "Melkpris Utvikling")
    fig.show()

Butikksammenligning visualisering

python
def plot_store_comparison(df: pd.DataFrame, top_n: int = 15):
    """Build side-by-side horizontal bar charts for a store comparison.

    Args:
        df: Frame with ``store_name``, ``avg_price`` and ``price_index``;
            only the first ``top_n`` rows are plotted.
        top_n: Number of leading rows to include.

    Returns:
        The Plotly figure (not shown): average price on the left, price index
        on the right (red above 100, green otherwise).
    """
    shown = df.head(top_n)
    store_labels = shown['store_name']

    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=('Gjennomsnittspris per butikk', 'Prisindeks'),
        specs=[[{"secondary_y": False}, {"secondary_y": False}]]
    )

    # Average price (left panel)
    average_bars = go.Bar(
        x=shown['avg_price'],
        y=store_labels,
        orientation='h',
        name='Gjennomsnittspris',
        marker_color='lightblue',
    )
    fig.add_trace(average_bars, row=1, col=1)

    # Price index (right panel), colored by whether it exceeds the baseline
    index_colors = []
    for index_value in shown['price_index']:
        index_colors.append('red' if index_value > 100 else 'green')
    index_bars = go.Bar(
        x=shown['price_index'],
        y=store_labels,
        orientation='h',
        name='Prisindeks',
        marker_color=index_colors,
    )
    fig.add_trace(index_bars, row=1, col=2)

    fig.update_layout(
        title="Butikksammenligning - Prisanalyse",
        height=600,
        showlegend=False
    )

    fig.update_xaxes(title_text="Pris (NOK)", row=1, col=1)
    fig.update_xaxes(title_text="Prisindeks (100 = billigst)", row=1, col=2)

    return fig

# Plot the store comparison (skipped when there is no data)
if not store_comparison.empty:
    fig = plot_store_comparison(store_comparison)
    fig.show()

Interaktive dashboards

python
def create_interactive_dashboard(df: pd.DataFrame):
    """Show three Plotly Express charts for a price DataFrame.

    Displays, in order: a histogram of ``price``, a box plot of ``price`` per
    ``category_name``, and a scatter of the daily mean price with a LOWESS
    trendline. Returns nothing; each figure is shown immediately.
    """
    # Price distribution histogram
    histogram = px.histogram(df, x='price', nbins=50,
                             title='Prisfordeling',
                             labels={'price': 'Pris (NOK)', 'count': 'Antall'})

    # Price per category box plot
    category_boxes = px.box(df, x='category_name', y='price',
                            title='Prisfordeling per kategori')
    category_boxes.update_xaxes(tickangle=45)

    # Daily average price scatter with trendline
    mean_by_day = df.groupby('date')['price'].mean().reset_index()
    trend_scatter = px.scatter(mean_by_day, x='date', y='price',
                               title='Gjennomsnittspris over tid',
                               trendline='lowess')

    # Show all charts
    for figure in (histogram, category_boxes, trend_scatter):
        figure.show()

# Build the dashboard
create_interactive_dashboard(processed_df)

Butikksammenligning

Geografisk analyse

python
def geographic_analysis(df: pd.DataFrame) -> pd.DataFrame:
    """Average the price per store location.

    Args:
        df: Rows with ``store_name``, ``latitude``, ``longitude`` and
            ``price``.

    Returns:
        One row per (store, latitude, longitude) with ``avg_price`` and
        ``count``, limited to locations with at least 10 price rows. An empty
        DataFrame (after printing a notice) when a coordinate column is
        missing.
    """
    if not {'latitude', 'longitude'}.issubset(df.columns):
        print("Mangler geografiske koordinater")
        return pd.DataFrame()

    # Aggregate per store and coordinate pair
    per_location = (
        df.groupby(['store_name', 'latitude', 'longitude'])
        .agg(avg_price=('price', 'mean'), count=('price', 'count'))
        .reset_index()
    )

    # Drop stores with too few prices
    enough_prices = per_location['count'] >= 10
    return per_location[enough_prices]

def plot_geographic_prices(geo_df: pd.DataFrame):
    """Build a map scatter plot of average price per store location.

    Args:
        geo_df: Frame with ``latitude``, ``longitude``, ``avg_price``,
            ``count`` and ``store_name``.

    Returns:
        The Plotly figure (not shown); marker color encodes ``avg_price`` and
        marker size encodes ``count``.
    """
    hover_columns = ['store_name', 'avg_price', 'count']

    fig = px.scatter_mapbox(
        geo_df,
        lat="latitude",
        lon="longitude",
        color="avg_price",
        size="count",
        hover_data=hover_columns,
        color_continuous_scale="RdYlBu_r",
        title="Geografisk Prisfordeling",
        mapbox_style="open-street-map",
        height=600
    )

    # Initial viewport: the guide's chosen center point for Norway
    map_view = {'center': {'lat': 60.0, 'lon': 10.0}, 'zoom': 5}
    fig.update_layout(mapbox=map_view)

    return fig

# Geographic analysis (the map is only drawn when there is data)
geo_data = geographic_analysis(processed_df)
if not geo_data.empty:
    fig = plot_geographic_prices(geo_data)
    fig.show()

Konkurranseanalyse

python
def competition_analysis(df: pd.DataFrame) -> dict:
    """Summarize price level and data share per store chain.

    Args:
        df: Rows with ``store_chain``, ``store_name``, ``name`` and ``price``.

    Returns:
        A dict with ``chain_stats`` (per-chain price mean/std/min/max,
        distinct stores and products, ``market_share`` in percent of price
        rows, and ``competition_score``), sorted by average price, and
        ``market_share`` (the raw row count per chain).
    """
    chain_stats = (
        df.groupby('store_chain')
        .agg(
            avg_price=('price', 'mean'),
            price_std=('price', 'std'),
            min_price=('price', 'min'),
            max_price=('price', 'max'),
            store_count=('store_name', 'nunique'),
            product_count=('name', 'nunique'),
        )
        .reset_index()
    )

    # Market share, measured as the share of price rows per chain
    total_prices = len(df)
    chain_market_share = df['store_chain'].value_counts()
    rows_per_chain = chain_market_share.to_dict()
    chain_stats['market_share'] = [
        (rows_per_chain.get(chain, 0) / total_prices) * 100
        for chain in chain_stats['store_chain']
    ]

    # Competition score: a lower price and a larger share both raise it
    inverse_price = 1 / chain_stats['avg_price']
    chain_stats['competition_score'] = inverse_price * chain_stats['market_share']

    ranked = chain_stats.sort_values('avg_price')
    return {'chain_stats': ranked, 'market_share': chain_market_share}

# Run the chain-level competition analysis and print the per-chain table
competition = competition_analysis(processed_df)
print("Butikkjede analyse:")
print(competition['chain_stats'])

Tidsserieanalyse

Forecasting med Prophet

python
# Installer prophet: pip install prophet
from prophet import Prophet
from prophet.plot import plot_plotly, plot_components_plotly

def forecast_prices(df: pd.DataFrame, product_name: str, days_ahead: int = 30) -> dict:
    """Forecast the daily mean price of a product with Prophet.

    Args:
        df: Rows with ``name``, ``date`` and ``price``.
        product_name: Case-insensitive substring to match in ``name``.
        days_ahead: Number of future periods to forecast.

    Returns:
        On success a dict with ``model`` (the fitted Prophet instance),
        ``forecast`` (the prediction frame) and ``historical`` (the ``ds`` /
        ``y`` training frame). When no rows match, or fewer than two daily
        observations are available, a dict with a single ``error`` message.
    """
    # Filter the data down to the requested product
    product_df = df[df['name'].str.contains(product_name, case=False, na=False)]

    if product_df.empty:
        return {"error": f"Ingen data funnet for {product_name}"}

    # Prepare the data for Prophet
    prophet_df = product_df.groupby('date')['price'].mean().reset_index()
    prophet_df.columns = ['ds', 'y']  # Prophet requires these column names

    # Prophet cannot be fitted on fewer than two non-missing observations;
    # report that through the same error channel instead of raising.
    if prophet_df['y'].notna().sum() < 2:
        return {"error": f"For lite data for {product_name} (trenger minst 2 datapunkter)"}

    # Train the model
    model = Prophet(
        daily_seasonality=False,
        weekly_seasonality=True,
        yearly_seasonality=True,
        changepoint_prior_scale=0.05
    )

    model.fit(prophet_df)

    # Build the forecast
    future = model.make_future_dataframe(periods=days_ahead)
    forecast = model.predict(future)

    return {
        'model': model,
        'forecast': forecast,
        'historical': prophet_df
    }

# Example: forecast milk prices 60 days ahead
milk_forecast = forecast_prices(processed_df, "melk", days_ahead=60)

if 'error' not in milk_forecast:
    # Plot the forecast
    fig1 = plot_plotly(milk_forecast['model'], milk_forecast['forecast'])
    fig1.update_layout(title="Melkpris Forecast")
    fig1.show()
    
    # Plot the model components
    fig2 = plot_components_plotly(milk_forecast['model'], milk_forecast['forecast'])
    fig2.show()

Sesongdekomposisjon

python
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.pyplot as plt

def decompose_time_series(df: pd.DataFrame, product_name: str):
    """Decompose a product's weekly price series into trend, season, residual.

    Args:
        df: Rows with ``name``, a datetime ``date`` and ``price``.
        product_name: Case-insensitive substring to match in ``name``.

    Returns:
        The ``seasonal_decompose`` result, or ``None`` (after printing a
        notice) when fewer than 104 weekly observations are available. The
        four components are also drawn with matplotlib.
    """
    # Filter and prepare the data
    product_df = df[df['name'].str.contains(product_name, case=False, na=False)]
    daily_prices = product_df.groupby('date')['price'].mean()

    # Resample to weekly data for a clearer seasonal pattern. Gaps are
    # forward-filled with .ffill(); fillna(method='ffill') is deprecated.
    weekly_prices = daily_prices.resample('W').mean().ffill()

    if len(weekly_prices) < 104:  # Needs at least 2 years of weekly data
        print(f"For lite data for {product_name} (trenger minst 104 uker)")
        return None

    # Seasonal decomposition
    decomposition = seasonal_decompose(weekly_prices,
                                       model='additive',
                                       period=52)  # 52 weeks = 1 year

    # Plot the four components on separate axes
    fig, axes = plt.subplots(4, 1, figsize=(15, 12))

    decomposition.observed.plot(ax=axes[0], title=f'{product_name} - Original')
    decomposition.trend.plot(ax=axes[1], title='Trend')
    decomposition.seasonal.plot(ax=axes[2], title='Sesong')
    decomposition.resid.plot(ax=axes[3], title='Residualer')

    plt.tight_layout()
    plt.show()

    return decomposition

# Decompose the milk price series
milk_decomp = decompose_time_series(processed_df, "melk")

Maskinlæring

Prisforecast med Machine Learning

python
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

class PricePredictionModel:
    """Random-forest price regressor with label-encoded categorical features.

    Attributes are plain instance fields:
      - ``model``: the fitted estimator, or ``None`` before training.
      - ``label_encoders``: one fitted ``LabelEncoder`` per categorical column.
      - ``feature_columns``: the columns of the most recent feature matrix,
        in matrix order.
    """

    def __init__(self):
        self.model = None
        self.label_encoders = {}
        self.feature_columns = []

    def prepare_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Build the feature matrix from the columns that are present.

        Categorical columns are label-encoded. An encoder is fitted the first
        time its column is seen and reused afterwards; labels that the
        encoder has not seen are encoded as -1 instead of raising.

        ``feature_columns`` is set to exactly the returned columns, so that
        ``feature_importance()`` lines up with the fitted model even when
        some candidate features are missing from ``df``.
        """
        features_df = df.copy()

        # Encode categorical variables
        categorical_columns = ['store_chain', 'category_name', 'brand_name', 'season']

        for col in categorical_columns:
            if col not in features_df.columns:
                continue

            values = features_df[col].astype(str)
            if col not in self.label_encoders:
                self.label_encoders[col] = LabelEncoder()
                features_df[col] = self.label_encoders[col].fit_transform(values)
            else:
                # Same codes as LabelEncoder.transform (position in classes_),
                # with -1 for labels that were not present when fitting.
                known_codes = {
                    label: code
                    for code, label in enumerate(self.label_encoders[col].classes_)
                }
                features_df[col] = values.map(known_codes).fillna(-1).astype(int)

        # Candidate features: calendar columns followed by the categoricals
        numeric_features = ['year', 'month', 'week', 'day_of_week', 'is_weekend']
        candidates = numeric_features + categorical_columns

        self.feature_columns = [col for col in candidates if col in features_df.columns]

        return features_df[self.feature_columns]

    def train(self, df: pd.DataFrame, target_column: str = 'price'):
        """Grid-search a random forest and evaluate it on a 20% hold-out.

        Returns:
            A tuple ``(metrics, X_test, y_test, y_pred)`` where ``metrics``
            holds ``mae``, ``rmse``, ``r2`` and ``best_params``.
        """
        # Prepare features
        X = self.prepare_features(df)
        y = df[target_column]

        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Grid search for the best hyperparameters
        param_grid = {
            'n_estimators': [100, 200],
            'max_depth': [10, 20, None],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2]
        }

        rf = RandomForestRegressor(random_state=42)
        grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='neg_mean_absolute_error')
        grid_search.fit(X_train, y_train)

        self.model = grid_search.best_estimator_

        # Evaluation on the hold-out set
        y_pred = self.model.predict(X_test)

        metrics = {
            'mae': mean_absolute_error(y_test, y_pred),
            'rmse': np.sqrt(mean_squared_error(y_test, y_pred)),
            'r2': r2_score(y_test, y_pred),
            'best_params': grid_search.best_params_
        }

        return metrics, X_test, y_test, y_pred

    def predict(self, df: pd.DataFrame) -> np.ndarray:
        """Predict prices for ``df``.

        Raises:
            ValueError: If the model has not been trained yet.
        """
        if self.model is None:
            raise ValueError("Modell må trenes først")

        X = self.prepare_features(df)
        return self.model.predict(X)

    def feature_importance(self) -> pd.DataFrame:
        """Return features and importances, most important first.

        Returns an empty DataFrame when the model has not been trained.
        """
        if self.model is None:
            return pd.DataFrame()

        importance_df = pd.DataFrame({
            'feature': self.feature_columns,
            'importance': self.model.feature_importances_
        }).sort_values('importance', ascending=False)

        return importance_df

# Train the model
price_model = PricePredictionModel()
metrics, X_test, y_test, y_pred = price_model.train(processed_df)

print("Modell ytelse:")
print(f"MAE: {metrics['mae']:.2f} NOK")
print(f"RMSE: {metrics['rmse']:.2f} NOK") 
print(f"R²: {metrics['r2']:.3f}")
print(f"Best params: {metrics['best_params']}")

# Feature importance
importance = price_model.feature_importance()
print("\nFeature Importance:")
print(importance)

Anomalideteksjon

python
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN

def detect_price_anomalies(df: pd.DataFrame) -> pd.DataFrame:
    """Flag unusual price rows with an Isolation Forest.

    Args:
        df: Rows with ``price``, ``year``, ``month`` and ``day_of_week``.

    Returns:
        A copy of ``df`` with a boolean ``is_anomaly`` column; about 10% of
        the rows are flagged (``contamination=0.1``).
    """
    # Model input: missing values are replaced by the column mean
    model_columns = ['price', 'year', 'month', 'day_of_week']
    model_input = df[model_columns].copy()
    model_input = model_input.fillna(model_input.mean())

    # Isolation Forest labels outliers with -1
    detector = IsolationForest(contamination=0.1, random_state=42)
    labels = detector.fit_predict(model_input)

    # Attach the anomaly flag to a copy of the input
    return df.assign(is_anomaly=labels == -1)

# Find anomalies
df_anomalies = detect_price_anomalies(processed_df)
anomalies_found = df_anomalies[df_anomalies['is_anomaly']]

print(f"Fant {len(anomalies_found)} prisanomalier")
print("Eksempler på anomalier:")
print(anomalies_found[['name', 'store_name', 'price', 'date']].head())

Rapporter

Automatiserte rapporter

python
from datetime import datetime
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

class PriceReport:
    """Markdown price report generated from a processed price DataFrame.

    The report date is fixed to the day the instance is created.
    """

    def __init__(self, df: pd.DataFrame):
        self.df = df
        self.report_date = datetime.now().strftime('%Y-%m-%d')

    def generate_summary_stats(self) -> dict:
        """Return headline figures: counts, price statistics and date range."""
        prices = self.df['price']
        dates = self.df['date']
        date_range = {
            'from': dates.min().strftime('%Y-%m-%d'),
            'to': dates.max().strftime('%Y-%m-%d'),
        }
        return {
            'total_products': self.df['name'].nunique(),
            'total_prices': len(self.df),
            'avg_price': prices.mean(),
            'median_price': prices.median(),
            'price_range': prices.max() - prices.min(),
            'stores_count': self.df['store_name'].nunique(),
            'categories_count': self.df['category_name'].nunique(),
            'date_range': date_range,
        }

    def generate_markdown_report(self) -> str:
        """Render the report as Markdown text.

        Sections: summary figures, the 10 most expensive price rows, and the
        10 stores with the lowest average price.
        """
        stats = self.generate_summary_stats()
        period = stats['date_range']

        lines = [
            "",
            "# Kassalapp Prisrapport",
            f"**Generert:** {self.report_date}",
            "",
            "## Sammendrag",
            f"- **Antall produkter:** {stats['total_products']:,}",
            f"- **Antall priser:** {stats['total_prices']:,}",
            f"- **Gjennomsnittspris:** {stats['avg_price']:.2f} NOK",
            f"- **Medianpris:** {stats['median_price']:.2f} NOK",
            f"- **Prisområde:** {stats['price_range']:.2f} NOK",
            f"- **Antall butikker:** {stats['stores_count']}",
            f"- **Antall kategorier:** {stats['categories_count']}",
            f"- **Dataperiode:** {period['from']} til {period['to']}",
            "",
            "## Top 10 Dyreste Produkter",
        ]

        # Most expensive products
        priciest = self.df.nlargest(10, 'price')[['name', 'store_name', 'price']]
        for _, entry in priciest.iterrows():
            lines.append(
                f"- **{entry['name']}** - {entry['store_name']} - {entry['price']:.2f} NOK"
            )

        lines.append("")
        lines.append("## Top 10 Billigste Butikker (gjennomsnitt)")

        # Cheapest stores by average price
        cheapest = self.df.groupby('store_name')['price'].mean().nsmallest(10)
        for store, average in cheapest.items():
            lines.append(f"- **{store}** - {average:.2f} NOK")

        # Every line, including the last, ends with a newline
        return "\n".join(lines) + "\n"

    def save_report(self, filename: str = None):
        """Write the Markdown report to disk and return the file name.

        Args:
            filename: Target path; defaults to
                ``kassalapp_rapport_<report date>.md``.
        """
        target = filename
        if target is None:
            target = f"kassalapp_rapport_{self.report_date}.md"

        content = self.generate_markdown_report()

        with open(target, 'w', encoding='utf-8') as handle:
            handle.write(content)

        print(f"Rapport lagret som {target}")
        return target

# Generate the report, save it to disk, and print it
reporter = PriceReport(processed_df)
report_file = reporter.save_report()
print(reporter.generate_markdown_report())

Excel-rapporter

python
def create_excel_report(df: pd.DataFrame, filename: str = None):
    """Write a multi-sheet Excel workbook summarizing the price data.

    Sheets, in order: ``Sammendrag`` (headline metrics),
    ``Dyreste_Produkter`` (the 50 highest price rows), ``Butikker``
    (per-store statistics) and ``Kategorier`` (per-category statistics).

    Args:
        df: Rows with ``name``, ``store_name``, ``category_name``, ``price``
            and ``date``.
        filename: Target path; defaults to ``kassalapp_rapport_<YYYYMMDD>.xlsx``.
    """
    if filename is None:
        stamp = datetime.now().strftime('%Y%m%d')
        filename = f"kassalapp_rapport_{stamp}.xlsx"

    prices = df['price']

    # Summary sheet
    summary_sheet = pd.DataFrame({
        'Metrikk': ['Antall produkter', 'Antall priser', 'Gjennomsnittspris',
                    'Medianpris', 'Antall butikker'],
        'Verdi': [df['name'].nunique(), len(df), prices.mean(),
                  prices.median(), df['store_name'].nunique()],
    })

    # Most expensive products
    expensive_sheet = df.nlargest(50, 'price')[['name', 'store_name', 'price', 'date']]

    # Store comparison
    store_sheet = df.groupby('store_name').agg({
        'price': ['mean', 'median', 'count'],
        'name': 'nunique'
    }).reset_index()
    store_sheet.columns = ['Butikk', 'Gj.snitt', 'Median', 'Antall_Priser', 'Unike_Produkter']

    # Categories
    category_sheet = df.groupby('category_name').agg({
        'price': ['mean', 'count']
    }).reset_index()
    category_sheet.columns = ['Kategori', 'Gj.snitt_Pris', 'Antall']

    sheets = (
        ('Sammendrag', summary_sheet),
        ('Dyreste_Produkter', expensive_sheet),
        ('Butikker', store_sheet),
        ('Kategorier', category_sheet),
    )

    with pd.ExcelWriter(filename, engine='openpyxl') as writer:
        for sheet_name, sheet in sheets:
            sheet.to_excel(writer, sheet_name=sheet_name, index=False)

    print(f"Excel rapport lagret som {filename}")

# Create the Excel report
create_excel_report(processed_df)

Dashboard

Streamlit Dashboard

python
# Lagre som streamlit_app.py og kjør: streamlit run streamlit_app.py
import streamlit as st
import pandas as pd
import plotly.express as px

def create_streamlit_dashboard():
    """Render the Kassalapp price dashboard with Streamlit.

    Draws, top to bottom: sidebar filters for stores and categories, four
    headline metrics, a price histogram next to a per-store average bar
    chart, a daily average price line chart, and a table with the first
    1000 filtered rows. The data comes from the module-level
    ``processed_df``.
    """
    st.set_page_config(page_title="Kassalapp Analytics", layout="wide")

    st.title("🛒 Kassalapp Prisanalyse Dashboard")
    st.sidebar.title("Navigasjon")

    # Cached loader; for now it simply hands back the in-memory frame.
    @st.cache_data
    def load_data():
        # Placeholder: this is where the data would be loaded from the API
        return processed_df

    data = load_data()

    # --- Sidebar filters ---
    st.sidebar.header("Filtre")

    store_options = data['store_name'].unique()
    chosen_stores = st.sidebar.multiselect(
        "Velg butikker:",
        options=store_options,
        default=store_options[:5],
    )

    category_options = data['category_name'].unique()
    chosen_categories = st.sidebar.multiselect(
        "Velg kategorier:",
        options=category_options,
        default=category_options[:3],
    )

    # Keep only rows that match both sidebar selections.
    store_match = data['store_name'].isin(chosen_stores)
    category_match = data['category_name'].isin(chosen_categories)
    view = data[store_match & category_match]

    # --- Headline metrics ---
    products_col, average_col, stores_col, rows_col = st.columns(4)

    with products_col:
        st.metric("Antall produkter", view['name'].nunique())

    with average_col:
        st.metric("Gjennomsnittspris", f"{view['price'].mean():.2f} NOK")

    with stores_col:
        st.metric("Antall butikker", view['store_name'].nunique())

    with rows_col:
        st.metric("Totalt antall priser", len(view))

    # --- Charts, side by side ---
    left, right = st.columns(2)

    with left:
        st.subheader("Prisfordeling")
        histogram = px.histogram(view, x='price', nbins=30)
        st.plotly_chart(histogram, use_container_width=True)

    with right:
        st.subheader("Gjennomsnittspris per butikk")
        mean_by_store = view.groupby('store_name')['price'].mean().sort_values()
        bars = px.bar(x=mean_by_store.values, y=mean_by_store.index,
                      orientation='h')
        st.plotly_chart(bars, use_container_width=True)

    # --- Time series ---
    st.subheader("Prisutvikling over tid")
    mean_by_day = view.groupby('date')['price'].mean().reset_index()
    trend = px.line(mean_by_day, x='date', y='price')
    st.plotly_chart(trend, use_container_width=True)

    # --- Raw data table (capped at 1000 rows) ---
    st.subheader("Rå data")
    st.dataframe(view.head(1000))

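# For å kjøre dashboard: legg til et kall til create_streamlit_dashboard()
# nederst i streamlit_app.py, og kjør deretter:
# streamlit run streamlit_app.py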

Dash Dashboard

python
import dash
from dash import dcc, html, Input, Output
import plotly.express as px

def create_dash_dashboard():
    """Build the Dash application for interactive price exploration.

    The layout has a multi-select dropdown of store chains, a date range
    picker and three graphs (price trend, per-store average, per-category
    average). One callback redraws all three graphs whenever a filter
    changes. All data is read from the module-level ``processed_df``.

    Returns:
        The configured ``dash.Dash`` application (not yet running).
    """
    app = dash.Dash(__name__)

    # Drop missing chain names so they do not become dropdown options, and
    # convert to a plain list of values for the component properties.
    store_chains = processed_df['store_chain'].dropna().unique().tolist()

    app.layout = html.Div([
        html.H1("Kassalapp Analytics Dashboard",
                style={'text-align': 'center'}),

        html.Div([
            html.Div([
                html.Label("Velg butikkjeder:"),
                dcc.Dropdown(
                    id='store-dropdown',
                    options=[{'label': chain, 'value': chain}
                             for chain in store_chains],
                    value=store_chains[:3],  # first three chains preselected
                    multi=True
                )
            ], className='six columns'),

            html.Div([
                html.Label("Velg tidsperiode:"),
                dcc.DatePickerRange(
                    id='date-picker',
                    start_date=processed_df['date'].min(),
                    end_date=processed_df['date'].max()
                )
            ], className='six columns'),
        ], className='row'),

        html.Div([
            dcc.Graph(id='price-trend-graph')
        ]),

        html.Div([
            html.Div([
                dcc.Graph(id='store-comparison')
            ], className='six columns'),

            html.Div([
                dcc.Graph(id='category-analysis')
            ], className='six columns'),
        ], className='row')
    ])

    @app.callback(
        [Output('price-trend-graph', 'figure'),
         Output('store-comparison', 'figure'),
         Output('category-analysis', 'figure')],
        [Input('store-dropdown', 'value'),
         Input('date-picker', 'start_date'),
         Input('date-picker', 'end_date')]
    )
    def update_graphs(selected_stores, start_date, end_date):
        """Rebuild the three figures for the current filter selection."""
        # Any input may be None (presumably when the user clears the dropdown
        # or a date - confirm against the Dash component docs). isin(None)
        # and comparing the date column with None both raise TypeError, so a
        # missing selection means "no chains" and a missing date bound means
        # "unbounded on that side".
        in_selection = processed_df['store_chain'].isin(selected_stores or [])
        if start_date is not None:
            in_selection &= processed_df['date'] >= start_date
        if end_date is not None:
            in_selection &= processed_df['date'] <= end_date
        filtered_df = processed_df[in_selection]

        # Price trend: mean price per date
        daily_prices = filtered_df.groupby('date')['price'].mean().reset_index()
        trend_fig = px.line(daily_prices, x='date', y='price',
                            title='Prisutvikling over tid')

        # Store comparison: mean price per store, ascending
        store_avg = filtered_df.groupby('store_name')['price'].mean().sort_values()
        store_fig = px.bar(x=store_avg.index, y=store_avg.values,
                           title='Gjennomsnittspris per butikk')

        # Category analysis: mean price per category, horizontal bars
        cat_avg = filtered_df.groupby('category_name')['price'].mean().sort_values()
        cat_fig = px.bar(x=cat_avg.values, y=cat_avg.index, orientation='h',
                         title='Gjennomsnittspris per kategori')

        return trend_fig, store_fig, cat_fig

    return app

# Kjør dashboard
# app = create_dash_dashboard()
# app.run(debug=True)  # app.run_server() er fjernet i Dash 3; bruk app.run()

Praktiske eksempler

Case 1: Melkeprisanalyse

python
def melkpris_case_study(df: pd.DataFrame):
    """Run a small end-to-end case study of milk prices and print the findings.

    Prints basic statistics, the ten cheapest milk rows, a per-store
    comparison (stores with at least five milk prices) and the change in the
    daily mean price over the last 7 and 30 observations.

    Args:
        df: Price rows with the columns ``name``, ``store_name``, ``price``
            and ``date``.

    Returns:
        The rows of ``df`` whose ``name`` contains "melk" (case-insensitive).
        The frame is empty when nothing matched; in that case only the
        header lines are printed.
    """
    print("=== MELKPRIS ANALYSE ===\n")

    # 1. Filter milk products; rows with a missing name never match.
    melk_df = df[df['name'].str.contains('melk', case=False, na=False)]
    print(f"Fant {len(melk_df)} melkpriser")

    if melk_df.empty:
        # Without rows the statistics below are all NaN and the trend lookup
        # (.iloc[-1]) would raise IndexError, so stop here.
        print("Ingen melkprodukter funnet")
        return melk_df

    # 2. Basic statistics
    print(f"Gjennomsnittspris: {melk_df['price'].mean():.2f} NOK")
    print(f"Medianpris: {melk_df['price'].median():.2f} NOK")
    print(f"Prisområde: {melk_df['price'].min():.2f} - {melk_df['price'].max():.2f} NOK\n")

    # 3. Best deals: the ten lowest-priced rows
    print("=== BESTE MELKTILBUD ===")
    best_deals = melk_df.nsmallest(10, 'price')[['name', 'store_name', 'price']]
    for _, row in best_deals.iterrows():
        print(f"{row['name'][:30]:30} - {row['store_name']:15} - {row['price']:6.2f} NOK")

    # 4. Store comparison
    print("\n=== BUTIKKSAMMENLIGNING MELK ===")
    store_melk = melk_df.groupby('store_name')['price'].agg(['mean', 'count']).sort_values('mean')
    store_melk = store_melk[store_melk['count'] >= 5]  # only stores with at least 5 milk prices

    for store, data in store_melk.head(10).iterrows():
        # iterrows() upcasts the row to float, so cast the count back to int
        # to print "5" rather than "5.0".
        print(f"{store:20} - Gj.snitt: {data['mean']:6.2f} NOK ({int(data['count'])} produkter)")

    # 5. Price development
    print("\n=== PRISUTVIKLING ===")
    daily_melk = melk_df.groupby('date')['price'].mean()

    def _latest_change(periods):
        # pct_change counts rows, not calendar days: the figure is a true
        # N-day change only when every date is present in the data.
        if len(daily_melk) <= periods:
            return float('nan')
        return daily_melk.pct_change(periods=periods).iloc[-1] * 100

    for label, periods in (("uke", 7), ("måned", 30)):
        change = _latest_change(periods)
        if pd.isna(change):
            print(f"Endring siste {label}: ikke nok data")
        else:
            print(f"Endring siste {label}: {change:+.1f}%")

    return melk_df

# Run the milk case study; the returned DataFrame holds the milk rows only.
melk_analysis = melkpris_case_study(processed_df)

Case 2: Sesongvariasjoner i frukt/grønt

python
def seasonal_produce_analysis(df: pd.DataFrame):
    """Print and plot how produce prices vary across months.

    Rows are limited to the categories Frukt, Grønnsaker, Poteter and Bær.
    For each category present, the coefficient of variation (std / mean) of
    its monthly mean price is printed, highest first, and the monthly means
    are drawn as a line chart.

    Args:
        df: Price rows with the columns ``category_name``, ``month`` and
            ``price``.

    Returns:
        None. When no produce rows exist, a message is printed and the
        function returns early.
    """
    produce_categories = ['Frukt', 'Grønnsaker', 'Poteter', 'Bær']
    is_produce = df['category_name'].isin(produce_categories)
    produce_df = df[is_produce]

    if produce_df.empty:
        print("Ingen frukt/grønt data funnet")
        return

    print("=== SESONGANALYSE FRUKT/GRØNT ===\n")

    # Rows: month, columns: category, values: mean price.
    mean_price = produce_df.groupby(['month', 'category_name'])['price'].mean()
    monthly_prices = mean_price.unstack()

    # Coefficient of variation per category as the seasonality measure.
    seasonal_variation = {}
    for category in produce_categories:
        if category not in monthly_prices.columns:
            continue
        prices = monthly_prices[category].dropna()
        if len(prices) == 0:
            continue
        seasonal_variation[category] = prices.std() / prices.mean()

    print("Sesongvariasjon (høyere = mer sesongavhengig):")
    ranked = sorted(seasonal_variation, key=seasonal_variation.get, reverse=True)
    for category in ranked:
        print(f"{category:15} - CV: {seasonal_variation[category]:.3f}")

    # Plot the seasonal pattern, one line per category.
    if not monthly_prices.empty:
        fig = px.line(monthly_prices.reset_index(),
                      x='month', y=monthly_prices.columns,
                      title='Sesongvariasjoner Frukt/Grønt')
        fig.show()

# Run the seasonal produce analysis on the processed data (prints its
# findings and shows a plot).
seasonal_produce_analysis(processed_df)

Case 3: Lojalitetsprogramanalyse for butikkjeder

python
def loyalty_program_analysis(df: pd.DataFrame):
    """Print how chains with a loyalty programme price against the rest.

    For each listed chain found in ``df``, the chain's mean price is
    compared with the mean price of all rows from other chains, and the
    relative difference is reported in percent. Chains without rows, or
    without any competitor rows, are left out.

    Args:
        df: Price rows with the columns ``store_chain`` and ``price``.
    """
    # Chains with a known loyalty programme, mapped to the programme name.
    loyalty_stores = {
        'REMA 1000': 'Æ',
        'Coop': 'Coop Medlemspris',
        'Kiwi': 'Kiwi Pluss',
        'Meny': 'Meny Pluss'
    }

    loyalty_analysis = {}

    for store_chain, program in loyalty_stores.items():
        belongs_to_chain = df['store_chain'] == store_chain
        chain_df = df[belongs_to_chain]
        other_stores = df[~belongs_to_chain]

        # Both sides of the comparison need at least one row.
        if chain_df.empty or other_stores.empty:
            continue

        chain_avg = chain_df['price'].mean()
        others_avg = other_stores['price'].mean()

        loyalty_analysis[store_chain] = {
            'program': program,
            'avg_price': chain_avg,
            # Relative difference to competitors, in percent.
            'price_vs_competitors': ((chain_avg - others_avg) / others_avg) * 100,
            'product_count': len(chain_df)
        }

    print("=== LOYALITETSPROGRAM ANALYSE ===\n")

    for chain, summary in loyalty_analysis.items():
        print(f"{chain} ({summary['program']}):")
        print(f"  Gjennomsnittspris: {summary['avg_price']:.2f} NOK")
        print(f"  vs Konkurrenter: {summary['price_vs_competitors']:+.1f}%")
        print(f"  Antall produkter: {summary['product_count']}")
        print()

# Print the loyalty-programme price comparison for the processed data.
loyalty_program_analysis(processed_df)

Case 4: Inflasjonsanalyse

python
def inflation_analysis(df: pd.DataFrame):
    """Print month-over-month and year-over-year food price inflation.

    Inflation is measured on the mean of ``price`` per calendar month
    (``date`` converted to a monthly period), both overall and per
    ``category_name``.

    Note: the yearly figures use ``pct_change(periods=12)``, which compares
    each row with the row 12 positions earlier. That is a true 12-month
    comparison only when no month is missing from the data.

    Args:
        df: Price rows with a datetime ``date`` column plus the columns
            ``price`` and ``category_name``.
    """
    month = df['date'].dt.to_period('M')
    monthly_avg = df.groupby(month)['price'].mean()

    print("=== MATVAREINFLASJON ANALYSE ===\n")

    if monthly_avg.empty:
        # Nothing to measure; the .iloc[-1] lookups below would raise.
        print("Ingen prisdata å analysere")
        return

    # Monthly inflation
    monthly_inflation = monthly_avg.pct_change() * 100

    # Annual inflation (12 rows back, see the note in the docstring)
    annual_inflation = monthly_avg.pct_change(periods=12) * 100

    print("Siste 6 måneders inflasjon:")
    # The first month has no predecessor and is NaN; leave it out.
    for period, inflation in monthly_inflation.dropna().tail(6).items():
        print(f"{period}: {inflation:+.1f}%")

    latest_annual = annual_inflation.iloc[-1]
    if pd.isna(latest_annual):
        print("\nÅrlig inflasjon (siste 12 mnd): ikke nok data")
    else:
        print(f"\nÅrlig inflasjon (siste 12 mnd): {latest_annual:+.1f}%")

    # Inflation per category: year-over-year change within each category,
    # then the most recent month of every category. (Taking .iloc[-1] of the
    # whole series would give a single scalar, not one value per category.)
    category_monthly = df.groupby(['category_name', month])['price'].mean()
    category_yoy = (
        category_monthly.groupby(level='category_name').pct_change(periods=12) * 100
    )
    category_inflation = (
        category_yoy.groupby(level='category_name')
        .tail(1)
        .droplevel(-1)   # keep only the category name as index
        .dropna()        # categories with too little history
    )

    print("\nInflasjon per kategori (årlig):")
    if category_inflation.empty:
        print("  Ikke nok historikk (krever minst 13 måneder)")
        return

    for category, inflation in category_inflation.sort_values(ascending=False).head(10).items():
        print(f"{category:20}: {inflation:+.1f}%")

# Print the inflation analysis for the processed data.
inflation_analysis(processed_df)

Case 5: Konkurranseanalyse

python
def competition_market_analysis(df: pd.DataFrame):
    """Print a competition overview of the grocery chains in ``df``.

    Four sections are printed: share of listings per chain (top 10), mean
    and median price for the ten chains with the lowest mean price, price
    spread for the five chains with the highest standard deviation, and
    which chains have a median price more than 10% above or below the
    overall median.

    Args:
        df: Price rows with the columns ``store_chain`` and ``price``.
    """
    print("=== KONKURRANSEANALYSE ===\n")

    # 1. Market share, measured as the share of product listings.
    listings_per_chain = df['store_chain'].value_counts()
    total_listings = len(df)

    print("Markedsandeler (basert på produktlistinger):")
    for chain, count in listings_per_chain.head(10).items():
        percentage = (count / total_listings) * 100
        print(f"{chain:15}: {percentage:5.1f}% ({count:,} produkter)")

    # 2. Price positioning, cheapest chains (by mean price) first.
    print("\n=== PRISPOSISJONERING ===")
    price_by_chain = df.groupby('store_chain')['price']
    chain_pricing = price_by_chain.agg(['mean', 'median', 'std']).sort_values('mean')

    for stats in chain_pricing.head(10).itertuples():
        print(f"{stats.Index:15}: Gj.snitt {stats.mean:6.2f} NOK, Median {stats.median:6.2f} NOK")

    # 3. Price spread per chain; the original author reads a high standard
    #    deviation as a sign of high competition.
    print("\n=== KONKURRANSEINTENSITET ===")
    spread_ranking = chain_pricing['std'].sort_values(ascending=False)

    for chain, std in spread_ranking.head(5).items():
        cv = std / chain_pricing.at[chain, 'mean']  # coefficient of variation
        print(f"{chain:15}: Prisspredning {std:5.2f} NOK (CV: {cv:.2f})")

    # 4. Premium vs budget positioning relative to the overall median price.
    overall_median = df['price'].median()
    premium_cutoff = overall_median * 1.1
    budget_cutoff = overall_median * 0.9

    chain_medians = chain_pricing['median']
    premium_chains = chain_pricing[chain_medians > premium_cutoff]
    budget_chains = chain_pricing[chain_medians < budget_cutoff]

    print(f"\nPremium butikker (>10% over median {overall_median:.2f} NOK):")
    for chain in premium_chains.index[:5]:
        print(f"  {chain}")

    print("\nBudget butikker (<10% under median):")
    for chain in budget_chains.index[:5]:
        print(f"  {chain}")

# Print the competition analysis for the processed data.
competition_market_analysis(processed_df)

Avanserte analyseteknikker

Priskorrelasjon og markedssegmenter

python
def market_segmentation_analysis(df: pd.DataFrame, n_clusters: int = 4):
    """Cluster stores into market segments with K-means and plot the result.

    Each store is described by its mean price, price standard deviation,
    number of unique products and number of unique categories. These
    features are standardised and clustered; a per-segment summary is
    printed and a scatter plot (mean price vs. product count) is shown.

    Args:
        df: Price rows with the columns ``store_name``, ``price``, ``name``
            and ``category_name``.
        n_clusters: Desired number of segments (default 4). It is capped at
            the number of stores, because KMeans cannot form more clusters
            than there are samples.

    Returns:
        One row per store with the aggregated features and a ``segment``
        column holding the cluster label. With no stores at all, the empty
        feature frame is returned and nothing is clustered or plotted.
    """
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import StandardScaler

    # Prepare per-store features for clustering
    store_features = df.groupby('store_name').agg({
        'price': ['mean', 'std', 'min', 'max'],
        'name': 'nunique',
        'category_name': 'nunique'
    }).reset_index()

    store_features.columns = ['store_name', 'avg_price', 'price_std', 'min_price',
                              'max_price', 'product_count', 'category_count']

    if store_features.empty:
        # Scaling and clustering both reject an empty sample set.
        print("Ingen butikker å segmentere")
        store_features['segment'] = pd.Series(dtype=int)
        return store_features

    # Standardise the features; a store with a single price has a NaN std,
    # which is replaced by 0.
    features = ['avg_price', 'price_std', 'product_count', 'category_count']
    X = store_features[features].fillna(0)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # K-means clustering; never ask for more clusters than there are stores
    # (KMeans raises ValueError in that case).
    effective_clusters = min(n_clusters, len(store_features))
    kmeans = KMeans(n_clusters=effective_clusters, random_state=42)
    store_features['segment'] = kmeans.fit_predict(X_scaled)

    # Summarise the segments
    segment_analysis = store_features.groupby('segment').agg({
        'avg_price': ['mean', 'std'],
        'product_count': 'mean',
        'category_count': 'mean',
        'store_name': 'count'
    }).round(2)

    print("=== MARKEDSSEGMENTER ===")
    print(segment_analysis)

    # Plot the segments
    fig = px.scatter(store_features,
                     x='avg_price',
                     y='product_count',
                     color='segment',
                     hover_data=['store_name'],
                     title='Butikksegmentering')
    fig.show()

    return store_features

# Segment the stores; the returned DataFrame has one row per store, including
# its `segment` label.
segmentation = market_segmentation_analysis(processed_df)

Dette dokumentet gir et komplett rammeverk for dataanalyse av norske matvarepriser ved bruk av Kassalapp API. Hver seksjon inneholder praktiske eksempler som kan tilpasses spesifikke analysebehov.

For å komme i gang:

  1. Installer nødvendige pakker
  2. Få API-tilgang fra Kassalapp
  3. Kjør eksempelkoden i Jupyter Notebook
  4. Tilpass analysene til dine behov

Med disse verktøyene kan du utføre alt fra enkle prissammenligninger til avanserte forecasting-modeller og markedsanalyser.