Kassalapp Data Analytics Guide
A complete guide to data analysis of Norwegian grocery prices using the Kassalapp API.
Table of Contents
- Getting Started
- Authentication
- Data Collection
- Data Processing
- Price Analysis
- Visualization
- Store Comparison
- Time Series Analysis
- Machine Learning
- Reports
- Dashboard
- Practical Examples
- Advanced Analysis Techniques
Getting Started
Python Environment Setup
Create a virtual environment and install the required packages:
bash
python -m venv .venv
# Linux/Mac
source .venv/bin/activate
# Windows
# .venv\Scripts\activate
pip install pandas==2.3.2 plotly==5.24.1 scikit-learn jupyter requests seaborn matplotlib streamlit dash numpy scipy statsmodels openpyxl
Jupyter Notebook Setup
Start Jupyter Notebook for interactive analysis:
bash
jupyter notebook
Basic imports
python
import pandas as pd
import numpy as np
import requests
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')
# Sett pandas display opsjoner
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
Authentication
API Key Management
python
class KassalappAPI:
def __init__(self, api_key: str):
self.base_url = "https://kassal.app/api/v1"
self.headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
"Accept": "application/json"
}
self.session = requests.Session()
self.session.headers.update(self.headers)
def get(self, endpoint: str, params: dict = None) -> dict:
"""Generisk GET request til API"""
url = f"{self.base_url}/{endpoint.lstrip('/')}"
response = self.session.get(url, params=params)
response.raise_for_status()
return response.json()
def get_paginated(self, endpoint: str, params: dict = None) -> list:
"""Hent alle sider av paginerte data"""
all_data = []
page = 1
while True:
current_params = (params or {}).copy()
current_params['page'] = page
data = self.get(endpoint, current_params)
if 'data' in data:
all_data.extend(data['data'])
if page >= data.get('last_page', 1):
break
else:
all_data.extend(data if isinstance(data, list) else [data])
break
page += 1
return all_data
# Initialiser API client
api = KassalappAPI("din_api_nøkkel_her")
Secure Key Management
python
import os
from pathlib import Path
def get_api_key():
"""Hent API nøkkel fra miljøvariabel eller fil"""
# Fra miljøvariabel
if 'KASSALAPP_API_KEY' in os.environ:
return os.environ['KASSALAPP_API_KEY']
# Fra fil
key_file = Path.home() / '.kassalapp_key'
if key_file.exists():
return key_file.read_text().strip()
raise ValueError("API nøkkel ikke funnet. Sett KASSALAPP_API_KEY miljøvariabel eller lag ~/.kassalapp_key fil")
api_key = get_api_key()
api = KassalappAPI(api_key)
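An alternative, if you prefer a project-local `.env` file, is the `python-dotenv` package (not in the install list above; add it with `pip install python-dotenv`). A minimal sketch:
python
# pip install python-dotenv
import os
from dotenv import load_dotenv

load_dotenv()  # reads KEY=VALUE pairs from a .env file in the working directory
api = KassalappAPI(os.environ["KASSALAPP_API_KEY"])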
Data Collection
Product Data
python
def get_products_df(category_id: int = None, limit: int = None) -> pd.DataFrame:
"""Hent produktdata som DataFrame"""
params = {}
if category_id:
params['category_id'] = category_id
if limit:
params['limit'] = limit
products = api.get_paginated('products', params)
df = pd.json_normalize(products)
# Konverter datoer
date_columns = ['created_at', 'updated_at']
for col in date_columns:
if col in df.columns:
df[col] = pd.to_datetime(df[col])
return df
# Hent alle produkter
products_df = get_products_df()
print(f"Hentet {len(products_df)} produkter")
print(products_df.head())
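Fetching the full product catalogue can take a while on repeated runs. A simple local cache (a sketch; the file name is arbitrary and `to_parquet` requires `pyarrow` or `fastparquet`) avoids hitting the API every time the notebook restarts:
python
from pathlib import Path

PRODUCT_CACHE = Path("products_cache.parquet")  # hypothetical local cache file

def get_products_cached(refresh: bool = False) -> pd.DataFrame:
    """Return cached products if available, otherwise fetch from the API and store them."""
    if PRODUCT_CACHE.exists() and not refresh:
        return pd.read_parquet(PRODUCT_CACHE)
    df = get_products_df()
    df.to_parquet(PRODUCT_CACHE, index=False)
    return df

products_df = get_products_cached()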
Price Data
python
def get_prices_df(product_id: int = None, store_id: int = None,
date_from: str = None, date_to: str = None) -> pd.DataFrame:
"""Hent prisdata som DataFrame"""
params = {}
if product_id:
params['product_id'] = product_id
if store_id:
params['store_id'] = store_id
if date_from:
params['date_from'] = date_from
if date_to:
params['date_to'] = date_to
prices = api.get_paginated('prices', params)
df = pd.json_normalize(prices)
if not df.empty:
# Konverter kolonner
df['price'] = pd.to_numeric(df['price'])
df['date'] = pd.to_datetime(df['date'])
df['created_at'] = pd.to_datetime(df['created_at'])
# Sorter etter dato
df = df.sort_values('date')
return df
# Eksempel: Hent prisdata for siste 30 dager
date_from = (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')
prices_df = get_prices_df(date_from=date_from)
print(f"Hentet {len(prices_df)} priser")
Store Data
python
def get_stores_df() -> pd.DataFrame:
"""Hent butikkdata som DataFrame"""
stores = api.get_paginated('stores')
df = pd.json_normalize(stores)
if not df.empty:
# Konverter koordinater
if 'latitude' in df.columns:
df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce')
if 'longitude' in df.columns:
df['longitude'] = pd.to_numeric(df['longitude'], errors='coerce')
return df
stores_df = get_stores_df()
print(f"Hentet {len(stores_df)} butikker")
Combined Datasets
python
def get_complete_dataset(days_back: int = 30) -> pd.DataFrame:
"""Hent komplett datasett med produkter, priser og butikker"""
# Hent alle datasett
products = get_products_df()
stores = get_stores_df()
date_from = (datetime.now() - timedelta(days=days_back)).strftime('%Y-%m-%d')
prices = get_prices_df(date_from=date_from)
if prices.empty:
print("Ingen prisdata funnet")
return pd.DataFrame()
# Koble sammen datasett
df = prices.merge(products, left_on='product_id', right_on='id',
how='left', suffixes=('', '_product'))
df = df.merge(stores, left_on='store_id', right_on='id',
how='left', suffixes=('', '_store'))
# Rens kolonner
df = df.drop(columns=[col for col in df.columns if col.endswith('_product') or col.endswith('_store')])
return df
# Hent komplett datasett
complete_df = get_complete_dataset(days_back=90)
print(f"Komplett datasett: {len(complete_df)} rader, {len(complete_df.columns)} kolonner")
Data Processing
DataFrame Operations
python
class DataProcessor:
def __init__(self, df: pd.DataFrame):
self.df = df.copy()
def clean_data(self) -> 'DataProcessor':
"""Grunnleggende datarensing"""
# Fjern duplikater
self.df = self.df.drop_duplicates()
# Håndter manglende verdier
numeric_columns = self.df.select_dtypes(include=[np.number]).columns
self.df[numeric_columns] = self.df[numeric_columns].fillna(0)
# Fjern outliers (IQR metode)
if 'price' in self.df.columns:
Q1 = self.df['price'].quantile(0.25)
Q3 = self.df['price'].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
self.df = self.df[(self.df['price'] >= lower) & (self.df['price'] <= upper)]
return self
def add_price_features(self) -> 'DataProcessor':
"""Legg til prisrelaterte features"""
if 'price' not in self.df.columns:
return self
# Pris per kilo/liter hvis weight/volume finnes
if 'weight' in self.df.columns:
self.df['price_per_kg'] = self.df['price'] / (self.df['weight'] / 1000)
if 'volume' in self.df.columns:
self.df['price_per_liter'] = self.df['price'] / (self.df['volume'] / 1000)
# Prisklasser
self.df['price_category'] = pd.cut(self.df['price'],
bins=[0, 20, 50, 100, float('inf')],
labels=['Lav', 'Medium', 'Høy', 'Meget høy'])
return self
def add_time_features(self, date_column: str = 'date') -> 'DataProcessor':
"""Legg til tidsrelaterte features"""
if date_column not in self.df.columns:
return self
self.df['year'] = self.df[date_column].dt.year
self.df['month'] = self.df[date_column].dt.month
self.df['week'] = self.df[date_column].dt.isocalendar().week
self.df['day_of_week'] = self.df[date_column].dt.dayofweek
self.df['is_weekend'] = self.df['day_of_week'].isin([5, 6])
# Sesong
def get_season(month):
if month in [12, 1, 2]:
return 'Vinter'
elif month in [3, 4, 5]:
return 'Vår'
elif month in [6, 7, 8]:
return 'Sommer'
else:
return 'Høst'
self.df['season'] = self.df['month'].apply(get_season)
return self
def get_df(self) -> pd.DataFrame:
"""Returner prosessert DataFrame"""
return self.df
# Bruk DataProcessor
processor = DataProcessor(complete_df)
processed_df = (processor
.clean_data()
.add_price_features()
.add_time_features()
.get_df())
print("Prosessert data:")
print(processed_df.info())
Aggregation and Grouping
python
def create_summary_stats(df: pd.DataFrame) -> dict:
"""Lag sammendragsstatistikk"""
stats = {}
if 'price' in df.columns:
stats['price_stats'] = {
'mean': df['price'].mean(),
'median': df['price'].median(),
'std': df['price'].std(),
'min': df['price'].min(),
'max': df['price'].max(),
'count': len(df)
}
if 'store_name' in df.columns:
stats['store_stats'] = df['store_name'].value_counts().to_dict()
if 'category_name' in df.columns:
stats['category_stats'] = df['category_name'].value_counts().to_dict()
return stats
# Lag sammendrag
summary = create_summary_stats(processed_df)
print("Prisstatistikk:", summary.get('price_stats', {}))
Price Analysis
Price Trends Over Time
python
def analyze_price_trends(df: pd.DataFrame, product_name: str = None) -> pd.DataFrame:
"""Analyser prisutvikling"""
if product_name:
df = df[df['name'].str.contains(product_name, case=False, na=False)]
# Grupperinger per dag
daily_prices = df.groupby('date').agg({
'price': ['mean', 'median', 'min', 'max', 'count']
}).reset_index()
daily_prices.columns = ['date', 'avg_price', 'median_price', 'min_price', 'max_price', 'count']
# Beregn endringer
daily_prices['price_change'] = daily_prices['avg_price'].pct_change()
daily_prices['price_change_abs'] = daily_prices['avg_price'].diff()
# Glattede trends (7-dagers moving average)
daily_prices['avg_price_ma7'] = daily_prices['avg_price'].rolling(window=7).mean()
return daily_prices
# Analyser melkpriser
milk_trends = analyze_price_trends(processed_df, "melk")
print("Melkpris trends:")
print(milk_trends.tail())
Store Comparison
python
def compare_stores(df: pd.DataFrame, category: str = None) -> pd.DataFrame:
"""Sammenlign priser mellom butikker"""
if category:
df = df[df['category_name'].str.contains(category, case=False, na=False)]
store_comparison = df.groupby(['store_name', 'store_chain']).agg({
'price': ['mean', 'median', 'count'],
'name': 'nunique' # Antall unike produkter
}).reset_index()
store_comparison.columns = ['store_name', 'store_chain', 'avg_price',
'median_price', 'price_count', 'unique_products']
# Beregn prisindeks (relativ til billigste butikk)
min_price = store_comparison['avg_price'].min()
store_comparison['price_index'] = (store_comparison['avg_price'] / min_price) * 100
# Sorter etter gjennomsnittspris
store_comparison = store_comparison.sort_values('avg_price')
return store_comparison
# Sammenlign butikker
store_comparison = compare_stores(processed_df)
print("Butikksammenligning:")
print(store_comparison.head(10))
Seasonal Analysis
python
def seasonal_analysis(df: pd.DataFrame) -> pd.DataFrame:
"""Analyser sesongvariasjoner"""
seasonal_stats = df.groupby(['season', 'month']).agg({
'price': ['mean', 'std', 'count']
}).reset_index()
seasonal_stats.columns = ['season', 'month', 'avg_price', 'price_std', 'count']
# Beregn sesongindekser
overall_avg = df['price'].mean()
seasonal_stats['seasonal_index'] = (seasonal_stats['avg_price'] / overall_avg) * 100
return seasonal_stats
seasonal_data = seasonal_analysis(processed_df)
print("Sesonganalyse:")
print(seasonal_data)
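A bar chart of the seasonal index per month gives a quick visual check of the pattern:
python
fig = px.bar(seasonal_data, x='month', y='seasonal_index', color='season',
             title='Seasonal price index per month (100 = overall average)')
fig.show()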
Visualization
Price Trend Charts
python
def plot_price_trends(df: pd.DataFrame, title: str = "Prisutvikling"):
"""Plot prisutvikling over tid"""
fig = make_subplots(
rows=2, cols=1,
subplot_titles=('Prisutvikling', 'Prisendringer (%)'),
vertical_spacing=0.1
)
# Hovedpris plot
fig.add_trace(
go.Scatter(x=df['date'], y=df['avg_price'],
name='Gjennomsnitt', line=dict(color='blue')),
row=1, col=1
)
fig.add_trace(
go.Scatter(x=df['date'], y=df['avg_price_ma7'],
name='7-dagers glidende snitt', line=dict(color='red')),
row=1, col=1
)
# Prisendringer
colors = ['green' if x >= 0 else 'red' for x in df['price_change']]
fig.add_trace(
go.Bar(x=df['date'], y=df['price_change'] * 100,
name='Prisendring (%)', marker_color=colors),
row=2, col=1
)
fig.update_layout(
title=title,
height=600,
showlegend=True
)
fig.update_xaxes(title_text="Dato")
fig.update_yaxes(title_text="Pris (NOK)", row=1, col=1)
fig.update_yaxes(title_text="Endring (%)", row=2, col=1)
return fig
# Plot melkpris trends
if not milk_trends.empty:
fig = plot_price_trends(milk_trends, "Melkpris Utvikling")
fig.show()
Store Comparison Visualization
python
def plot_store_comparison(df: pd.DataFrame, top_n: int = 15):
"""Visualiser butikksammenligning"""
top_stores = df.head(top_n)
fig = make_subplots(
rows=1, cols=2,
subplot_titles=('Gjennomsnittspris per butikk', 'Prisindeks'),
specs=[[{"secondary_y": False}, {"secondary_y": False}]]
)
# Gjennomsnittspris
fig.add_trace(
go.Bar(x=top_stores['avg_price'],
y=top_stores['store_name'],
orientation='h',
name='Gjennomsnittspris',
marker_color='lightblue'),
row=1, col=1
)
# Prisindeks
colors = ['red' if x > 100 else 'green' for x in top_stores['price_index']]
fig.add_trace(
go.Bar(x=top_stores['price_index'],
y=top_stores['store_name'],
orientation='h',
name='Prisindeks',
marker_color=colors),
row=1, col=2
)
fig.update_layout(
title="Butikksammenligning - Prisanalyse",
height=600,
showlegend=False
)
fig.update_xaxes(title_text="Pris (NOK)", row=1, col=1)
fig.update_xaxes(title_text="Prisindeks (100 = billigst)", row=1, col=2)
return fig
# Plot butikksammenligning
if not store_comparison.empty:
fig = plot_store_comparison(store_comparison)
fig.show()
Interactive Dashboards
python
def create_interactive_dashboard(df: pd.DataFrame):
"""Lag interaktivt dashboard"""
# Prisfordeling histogram
fig1 = px.histogram(df, x='price', nbins=50,
title='Prisfordeling',
labels={'price': 'Pris (NOK)', 'count': 'Antall'})
# Pris per kategori boxplot
fig2 = px.box(df, x='category_name', y='price',
title='Prisfordeling per kategori')
fig2.update_xaxes(tickangle=45)
# Prisutvikling scatter
daily_avg = df.groupby('date')['price'].mean().reset_index()
fig3 = px.scatter(daily_avg, x='date', y='price',
title='Gjennomsnittspris over tid',
trendline='lowess')
# Vis alle grafer
fig1.show()
fig2.show()
fig3.show()
# Lag dashboard
create_interactive_dashboard(processed_df)
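Plotly figures can also be exported as standalone HTML files for sharing outside the notebook; the file name below is just an example:
python
daily_avg = processed_df.groupby('date')['price'].mean().reset_index()
fig = px.line(daily_avg, x='date', y='price', title='Average price over time')
fig.write_html("price_trend.html", include_plotlyjs='cdn')  # small file, loads plotly.js from a CDN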
Store Comparison
Geographic Analysis
python
def geographic_analysis(df: pd.DataFrame) -> pd.DataFrame:
"""Analyser prisvariasjoner geografisk"""
if 'latitude' not in df.columns or 'longitude' not in df.columns:
print("Mangler geografiske koordinater")
return pd.DataFrame()
# Grupperinger per butikk med koordinater
geo_analysis = df.groupby(['store_name', 'latitude', 'longitude']).agg({
'price': ['mean', 'count']
}).reset_index()
geo_analysis.columns = ['store_name', 'latitude', 'longitude', 'avg_price', 'count']
# Fjern butikker med få priser
geo_analysis = geo_analysis[geo_analysis['count'] >= 10]
return geo_analysis
def plot_geographic_prices(geo_df: pd.DataFrame):
"""Plot geografisk prisfordeling"""
fig = px.scatter_mapbox(
geo_df,
lat="latitude",
lon="longitude",
color="avg_price",
size="count",
hover_data=['store_name', 'avg_price', 'count'],
color_continuous_scale="RdYlBu_r",
title="Geografisk Prisfordeling",
mapbox_style="open-street-map",
height=600
)
fig.update_layout(
mapbox=dict(
center=dict(lat=60.0, lon=10.0), # Norge sentrum
zoom=5
)
)
return fig
# Geografisk analyse
geo_data = geographic_analysis(processed_df)
if not geo_data.empty:
fig = plot_geographic_prices(geo_data)
fig.show()
Competition Analysis
python
def competition_analysis(df: pd.DataFrame) -> dict:
"""Analyser konkurranse mellom butikkjeder"""
chain_stats = df.groupby('store_chain').agg({
'price': ['mean', 'std', 'min', 'max'],
'store_name': 'nunique',
'name': 'nunique'
}).reset_index()
chain_stats.columns = ['store_chain', 'avg_price', 'price_std', 'min_price',
'max_price', 'store_count', 'product_count']
# Beregn markedsandel (basert på antall priser)
total_prices = len(df)
chain_market_share = df['store_chain'].value_counts()
chain_stats['market_share'] = chain_stats['store_chain'].map(
lambda x: (chain_market_share.get(x, 0) / total_prices) * 100
)
# Konkurranseindeks (lavere pris + høy markedsandel = bedre)
chain_stats['competition_score'] = (
(1 / chain_stats['avg_price']) * chain_stats['market_share']
)
return {
'chain_stats': chain_stats.sort_values('avg_price'),
'market_share': chain_market_share
}
competition = competition_analysis(processed_df)
print("Butikkjede analyse:")
print(competition['chain_stats'])
Time Series Analysis
Forecasting with Prophet
python
# Installer prophet: pip install prophet
from prophet import Prophet
from prophet.plot import plot_plotly, plot_components_plotly
def forecast_prices(df: pd.DataFrame, product_name: str, days_ahead: int = 30) -> dict:
"""Lag prisforecast med Prophet"""
# Filtrer data for spesifikt produkt
product_df = df[df['name'].str.contains(product_name, case=False, na=False)]
if product_df.empty:
return {"error": f"Ingen data funnet for {product_name}"}
# Forbered data for Prophet
prophet_df = product_df.groupby('date')['price'].mean().reset_index()
prophet_df.columns = ['ds', 'y'] # Prophet krever disse kolonnenavnene
# Tren modell
model = Prophet(
daily_seasonality=False,
weekly_seasonality=True,
yearly_seasonality=True,
changepoint_prior_scale=0.05
)
model.fit(prophet_df)
# Lag forecast
future = model.make_future_dataframe(periods=days_ahead)
forecast = model.predict(future)
return {
'model': model,
'forecast': forecast,
'historical': prophet_df
}
# Eksempel: Forecast melkpriser
milk_forecast = forecast_prices(processed_df, "melk", days_ahead=60)
if 'error' not in milk_forecast:
# Plot forecast
fig1 = plot_plotly(milk_forecast['model'], milk_forecast['forecast'])
fig1.update_layout(title="Melkpris Forecast")
fig1.show()
# Plot komponenter
fig2 = plot_components_plotly(milk_forecast['model'], milk_forecast['forecast'])
fig2.show()
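To judge how trustworthy the forecast is, Prophet ships with rolling-origin cross-validation. A minimal sketch, assuming the fitted model from `forecast_prices` and enough history for the chosen windows:
python
from prophet.diagnostics import cross_validation, performance_metrics

if 'error' not in milk_forecast:
    cv_results = cross_validation(
        milk_forecast['model'],
        initial='60 days',   # length of the first training window
        period='15 days',    # spacing between cutoff dates
        horizon='30 days'    # forecast horizon evaluated at each cutoff
    )
    print(performance_metrics(cv_results)[['horizon', 'mae', 'rmse']].head())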
Seasonal Decomposition
python
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.pyplot as plt
def decompose_time_series(df: pd.DataFrame, product_name: str):
"""Dekomponerer tidsserier i trend, sesong og residualer"""
# Filtrer og forbered data
product_df = df[df['name'].str.contains(product_name, case=False, na=False)]
daily_prices = product_df.groupby('date')['price'].mean()
# Resample til ukentlige data for bedre sesongmønster
    weekly_prices = daily_prices.resample('W').mean().ffill()
if len(weekly_prices) < 104: # Trenger minst 2 år med ukentlige data
print(f"For lite data for {product_name} (trenger minst 104 uker)")
return None
# Sesongdekomposisjon
decomposition = seasonal_decompose(weekly_prices,
model='additive',
period=52) # 52 uker = 1 år
# Plot
fig, axes = plt.subplots(4, 1, figsize=(15, 12))
decomposition.observed.plot(ax=axes[0], title=f'{product_name} - Original')
decomposition.trend.plot(ax=axes[1], title='Trend')
decomposition.seasonal.plot(ax=axes[2], title='Sesong')
decomposition.resid.plot(ax=axes[3], title='Residualer')
plt.tight_layout()
plt.show()
return decomposition
# Dekomponering av melkpriser
milk_decomp = decompose_time_series(processed_df, "melk")
Machine Learning
Price Forecasting with Machine Learning
python
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
class PricePredictionModel:
def __init__(self):
self.model = None
self.label_encoders = {}
self.feature_columns = []
def prepare_features(self, df: pd.DataFrame) -> pd.DataFrame:
"""Forbered features for modellering"""
features_df = df.copy()
# Encode kategoriske variabler
categorical_columns = ['store_chain', 'category_name', 'brand_name', 'season']
for col in categorical_columns:
if col in features_df.columns:
if col not in self.label_encoders:
self.label_encoders[col] = LabelEncoder()
features_df[col] = self.label_encoders[col].fit_transform(
features_df[col].astype(str)
)
else:
features_df[col] = self.label_encoders[col].transform(
features_df[col].astype(str)
)
# Velg numeriske features
numeric_features = ['year', 'month', 'week', 'day_of_week', 'is_weekend']
categorical_encoded = [col for col in categorical_columns if col in features_df.columns]
        # Keep only columns that actually exist so feature_importance matches the fitted model
        self.feature_columns = [col for col in numeric_features + categorical_encoded
                                if col in features_df.columns]
        return features_df[self.feature_columns]
def train(self, df: pd.DataFrame, target_column: str = 'price'):
"""Tren prisforecasting modell"""
# Forbered features
X = self.prepare_features(df)
y = df[target_column]
# Split data
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
# Grid search for beste parametre
param_grid = {
'n_estimators': [100, 200],
'max_depth': [10, 20, None],
'min_samples_split': [2, 5],
'min_samples_leaf': [1, 2]
}
rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='neg_mean_absolute_error')
grid_search.fit(X_train, y_train)
self.model = grid_search.best_estimator_
# Evaluering
y_pred = self.model.predict(X_test)
metrics = {
'mae': mean_absolute_error(y_test, y_pred),
'rmse': np.sqrt(mean_squared_error(y_test, y_pred)),
'r2': r2_score(y_test, y_pred),
'best_params': grid_search.best_params_
}
return metrics, X_test, y_test, y_pred
def predict(self, df: pd.DataFrame) -> np.ndarray:
"""Forutsi priser"""
if self.model is None:
raise ValueError("Modell må trenes først")
X = self.prepare_features(df)
return self.model.predict(X)
def feature_importance(self) -> pd.DataFrame:
"""Få feature importance"""
if self.model is None:
return pd.DataFrame()
importance_df = pd.DataFrame({
'feature': self.feature_columns,
'importance': self.model.feature_importances_
}).sort_values('importance', ascending=False)
return importance_df
# Tren modell
price_model = PricePredictionModel()
metrics, X_test, y_test, y_pred = price_model.train(processed_df)
print("Modell ytelse:")
print(f"MAE: {metrics['mae']:.2f} NOK")
print(f"RMSE: {metrics['rmse']:.2f} NOK")
print(f"R²: {metrics['r2']:.3f}")
print(f"Best params: {metrics['best_params']}")
# Feature importance
importance = price_model.feature_importance()
print("\nFeature Importance:")
print(importance)
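To reuse the trained model outside the notebook, the estimator and its encoders can be persisted with `joblib` (installed together with scikit-learn); the file name is just an example:
python
import joblib

# Persist the fitted estimator together with the encoders and feature list
joblib.dump(
    {'model': price_model.model,
     'label_encoders': price_model.label_encoders,
     'feature_columns': price_model.feature_columns},
    'price_model.joblib'
)

# Later: restore the bundle and rebuild the wrapper
saved = joblib.load('price_model.joblib')
restored = PricePredictionModel()
restored.model = saved['model']
restored.label_encoders = saved['label_encoders']
restored.feature_columns = saved['feature_columns']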
Anomaly Detection
python
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN
def detect_price_anomalies(df: pd.DataFrame) -> pd.DataFrame:
"""Finn unormale priser med Isolation Forest"""
# Forbered data
price_features = df[['price', 'year', 'month', 'day_of_week']].copy()
price_features = price_features.fillna(price_features.mean())
# Isolation Forest
iso_forest = IsolationForest(contamination=0.1, random_state=42)
anomalies = iso_forest.fit_predict(price_features)
# Legg til anomali flagg
df_with_anomalies = df.copy()
df_with_anomalies['is_anomaly'] = anomalies == -1
return df_with_anomalies
# Finn anomalier
df_anomalies = detect_price_anomalies(processed_df)
anomalies_found = df_anomalies[df_anomalies['is_anomaly']]
print(f"Fant {len(anomalies_found)} prisanomalier")
print("Eksempler på anomalier:")
print(anomalies_found[['name', 'store_name', 'price', 'date']].head())
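Plotting the flagged rows against the rest makes it easier to judge whether the anomalies are genuinely unusual prices or data errors:
python
fig = px.scatter(
    df_anomalies, x='date', y='price',
    color='is_anomaly',
    hover_data=['name', 'store_name'],
    title='Price observations with anomaly flag'
)
fig.show()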
Reports
Automated Reports
python
from datetime import datetime
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
class PriceReport:
def __init__(self, df: pd.DataFrame):
self.df = df
self.report_date = datetime.now().strftime('%Y-%m-%d')
def generate_summary_stats(self) -> dict:
"""Generer sammendragsstatistikk"""
return {
'total_products': self.df['name'].nunique(),
'total_prices': len(self.df),
'avg_price': self.df['price'].mean(),
'median_price': self.df['price'].median(),
'price_range': self.df['price'].max() - self.df['price'].min(),
'stores_count': self.df['store_name'].nunique(),
'categories_count': self.df['category_name'].nunique(),
'date_range': {
'from': self.df['date'].min().strftime('%Y-%m-%d'),
'to': self.df['date'].max().strftime('%Y-%m-%d')
}
}
def generate_markdown_report(self) -> str:
"""Generer Markdown rapport"""
stats = self.generate_summary_stats()
markdown_report = f"""
# Kassalapp Prisrapport
**Generert:** {self.report_date}
## Sammendrag
- **Antall produkter:** {stats['total_products']:,}
- **Antall priser:** {stats['total_prices']:,}
- **Gjennomsnittspris:** {stats['avg_price']:.2f} NOK
- **Medianpris:** {stats['median_price']:.2f} NOK
- **Prisområde:** {stats['price_range']:.2f} NOK
- **Antall butikker:** {stats['stores_count']}
- **Antall kategorier:** {stats['categories_count']}
- **Dataperiode:** {stats['date_range']['from']} til {stats['date_range']['to']}
## Top 10 Dyreste Produkter
"""
# Top dyre produkter
expensive_products = self.df.nlargest(10, 'price')[['name', 'store_name', 'price']]
for _, row in expensive_products.iterrows():
markdown_report += f"- **{row['name']}** - {row['store_name']} - {row['price']:.2f} NOK\n"
markdown_report += "\n## Top 10 Billigste Butikker (gjennomsnitt)\n"
# Billigste butikker
cheap_stores = self.df.groupby('store_name')['price'].mean().nsmallest(10)
for store, price in cheap_stores.items():
markdown_report += f"- **{store}** - {price:.2f} NOK\n"
return markdown_report
def save_report(self, filename: str = None):
"""Lagre rapport til fil"""
if filename is None:
filename = f"kassalapp_rapport_{self.report_date}.md"
report_content = self.generate_markdown_report()
with open(filename, 'w', encoding='utf-8') as f:
f.write(report_content)
print(f"Rapport lagret som {filename}")
return filename
# Generer rapport
reporter = PriceReport(processed_df)
report_file = reporter.save_report()
print(reporter.generate_markdown_report())
Excel Reports
python
def create_excel_report(df: pd.DataFrame, filename: str = None):
"""Lag Excel rapport med multiple sheets"""
if filename is None:
filename = f"kassalapp_rapport_{datetime.now().strftime('%Y%m%d')}.xlsx"
with pd.ExcelWriter(filename, engine='openpyxl') as writer:
# Summary sheet
summary_stats = {
'Metrikk': ['Antall produkter', 'Antall priser', 'Gjennomsnittspris',
'Medianpris', 'Antall butikker'],
'Verdi': [df['name'].nunique(), len(df), df['price'].mean(),
df['price'].median(), df['store_name'].nunique()]
}
pd.DataFrame(summary_stats).to_excel(writer, sheet_name='Sammendrag', index=False)
# Top produkter
expensive = df.nlargest(50, 'price')[['name', 'store_name', 'price', 'date']]
expensive.to_excel(writer, sheet_name='Dyreste_Produkter', index=False)
# Butikksammenligning
store_stats = df.groupby('store_name').agg({
'price': ['mean', 'median', 'count'],
'name': 'nunique'
}).reset_index()
store_stats.columns = ['Butikk', 'Gj.snitt', 'Median', 'Antall_Priser', 'Unike_Produkter']
store_stats.to_excel(writer, sheet_name='Butikker', index=False)
# Kategorier
category_stats = df.groupby('category_name').agg({
'price': ['mean', 'count']
}).reset_index()
category_stats.columns = ['Kategori', 'Gj.snitt_Pris', 'Antall']
category_stats.to_excel(writer, sheet_name='Kategorier', index=False)
print(f"Excel rapport lagret som {filename}")
# Lag Excel rapport
create_excel_report(processed_df)
Dashboard
Streamlit Dashboard
python
# Lagre som streamlit_app.py og kjør: streamlit run streamlit_app.py
import streamlit as st
import pandas as pd
import plotly.express as px
def create_streamlit_dashboard():
st.set_page_config(page_title="Kassalapp Analytics", layout="wide")
st.title("🛒 Kassalapp Prisanalyse Dashboard")
st.sidebar.title("Navigasjon")
# Last inn data (i produksjon, bruk caching)
@st.cache_data
def load_data():
# Her ville du lastet data fra API
return processed_df
df = load_data()
# Sidebar filtre
st.sidebar.header("Filtre")
selected_stores = st.sidebar.multiselect(
"Velg butikker:",
options=df['store_name'].unique(),
default=df['store_name'].unique()[:5]
)
selected_categories = st.sidebar.multiselect(
"Velg kategorier:",
options=df['category_name'].unique(),
default=df['category_name'].unique()[:3]
)
# Filtrer data
filtered_df = df[
(df['store_name'].isin(selected_stores)) &
(df['category_name'].isin(selected_categories))
]
# Hovedinnhold
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric("Antall produkter", filtered_df['name'].nunique())
with col2:
st.metric("Gjennomsnittspris", f"{filtered_df['price'].mean():.2f} NOK")
with col3:
st.metric("Antall butikker", filtered_df['store_name'].nunique())
with col4:
st.metric("Totalt antall priser", len(filtered_df))
# Visualiseringer
col1, col2 = st.columns(2)
with col1:
st.subheader("Prisfordeling")
fig_hist = px.histogram(filtered_df, x='price', nbins=30)
st.plotly_chart(fig_hist, use_container_width=True)
with col2:
st.subheader("Gjennomsnittspris per butikk")
store_avg = filtered_df.groupby('store_name')['price'].mean().sort_values()
fig_bar = px.bar(x=store_avg.values, y=store_avg.index, orientation='h')
st.plotly_chart(fig_bar, use_container_width=True)
# Tidsserier
st.subheader("Prisutvikling over tid")
daily_avg = filtered_df.groupby('date')['price'].mean().reset_index()
fig_line = px.line(daily_avg, x='date', y='price')
st.plotly_chart(fig_line, use_container_width=True)
# Datatabell
st.subheader("Rå data")
st.dataframe(filtered_df.head(1000))
# To run the dashboard, call create_streamlit_dashboard() at module level,
# save the file as streamlit_app.py, and run:
# streamlit run streamlit_app.py
Dash Dashboard
python
import dash
from dash import dcc, html, Input, Output
import plotly.express as px
def create_dash_dashboard():
app = dash.Dash(__name__)
app.layout = html.Div([
html.H1("Kassalapp Analytics Dashboard",
style={'text-align': 'center'}),
html.Div([
html.Div([
html.Label("Velg butikkjeder:"),
dcc.Dropdown(
id='store-dropdown',
options=[{'label': store, 'value': store}
for store in processed_df['store_chain'].unique()],
value=processed_df['store_chain'].unique()[:3],
multi=True
)
], className='six columns'),
html.Div([
html.Label("Velg tidsperiode:"),
dcc.DatePickerRange(
id='date-picker',
start_date=processed_df['date'].min(),
end_date=processed_df['date'].max()
)
], className='six columns'),
], className='row'),
html.Div([
dcc.Graph(id='price-trend-graph')
]),
html.Div([
html.Div([
dcc.Graph(id='store-comparison')
], className='six columns'),
html.Div([
dcc.Graph(id='category-analysis')
], className='six columns'),
], className='row')
])
@app.callback(
[Output('price-trend-graph', 'figure'),
Output('store-comparison', 'figure'),
Output('category-analysis', 'figure')],
[Input('store-dropdown', 'value'),
Input('date-picker', 'start_date'),
Input('date-picker', 'end_date')]
)
def update_graphs(selected_stores, start_date, end_date):
filtered_df = processed_df[
(processed_df['store_chain'].isin(selected_stores)) &
(processed_df['date'] >= start_date) &
(processed_df['date'] <= end_date)
]
# Prisutvikling
daily_prices = filtered_df.groupby('date')['price'].mean().reset_index()
trend_fig = px.line(daily_prices, x='date', y='price',
title='Prisutvikling over tid')
# Butikksammenligning
store_avg = filtered_df.groupby('store_name')['price'].mean().sort_values()
store_fig = px.bar(x=store_avg.index, y=store_avg.values,
title='Gjennomsnittspris per butikk')
# Kategorianalyse
cat_avg = filtered_df.groupby('category_name')['price'].mean().sort_values()
cat_fig = px.bar(x=cat_avg.values, y=cat_avg.index, orientation='h',
title='Gjennomsnittspris per kategori')
return trend_fig, store_fig, cat_fig
return app
# Kjør dashboard
# app = create_dash_dashboard()
# app.run(debug=True)
Practical Examples
Case 1: Milk Price Analysis
python
def melkpris_case_study(df: pd.DataFrame):
"""Komplett case study av melkpriser"""
print("=== MELKPRIS ANALYSE ===\n")
# 1. Filtrer melkprodukter
melk_df = df[df['name'].str.contains('melk', case=False, na=False)]
print(f"Fant {len(melk_df)} melkpriser")
# 2. Grunnleggende statistikk
print(f"Gjennomsnittspris: {melk_df['price'].mean():.2f} NOK")
print(f"Medianpris: {melk_df['price'].median():.2f} NOK")
print(f"Prisområde: {melk_df['price'].min():.2f} - {melk_df['price'].max():.2f} NOK\n")
# 3. Beste tilbud
print("=== BESTE MELKTILBUD ===")
best_deals = melk_df.nsmallest(10, 'price')[['name', 'store_name', 'price']]
for _, row in best_deals.iterrows():
print(f"{row['name'][:30]:30} - {row['store_name']:15} - {row['price']:6.2f} NOK")
# 4. Butikksammenligning
print("\n=== BUTIKKSAMMENLIGNING MELK ===")
store_melk = melk_df.groupby('store_name')['price'].agg(['mean', 'count']).sort_values('mean')
store_melk = store_melk[store_melk['count'] >= 5] # Kun butikker med minst 5 melkepriser
for store, data in store_melk.head(10).iterrows():
print(f"{store:20} - Gj.snitt: {data['mean']:6.2f} NOK ({data['count']} produkter)")
# 5. Tidsutvikling
print("\n=== PRISUTVIKLING ===")
daily_melk = melk_df.groupby('date')['price'].mean()
weekly_change = daily_melk.pct_change(periods=7).iloc[-1] * 100
monthly_change = daily_melk.pct_change(periods=30).iloc[-1] * 100
print(f"Endring siste uke: {weekly_change:+.1f}%")
print(f"Endring siste måned: {monthly_change:+.1f}%")
return melk_df
melk_analysis = melkpris_case_study(processed_df)
Case 2: Seasonal Variation in Fruit and Vegetables
python
def seasonal_produce_analysis(df: pd.DataFrame):
"""Analyser sesongvariasjoner i frukt og grønnsaker"""
# Frukt og grønt kategorier
produce_categories = ['Frukt', 'Grønnsaker', 'Poteter', 'Bær']
produce_df = df[df['category_name'].isin(produce_categories)]
if produce_df.empty:
print("Ingen frukt/grønt data funnet")
return
print("=== SESONGANALYSE FRUKT/GRØNT ===\n")
# Månedlig prisutvikling per kategori
monthly_prices = produce_df.groupby(['month', 'category_name'])['price'].mean().unstack()
# Finn mest sesongavhengige produkter
seasonal_variation = {}
for category in produce_categories:
if category in monthly_prices.columns:
prices = monthly_prices[category].dropna()
if len(prices) > 0:
cv = prices.std() / prices.mean() # Coefficient of variation
seasonal_variation[category] = cv
print("Sesongvariasjon (høyere = mer sesongavhengig):")
for category, variation in sorted(seasonal_variation.items(),
key=lambda x: x[1], reverse=True):
print(f"{category:15} - CV: {variation:.3f}")
# Plot sesongmønster
if not monthly_prices.empty:
fig = px.line(monthly_prices.reset_index(),
x='month', y=monthly_prices.columns,
title='Sesongvariasjoner Frukt/Grønt')
fig.show()
seasonal_produce_analysis(processed_df)
Case 3: Store Chain Loyalty Program Analysis
python
def loyalty_program_analysis(df: pd.DataFrame):
"""Analyser effekt av loyalitetsprogram på priser"""
# Definere butikker med kjente loyalitetsprogrammer
loyalty_stores = {
'REMA 1000': 'Æ',
'Coop': 'Coop Medlemspris',
'Kiwi': 'Kiwi Pluss',
'Meny': 'Meny Pluss'
}
loyalty_analysis = {}
for store_chain, program in loyalty_stores.items():
chain_df = df[df['store_chain'] == store_chain]
if chain_df.empty:
continue
# Sammenlign med andre butikker
other_stores = df[df['store_chain'] != store_chain]
if not other_stores.empty:
chain_avg = chain_df['price'].mean()
others_avg = other_stores['price'].mean()
price_difference = ((chain_avg - others_avg) / others_avg) * 100
loyalty_analysis[store_chain] = {
'program': program,
'avg_price': chain_avg,
'price_vs_competitors': price_difference,
'product_count': len(chain_df)
}
print("=== LOYALITETSPROGRAM ANALYSE ===\n")
for chain, data in loyalty_analysis.items():
print(f"{chain} ({data['program']}):")
print(f" Gjennomsnittspris: {data['avg_price']:.2f} NOK")
print(f" vs Konkurrenter: {data['price_vs_competitors']:+.1f}%")
print(f" Antall produkter: {data['product_count']}")
print()
loyalty_program_analysis(processed_df)
Case 4: Inflation Analysis
python
def inflation_analysis(df: pd.DataFrame):
"""Analyser matvareinflasjon basert på prisdata"""
# Beregn månedlig inflasjon
monthly_avg = df.groupby(df['date'].dt.to_period('M'))['price'].mean()
monthly_inflation = monthly_avg.pct_change() * 100
# Årlig inflasjon (12 måneders rullerende)
annual_inflation = monthly_avg.pct_change(periods=12) * 100
print("=== MATVAREINFLASJON ANALYSE ===\n")
print("Siste 6 måneders inflasjon:")
for period, inflation in monthly_inflation.tail(6).items():
print(f"{period}: {inflation:+.1f}%")
print(f"\nÅrlig inflasjon (siste 12 mnd): {annual_inflation.iloc[-1]:+.1f}%")
# Kategorivis inflasjon
    category_monthly = df.groupby(['category_name', df['date'].dt.to_period('M')])['price'].mean()
    category_inflation = (
        category_monthly.groupby(level='category_name').pct_change(periods=12)
        .groupby(level='category_name').last() * 100
    )
print("\nInflasjon per kategori (årlig):")
for category, inflation in category_inflation.sort_values(ascending=False).head(10).items():
print(f"{category:20}: {inflation:+.1f}%")
inflation_analysis(processed_df)
Case 5: Competition Analysis
python
def competition_market_analysis(df: pd.DataFrame):
"""Detaljert konkurranseanalyse av matvaremarkedet"""
print("=== KONKURRANSEANALYSE ===\n")
# 1. Markedsandeler (basert på antall produktlistinger)
market_share = df['store_chain'].value_counts()
total_listings = len(df)
print("Markedsandeler (basert på produktlistinger):")
for chain, count in market_share.head(10).items():
percentage = (count / total_listings) * 100
print(f"{chain:15}: {percentage:5.1f}% ({count:,} produkter)")
# 2. Prisposisjonering
print("\n=== PRISPOSISJONERING ===")
chain_pricing = df.groupby('store_chain')['price'].agg(['mean', 'median', 'std']).sort_values('mean')
for chain, data in chain_pricing.head(10).iterrows():
print(f"{chain:15}: Gj.snitt {data['mean']:6.2f} NOK, Median {data['median']:6.2f} NOK")
# 3. Priskonkurranse intensitet (prisspredning)
print("\n=== KONKURRANSEINTENSITET ===")
# Høy standardavvik = høy konkurranse
competition_intensity = chain_pricing['std'].sort_values(ascending=False)
for chain, std in competition_intensity.head(5).items():
avg_price = chain_pricing.loc[chain, 'mean']
cv = std / avg_price # Coefficient of variation
print(f"{chain:15}: Prisspredning {std:5.2f} NOK (CV: {cv:.2f})")
# 4. Premium vs Budget posisjonering
overall_median = df['price'].median()
premium_chains = chain_pricing[chain_pricing['median'] > overall_median * 1.1]
budget_chains = chain_pricing[chain_pricing['median'] < overall_median * 0.9]
print(f"\nPremium butikker (>10% over median {overall_median:.2f} NOK):")
for chain in premium_chains.index[:5]:
print(f" {chain}")
print(f"\nBudget butikker (<10% under median):")
for chain in budget_chains.index[:5]:
print(f" {chain}")
competition_market_analysis(processed_df)
Advanced Analysis Techniques
Price Correlation and Market Segments
python
def market_segmentation_analysis(df: pd.DataFrame):
"""Avansert markedssegmenteringsanalyse"""
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# Forbered data for clustering
store_features = df.groupby('store_name').agg({
'price': ['mean', 'std', 'min', 'max'],
'name': 'nunique',
'category_name': 'nunique'
}).reset_index()
store_features.columns = ['store_name', 'avg_price', 'price_std', 'min_price',
'max_price', 'product_count', 'category_count']
# Standardiser features
features = ['avg_price', 'price_std', 'product_count', 'category_count']
X = store_features[features].fillna(0)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# K-means clustering
kmeans = KMeans(n_clusters=4, random_state=42)
store_features['segment'] = kmeans.fit_predict(X_scaled)
# Analyser segmenter
segment_analysis = store_features.groupby('segment').agg({
'avg_price': ['mean', 'std'],
'product_count': 'mean',
'category_count': 'mean',
'store_name': 'count'
}).round(2)
print("=== MARKEDSSEGMENTER ===")
print(segment_analysis)
# Plot segmenter
fig = px.scatter(store_features,
x='avg_price',
y='product_count',
color='segment',
hover_data=['store_name'],
title='Butikksegmentering')
fig.show()
return store_features
segmentation = market_segmentation_analysis(processed_df)
This document provides a complete framework for data analysis of Norwegian grocery prices using the Kassalapp API. Each section contains practical examples that can be adapted to specific analysis needs.
To get started:
- Install the required packages
- Obtain API access from Kassalapp
- Run the example code in Jupyter Notebook
- Adapt the analyses to your own needs
With these tools you can do everything from simple price comparisons to advanced forecasting models and market analyses.