import numpy as np # Import von NumPy
import pandas as pd # Import von Pandas
import plotly.express as px # Import von Plotly

# Two sample points; a straight line through them is determined exactly.
xy = pd.DataFrame({"x": [20, 80], "y": [2, 3]})
px.scatter(xy, x="x", y="y")

# Slope = rise over run between the two points; the intercept follows from
# inserting the first point into y = beta_0 + beta_1 * x.
dx = xy.loc[1, "x"] - xy.loc[0, "x"]
dy = xy.loc[1, "y"] - xy.loc[0, "y"]
beta_1 = dy / dx
beta_0 = xy.loc[0, "y"] - beta_1 * xy.loc[0, "x"]
beta_0, beta_1

(1.6666666666666667, 0.016666666666666666)

# Evaluate the fitted line on x = 0..99 and tag every point by whether it lies
# inside the range covered by the two original points (20..80, inclusive).
xy = pd.DataFrame({"x": range(100)})
xy["y"] = beta_0 + beta_1 * xy["x"]
inside_support = xy["x"].between(20, 80)
xy["type"] = np.where(inside_support, "Interpolation", "Extrapolation")

px.line(xy, x="x", y="y", color="type", markers=True)

# Load the energy/weather table; the first column is parsed as datetime.
egywth = pd.read_csv(
    "../data/UROS/Energy1D_weather_clean.csv",
    parse_dates=[0],
)
egywth.head()

# Scatter of ES_Lab against TMK; kept in `fig` so that later cells can add
# further traces to the same figure.
fig = px.scatter(data_frame=egywth, x="TMK", y="ES_Lab", opacity=0.5)
fig

# Closed-form ordinary least squares for one predictor:
#   beta_1 = sum((x - mean(x)) * (y - mean(y))) / sum((x - mean(x))^2)
#   beta_0 = mean(y) - beta_1 * mean(x)
# Only rows in which both TMK and ES_Lab are present are used. Without this
# filter pandas skips NaN separately in every mean()/sum(), so the means, the
# numerator and the denominator would be computed on different sets of rows
# and the estimate would not be a least-squares fit of any single sample.
# NOTE: the recorded output of this cell was produced before the filter was
# added and therefore shows slightly different values.
ols_rows = egywth[["TMK", "ES_Lab"]].dropna()
tmk_centered = ols_rows.TMK - ols_rows.TMK.mean()
es_lab_centered = ols_rows.ES_Lab - ols_rows.ES_Lab.mean()

beta_1 = (tmk_centered * es_lab_centered).sum() / tmk_centered.pow(2).sum()

beta_0 = ols_rows.ES_Lab.mean() - beta_1 * ols_rows.TMK.mean()

beta_0, beta_1

(7.95973736889691, 3.1496156006907596)

# Take the TMK values once more, sorted ascending, as the x grid of the line.
pred_x = np.sort(egywth["TMK"].to_numpy())

# Evaluate the manually fitted line on that grid.
pred_y = beta_1 * pred_x + beta_0

# Overlay the line on the existing scatter figure.
fig.add_scatter(x=pred_x, y=pred_y, name="manual")
fig

# For comparison: let plotly draw its own OLS trendline.
px.scatter(data_frame=egywth, x="TMK", y="ES_Lab", opacity=0.5, trendline="ols")

from sklearn.linear_model import LinearRegression

sklm = LinearRegression() # This only creates an instance of the model class

X = egywth.TMK.values.reshape(-1, 1) # Build a (one-column) matrix of the input data
Y = egywth.ES_Lab.values             # Build a vector of the target data
# NOTE: on this data the call below raises
# "ValueError: Input y contains NaN." because ES_Lab still has missing values.
sklm.fit(X, Y)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[11], line 3
      1 X = egywth.TMK.values.reshape(-1, 1) # Wir erstellen eine Matrix der Eingangsdaten
      2 Y = egywth.ES_Lab.values             # Wir erstellen einen Vektor der Zieldaten
----> 3 sklm.fit(X, Y)

File /opt/miniconda3/envs/lehre4/lib/python3.12/site-packages/sklearn/base.py:1389, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   1382     estimator._validate_params()
   1384 with config_context(
   1385     skip_parameter_validation=(
   1386         prefer_skip_nested_validation or global_skip_validation
   1387     )
   1388 ):
-> 1389     return fit_method(estimator, *args, **kwargs)

File /opt/miniconda3/envs/lehre4/lib/python3.12/site-packages/sklearn/linear_model/_base.py:601, in LinearRegression.fit(self, X, y, sample_weight)
    597 n_jobs_ = self.n_jobs
    599 accept_sparse = False if self.positive else ["csr", "csc", "coo"]
--> 601 X, y = validate_data(
    602     self,
    603     X,
    604     y,
    605     accept_sparse=accept_sparse,
    606     y_numeric=True,
    607     multi_output=True,
    608     force_writeable=True,
    609 )
    611 has_sw = sample_weight is not None
    612 if has_sw:

File /opt/miniconda3/envs/lehre4/lib/python3.12/site-packages/sklearn/utils/validation.py:2961, in validate_data(_estimator, X, y, reset, validate_separately, skip_check_array, **check_params)
   2959         y = check_array(y, input_name="y", **check_y_params)
   2960     else:
-> 2961         X, y = check_X_y(X, y, **check_params)
   2962     out = X, y
   2964 if not no_val_X and check_params.get("ensure_2d", True):

File /opt/miniconda3/envs/lehre4/lib/python3.12/site-packages/sklearn/utils/validation.py:1387, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
   1368 ensure_all_finite = _deprecate_force_all_finite(force_all_finite, ensure_all_finite)
   1370 X = check_array(
   1371     X,
   1372     accept_sparse=accept_sparse,
   (...)
   1384     input_name="X",
   1385 )
-> 1387 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
   1389 check_consistent_length(X, y)
   1391 return X, y

File /opt/miniconda3/envs/lehre4/lib/python3.12/site-packages/sklearn/utils/validation.py:1397, in _check_y(y, multi_output, y_numeric, estimator)
   1395 """Isolated part of check_X_y dedicated to y validation"""
   1396 if multi_output:
-> 1397     y = check_array(
   1398         y,
   1399         accept_sparse="csr",
   1400         ensure_all_finite=True,
   1401         ensure_2d=False,
   1402         dtype=None,
   1403         input_name="y",
   1404         estimator=estimator,
   1405     )
   1406 else:
   1407     estimator_name = _check_estimator_name(estimator)

File /opt/miniconda3/envs/lehre4/lib/python3.12/site-packages/sklearn/utils/validation.py:1107, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_all_finite, ensure_non_negative, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
   1101     raise ValueError(
   1102         "Found array with dim %d. %s expected <= 2."
   1103         % (array.ndim, estimator_name)
   1104     )
   1106 if ensure_all_finite:
-> 1107     _assert_all_finite(
   1108         array,
   1109         input_name=input_name,
   1110         estimator_name=estimator_name,
   1111         allow_nan=ensure_all_finite == "allow-nan",
   1112     )
   1114 if copy:
   1115     if _is_numpy_namespace(xp):
   1116         # only make a copy if `array` and `array_orig` may share memory`

File /opt/miniconda3/envs/lehre4/lib/python3.12/site-packages/sklearn/utils/validation.py:120, in _assert_all_finite(X, allow_nan, msg_dtype, estimator_name, input_name)
    117 if first_pass_isfinite:
    118     return
--> 120 _assert_all_finite_element_wise(
    121     X,
    122     xp=xp,
    123     allow_nan=allow_nan,
    124     msg_dtype=msg_dtype,
    125     estimator_name=estimator_name,
    126     input_name=input_name,
    127 )

File /opt/miniconda3/envs/lehre4/lib/python3.12/site-packages/sklearn/utils/validation.py:169, in _assert_all_finite_element_wise(X, xp, allow_nan, msg_dtype, estimator_name, input_name)
    152 if estimator_name and input_name == "X" and has_nan_error:
    153     # Improve the error message on how to handle missing values in
    154     # scikit-learn.
    155     msg_err += (
    156         f"\n{estimator_name} does not accept missing values"
    157         " encoded as NaN natively. For supervised learning, you might want"
   (...)
    167         "#estimators-that-handle-nan-values"
    168     )
--> 169 raise ValueError(msg_err)

ValueError: Input y contains NaN.

# Keep only the two model columns, discard rows with a missing value in either
# of them, and order the remaining rows by TMK.
egywthNoNA = (
    egywth.loc[:, ["TMK", "ES_Lab"]]
    .dropna()
    .sort_values(by="TMK")
)

# Train the model on a one-column feature matrix and a 1-D target vector.
X = egywthNoNA["TMK"].to_numpy().reshape(-1, 1)
Y = egywthNoNA["ES_Lab"].to_numpy()
sklm.fit(X, Y)

LinearRegression()

LinearRegression()

# Fitted intercept and coefficient array of the scikit-learn model.
sklm.intercept_, sklm.coef_

(7.778394959065672, array([3.18532247]))

# Store the model's predictions for the training inputs next to the data ...
sklearn_pred = sklm.predict(X)
egywthNoNA["pred"] = sklearn_pred

# ... and overlay them on the existing figure as an additional trace.
fig.add_scatter(x=egywthNoNA["TMK"], y=egywthNoNA["pred"], name="sklearn")
fig

smf.ols(equation, data)

import statsmodels.formula.api as smf

# Describe the model with a formula: ES_Lab explained by TMK.
smfols = smf.ols(formula="ES_Lab ~ TMK", data=egywth)

# fit() returns the actual fitted model (the results object).
smflm = smfols.fit()

# Estimated intercept and slope.
smflm.params

Intercept    7.778395
TMK          3.185322
dtype: float64

smflm.summary()

# The formula interface looks predictors up by column name, so the sorted TMK
# grid is wrapped in a DataFrame with a "TMK" column before predicting.
pred_x_df = pd.DataFrame(pred_x, columns=["TMK"])
pred_y_smf = smflm.predict(exog=pred_x_df)

# Add the statsmodels line to the shared figure.
fig.add_scatter(x=pred_x, y=pred_y_smf, name="statsmodel")
fig

sklm_mv = LinearRegression()

# Predictor columns first, target (ES_Lab) last; rows with any missing value
# in these columns are dropped and the rest is ordered by TMK.
mv_columns = ["SDK", "NM", "VPM", "TMK", "ES_Lab"]
egywthNoNA_MV = egywth[mv_columns].dropna().sort_values(by="TMK")

# Train the model: every column except the last one is an input.
X_MV = egywthNoNA_MV.iloc[:, :-1].to_numpy()
Y_MV = egywthNoNA_MV["ES_Lab"].to_numpy()
sklm_mv.fit(X_MV, Y_MV)

LinearRegression()

LinearRegression()

# Predictions of the multivariate scikit-learn model, plotted against TMK.
egywthNoNA_MV["pred"] = sklm_mv.predict(X_MV)
fig.add_scatter(x=egywthNoNA_MV["TMK"], y=egywthNoNA_MV["pred"], name="sklearn MV")
fig

# The same multivariate model expressed as a statsmodels formula.
smfols_mv = smf.ols(formula="ES_Lab ~ TMK + SDK + NM + VPM", data=egywth)
smflm_mv = smfols_mv.fit()

smflm_mv.params

Intercept   -20.611237
TMK           1.997753
SDK           6.936602
NM            3.690851
VPM          -1.571116
dtype: float64

smflm_mv.summary()

# Predictions of the multivariate statsmodels fit on the NaN-free rows,
# added to the shared figure as another trace.
mv_pred_smf = smflm_mv.predict(egywthNoNA_MV)
egywthNoNA_MV["pred2"] = mv_pred_smf
fig.add_scatter(x=egywthNoNA_MV["TMK"], y=egywthNoNA_MV["pred2"], name="statsmodel MV")
fig

# Derive the weekday name of every date as a categorical input candidate.
egywth["Weekday"] = egywth.Date.dt.day_name()
egywth.Weekday

0       Wednesday
1        Thursday
2          Friday
3        Saturday
4          Sunday
          ...    
1093     Thursday
1094       Friday
1095     Saturday
1096       Sunday
1097       Monday
Name: Weekday, Length: 1098, dtype: object

# Add the weekday name as a further term of the multivariate formula.
smfols_mv_wd = smf.ols(formula="ES_Lab ~ TMK + SDK + NM + VPM + Weekday", data=egywth)
smflm_mv_wd = smfols_mv_wd.fit()

smflm_mv_wd.summary()

# Numeric weekday (Monday = 0 ... Sunday = 6) derived from the date.
egywth["WeekdayN"] = egywth.Date.dt.weekday
egywth.WeekdayN

0       2
1       3
2       4
3       5
4       6
       ..
1093    3
1094    4
1095    5
1096    6
1097    0
Name: WeekdayN, Length: 1098, dtype: int32

# C(...) tells the formula parser to treat the numeric weekday as a category
# instead of a continuous quantity.
smfols_mv_wd = smf.ols(formula="ES_Lab ~ TMK + SDK + NM + VPM + C(WeekdayN)", data=egywth)
smflm_mv_wd = smfols_mv_wd.fit()

smflm_mv_wd.summary()

# `*` adds both main effects and their interaction (weekday x HeizKuehlTage).
smfols_mv_wd2 = smf.ols(
    formula="ES_Lab ~ TMK + SDK + NM + VPM + C(WeekdayN)*HeizKuehlTage",
    data=egywth,
)
smflm_mv_wd2 = smfols_mv_wd2.fit()

smflm_mv_wd2.summary()

# Univariate model: prediction and residual (actual minus predicted).
egywth["Pred_UV"] = smflm.predict(egywth)
egywth["Residum_UV"] = egywth["ES_Lab"] - egywth["Pred_UV"]

px.histogram(egywth["Residum_UV"])

px.line(data_frame=egywth, x="Date", y=["ES_Lab", "Pred_UV"])

# Multivariate model with weekday/HeizKuehlTage interaction: same two columns.
egywth["Pred_MV"] = smflm_mv_wd2.predict(egywth)
egywth["Residum_MV"] = egywth["ES_Lab"] - egywth["Pred_MV"]

px.histogram(egywth["Residum_MV"])

px.line(data_frame=egywth, x="Date", y=["ES_Lab", "Pred_MV"])

# Mean squared error of the univariate model.
(egywth["Residum_UV"] ** 2).mean()

736.8486474358979

# Mean squared error of the multivariate model.
(egywth["Residum_MV"] ** 2).mean()

147.90321019931775

# Root mean squared error of the univariate model (same unit as ES_Lab).
np.sqrt((egywth["Residum_UV"] ** 2).mean())

27.144956206188617

# Root mean squared error of the multivariate model (same unit as ES_Lab).
np.sqrt((egywth["Residum_MV"] ** 2).mean())

12.16154637368611

# Mean absolute error of the univariate model.
np.abs(egywth["Residum_UV"]).mean()

21.892016248549638

# Mean absolute error of the multivariate model.
np.abs(egywth["Residum_MV"]).mean()

9.110498926081684

# Mean absolute percentage error of the univariate model, in percent:
#   mean(|residual / actual|) * 100
# Multiplying the MAE by 100 is not a percentage error, because the residual
# is never related to the actual value. Rows with ES_Lab == 0 are masked to
# NaN (and thereby skipped by mean()), since the relative error is undefined
# there.
# NOTE: the recorded output of this cell was produced by the former
# MAE * 100 expression.
(egywth.Residum_UV / egywth.ES_Lab.replace(0, np.nan)).abs().mean() * 100

2189.201624854964

# Mean absolute percentage error of the multivariate model, in percent:
#   mean(|residual / actual|) * 100
# Multiplying the MAE by 100 is not a percentage error, because the residual
# is never related to the actual value. Rows with ES_Lab == 0 are masked to
# NaN (and thereby skipped by mean()), since the relative error is undefined
# there.
# NOTE: the recorded output of this cell was produced by the former
# MAE * 100 expression.
(egywth.Residum_MV / egywth.ES_Lab.replace(0, np.nan)).abs().mean() * 100

911.0498926081684

# Coefficient of determination of the univariate model: R^2 = 1 - SSE / TSS.
es_lab_deviation = egywth["ES_Lab"] - egywth["ES_Lab"].mean()
TSS = (es_lab_deviation ** 2).sum()          # total sum of squares
SSE = (egywth["Residum_UV"] ** 2).sum()      # sum of squared residuals
R2 = 1 - SSE / TSS
R2

0.38061649401642605

# Coefficient of determination of the multivariate model: R^2 = 1 - SSE / TSS.
# Residum_MV is NaN for every row without a prediction or without a target, so
# both sums are restricted to the rows that actually have a residual.
# Otherwise TSS would be taken over more rows than SSE (the multivariate fits
# use fewer observations than there are ES_Lab values) and R^2 would be
# overstated.
# NOTE: the recorded output of this cell was produced before this restriction.
has_residual_mv = egywth.Residum_MV.notna()
es_lab_mv = egywth.ES_Lab[has_residual_mv]
TSS = (es_lab_mv - es_lab_mv.mean()).pow(2).sum()
SSE = egywth.Residum_MV[has_residual_mv].pow(2).sum()
R2 = 1 - SSE/TSS
R2

0.8806156958265965

# R^2 as reported by the univariate statsmodels fit.
smflm.rsquared

0.38061649401642594

# R^2 as reported by the statsmodels fit on TMK, SDK, NM and VPM (smflm_mv).
# NOTE(review): Residum_MV is derived from smflm_mv_wd2, a different model, so
# this value is not expected to equal an R^2 computed from Residum_MV.
smflm_mv.rsquared

0.8715365639979806

# score() of the univariate scikit-learn model, evaluated on its training data.
sklm.score(X, Y)

0.38061649401642617

# score() of the multivariate scikit-learn model, evaluated on its training data.
sklm_mv.score(X_MV, Y_MV)

0.8715365639979806

	Date	EV_HT_740	EV_NT_740	E_AV_Lab	E_SV_Lab	ES_Lab	DATUM_DT	STATIONS_ID	MESS_DATUM	QN_3	...	TMK	UPM	TXK	TNK	TGK	eor	TemperaturKlasse	HeizKuehlTage	QNS_4	QNF_4
0	2020-12-30 00:00:00+00:00	NaN	NaN	NaN	NaN	NaN	2020-12-30 00:00:00+00:00	4271.0	20201230.0	10.0	...	4.0	81.0	4.6	2.9	2.2	eor	Cold	Heizgradtag	nicht alle Parameter korrigiert	5
1	2020-12-31 00:00:00+00:00	NaN	NaN	1256.0	291.0	5.0	2020-12-31 00:00:00+00:00	4271.0	20201231.0	10.0	...	3.4	83.0	4.4	2.3	0.9	eor	Cold	Heizgradtag	nicht alle Parameter korrigiert	5
2	2021-01-01 00:00:00+00:00	0.0	4080.0	1221.0	290.0	1.0	2021-01-01 00:00:00+00:00	4271.0	20210101.0	10.0	...	2.0	90.0	3.0	1.1	0.3	eor	Cold	Heizgradtag	nicht alle Parameter korrigiert	5
3	2021-01-02 00:00:00+00:00	1170.0	2630.0	1243.0	284.0	2.0	2021-01-02 00:00:00+00:00	4271.0	20210102.0	10.0	...	3.3	95.0	4.0	2.6	2.0	eor	Cold	Heizgradtag	nicht alle Parameter korrigiert	5
4	2021-01-03 00:00:00+00:00	0.0	3750.0	1222.0	283.0	2.0	2021-01-03 00:00:00+00:00	4271.0	20210103.0	10.0	...	3.5	81.0	4.6	2.4	0.8	eor	Cold	Heizgradtag	nicht alle Parameter korrigiert	5

Dep. Variable:	ES_Lab	R-squared:	0.381
Model:	OLS	Adj. R-squared:	0.380
Method:	Least Squares	F-statistic:	663.7
Date:	Sat, 19 Jul 2025	Prob (F-statistic):	1.79e-114
Time:	20:54:21	Log-Likelihood:	-5107.2
No. Observations:	1082	AIC:	1.022e+04
Df Residuals:	1080	BIC:	1.023e+04
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	7.7784	1.540	5.050	0.000	4.756	10.801
TMK	3.1853	0.124	25.762	0.000	2.943	3.428

Omnibus:	61.376	Durbin-Watson:	0.603
Prob(Omnibus):	0.000	Jarque-Bera (JB):	68.207
Skew:	0.596	Prob(JB):	1.55e-15
Kurtosis:	2.696	Cond. No.	23.3

Dep. Variable:	ES_Lab	R-squared:	0.872
Model:	OLS	Adj. R-squared:	0.871
Method:	Least Squares	F-statistic:	1754.
Date:	Sat, 19 Jul 2025	Prob (F-statistic):	0.00
Time:	20:54:21	Log-Likelihood:	-4088.9
No. Observations:	1039	AIC:	8188.
Df Residuals:	1034	BIC:	8212.
Df Model:	4
Covariance Type:	nonrobust

Univariate Lineare Regression¶

Bestimmen der Parameter für zwei Punkte¶

Bestimmen der Parameter für mehrere Punkte¶

Beispiel¶

ML-Frameworks für Lineare Regression¶

SciKit-Learn¶

Statsmodels¶

Multivariate Lineare Regression¶

SciKit-Learn¶

Statsmodels¶

Umgang mit kategorischen Eingangsvariablen und Kombinationen¶

Modellqualität¶

Residuen¶

Mean Squared Error (MSE)¶

Root Mean Squared Error (RMSE)¶

Mean Absolute Error (MAE)¶

Mean Absolute Percentage Error (MAPE)¶

Bestimmungskoeffizient $R^2$¶

Omnibus:	72.688	Durbin-Watson:	1.079
Prob(Omnibus):	0.000	Jarque-Bera (JB):	184.271
Skew:	-0.378	Prob(JB):	9.68e-41
Kurtosis:	4.920	Cond. No.	140.

Omnibus:	70.515	Durbin-Watson:	1.075
Prob(Omnibus):	0.000	Jarque-Bera (JB):	176.111
Skew:	-0.370	Prob(JB):	5.73e-39
Kurtosis:	4.877	Cond. No.	156.

Omnibus:	64.597	Durbin-Watson:	1.101
Prob(Omnibus):	0.000	Jarque-Bera (JB):	184.425
Skew:	-0.282	Prob(JB):	8.97e-41
Kurtosis:	4.985	Cond. No.	777.

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	-20.6112	2.932	-7.030	0.000	-26.364	-14.858
TMK	1.9978	0.202	9.884	0.000	1.601	2.394
SDK	6.9366	0.180	38.489	0.000	6.583	7.290
NM	3.6909	0.336	10.986	0.000	3.032	4.350
VPM	-1.5711	0.295	-5.333	0.000	-2.149	-0.993

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	-20.8895	3.053	-6.842	0.000	-26.881	-14.898
Weekday[T.Monday]	0.0295	1.446	0.020	0.984	-2.809	2.868
Weekday[T.Saturday]	-0.7237	1.443	-0.502	0.616	-3.555	2.107
Weekday[T.Sunday]	0.7229	1.449	0.499	0.618	-2.120	3.566
Weekday[T.Thursday]	-0.2152	1.442	-0.149	0.881	-3.045	2.615
Weekday[T.Tuesday]	1.3861	1.442	0.961	0.337	-1.444	4.216
Weekday[T.Wednesday]	0.4161	1.445	0.288	0.773	-2.419	3.251
TMK	2.0007	0.203	9.874	0.000	1.603	2.398
SDK	6.9364	0.181	38.341	0.000	6.581	7.291
NM	3.6994	0.337	10.962	0.000	3.037	4.362
VPM	-1.5740	0.295	-5.328	0.000	-2.154	-0.994

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	-20.8600	3.111	-6.705	0.000	-26.965	-14.755
C(WeekdayN)[T.1]	1.3566	1.445	0.939	0.348	-1.479	4.192
C(WeekdayN)[T.2]	0.3866	1.447	0.267	0.789	-2.452	3.225
C(WeekdayN)[T.3]	-0.2447	1.443	-0.170	0.865	-3.077	2.588
C(WeekdayN)[T.4]	-0.0295	1.446	-0.020	0.984	-2.868	2.809
C(WeekdayN)[T.5]	-0.7533	1.444	-0.522	0.602	-3.586	2.080
C(WeekdayN)[T.6]	0.6934	1.448	0.479	0.632	-2.148	3.535
TMK	2.0007	0.203	9.874	0.000	1.603	2.398
SDK	6.9364	0.181	38.341	0.000	6.581	7.291
NM	3.6994	0.337	10.962	0.000	3.037	4.362
VPM	-1.5740	0.295	-5.328	0.000	-2.154	-0.994

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	-18.3083	3.315	-5.523	0.000	-24.813	-11.803
C(WeekdayN)[T.1]	2.1577	1.754	1.230	0.219	-1.284	5.600
C(WeekdayN)[T.2]	1.0729	1.769	0.606	0.544	-2.398	4.544
C(WeekdayN)[T.3]	0.3539	1.752	0.202	0.840	-3.084	3.791
C(WeekdayN)[T.4]	0.2426	1.761	0.138	0.890	-3.213	3.698
C(WeekdayN)[T.5]	-0.3289	1.747	-0.188	0.851	-3.758	3.100
C(WeekdayN)[T.6]	-0.3372	1.755	-0.192	0.848	-3.781	3.106
HeizKuehlTage[T.Kühlgradtag]	3.9698	5.915	0.671	0.502	-7.637	15.577
HeizKuehlTage[T.Normaltag]	5.2019	2.512	2.071	0.039	0.272	10.132
C(WeekdayN)[T.1]:HeizKuehlTage[T.Kühlgradtag]	-3.4876	9.169	-0.380	0.704	-21.480	14.505
C(WeekdayN)[T.2]:HeizKuehlTage[T.Kühlgradtag]	-18.8997	9.195	-2.055	0.040	-36.943	-0.856
C(WeekdayN)[T.3]:HeizKuehlTage[T.Kühlgradtag]	-5.9965	9.175	-0.654	0.514	-24.001	12.008
C(WeekdayN)[T.4]:HeizKuehlTage[T.Kühlgradtag]	-15.5864	8.474	-1.839	0.066	-32.214	1.042
C(WeekdayN)[T.5]:HeizKuehlTage[T.Kühlgradtag]	-4.4747	7.990	-0.560	0.576	-20.154	11.204
C(WeekdayN)[T.6]:HeizKuehlTage[T.Kühlgradtag]	-0.5844	7.663	-0.076	0.939	-15.621	14.452
C(WeekdayN)[T.1]:HeizKuehlTage[T.Normaltag]	-2.1827	3.128	-0.698	0.486	-8.322	3.956
C(WeekdayN)[T.2]:HeizKuehlTage[T.Normaltag]	-1.1488	3.096	-0.371	0.711	-7.225	4.927
C(WeekdayN)[T.3]:HeizKuehlTage[T.Normaltag]	-1.3698	3.121	-0.439	0.661	-7.494	4.755
C(WeekdayN)[T.4]:HeizKuehlTage[T.Normaltag]	0.6019	3.118	0.193	0.847	-5.516	6.720
C(WeekdayN)[T.5]:HeizKuehlTage[T.Normaltag]	-0.4470	3.157	-0.142	0.887	-6.643	5.749
C(WeekdayN)[T.6]:HeizKuehlTage[T.Normaltag]	4.1547	3.175	1.309	0.191	-2.075	10.385
TMK	1.9744	0.204	9.677	0.000	1.574	2.375
SDK	6.8849	0.182	37.827	0.000	6.528	7.242
NM	3.6449	0.336	10.851	0.000	2.986	4.304
VPM	-1.9031	0.309	-6.158	0.000	-2.510	-1.297