统计分析
统计分析是数据分析的核心,用于从数据中提取有意义的信息和模式。
描述统计
基本统计量
python
import pandas as pd
import numpy as np
# Sample data: the integers 1 through 10.
data = pd.Series(list(range(1, 11)))

# Central tendency: arithmetic mean and median.
mean, median = data.mean(), data.median()

# mode() returns a Series holding every most-frequent value; each value here
# occurs exactly once, so all of them are returned and the first one is kept.
mode = data.mode()[0]

# Dispersion: sample standard deviation and sample variance
# (ddof=1 is the pandas default, spelled out for clarity).
std = data.std(ddof=1)
var = data.var(ddof=1)

# Extremes and the range spanned between them.
min_val, max_val = data.min(), data.max()
range_val = max_val - min_val

# First and third quartiles.
q1, q3 = data.quantile(0.25), data.quantile(0.75)

# Summary table of descriptive statistics.
print(data.describe())
频数分布
python
# Absolute frequency: how many times each distinct value occurs.
frequency = data.value_counts()
# Relative frequency, scaled from a fraction to a percentage.
frequency_percent = data.value_counts(normalize=True).mul(100)
# Grouped counts: assign each value to one of the intervals delimited by
# `bins` (right-closed by default), then count per interval in interval order.
bins = [0, 3, 6, 10]
grouped = pd.cut(data, bins)
group_counts = grouped.value_counts().sort_index()
print(group_counts)
假设检验
t 检验
python
from scipy import stats
# One-sample t-test: is the sample mean different from the hypothesised
# population mean of 1.0?
sample = [1.2, 1.5, 1.3, 1.4, 1.6]
t_stat, p_value = stats.ttest_1samp(sample, 1.0)
print(f"单样本 t 检验: t={t_stat:.4f}, p={p_value:.4f}")

# Independent two-sample t-test: do two unrelated groups share a mean?
group1 = [1.2, 1.5, 1.3, 1.4, 1.6]
group2 = [1.1, 1.2, 1.0, 1.3, 1.2]
t_stat, p_value = stats.ttest_ind(a=group1, b=group2)
print(f"独立样本 t 检验: t={t_stat:.4f}, p={p_value:.4f}")

# Paired t-test: the same subjects measured before and after.
before = [100, 105, 98, 102, 101]
after = [95, 100, 96, 98, 99]
t_stat, p_value = stats.ttest_rel(a=before, b=after)
print(f"配对样本 t 检验: t={t_stat:.4f}, p={p_value:.4f}")
方差分析
python
# One-way ANOVA: three groups of five consecutive integers, starting at
# 1, 2 and 3 respectively, are tested for a difference in means.
group1, group2, group3 = (list(range(start, start + 5)) for start in (1, 2, 3))
f_stat, p_value = stats.f_oneway(group1, group2, group3)
print(f"单因素方差分析: F={f_stat:.4f}, p={p_value:.4f}")
回归分析
线性回归
python
from sklearn.linear_model import LinearRegression
import numpy as np
# Training data: one feature stored as a column vector, plus the targets.
X = np.arange(1, 6).reshape(-1, 1)
y = np.array([2, 4, 5, 4, 5])
# fit() returns the estimator itself, so construction and training are chained.
model = LinearRegression().fit(X, y)
# In-sample predictions for the training inputs.
y_pred = model.predict(X)
# Fitted parameters: the single feature's coefficient and the intercept.
slope, intercept = model.coef_[0], model.intercept_
print(f"斜率: {slope:.4f}")
print(f"截距: {intercept:.4f}")
print(f"预测值: {y_pred}")
多元线性回归
python
# Multiple-regression data: each row holds two features.
# NOTE: the feature columns are deliberately not collinear. If the second
# column were an exact linear function of the first (e.g. x2 = x1 + 1), the
# least-squares problem would have infinitely many solutions and the
# individual coefficients printed below would not be interpretable.
X = np.array([[1, 2], [2, 1], [3, 4], [4, 3], [5, 6]])
y = np.array([3, 5, 6, 8, 9])
# Create the estimator.
model = LinearRegression()
# Fit it to the two-feature data.
model.fit(X, y)
# Fitted parameters: one coefficient per feature, plus the intercept.
coefficients = model.coef_
intercept = model.intercept_
print(f"系数: {coefficients}")
print(f"截距: {intercept:.4f}")
模型评估
python
from sklearn.metrics import mean_squared_error, r2_score
# Recompute the predictions from the current model and inputs before scoring.
# The multiple-regression snippet rebinds X, y and model, while y_pred still
# holds the predictions of the earlier single-feature model; scoring the new
# y against those stale predictions would give meaningless metrics.
y_pred = model.predict(X)
# Mean squared error between the targets and the predictions.
mse = mean_squared_error(y, y_pred)
# Coefficient of determination (R²).
r2 = r2_score(y, y_pred)
print(f"均方误差: {mse:.4f}")
print(f"R² 得分: {r2:.4f}")
相关性分析
皮尔逊相关系数
python
import pandas as pd
# Three numeric columns: x increases, z decreases, y roughly follows x.
df = pd.DataFrame(
    {
        'x': list(range(1, 6)),
        'y': [2, 4, 5, 4, 5],
        'z': list(range(5, 0, -1)),
    }
)
# Pairwise Pearson correlation between all columns
# (Pearson is the default method, spelled out here for clarity).
correlation_matrix = df.corr(method='pearson')
print(correlation_matrix)
散点矩阵
python
import seaborn as sns
import matplotlib.pyplot as plt
# Draw a grid of pairwise plots for the columns of df.
sns.pairplot(data=df)
plt.show()
热力图
python
# Render the correlation matrix as an annotated heatmap. The colour scale is
# pinned to [-1, 1], the full range a correlation coefficient can take.
sns.heatmap(
    correlation_matrix,
    vmin=-1,
    vmax=1,
    cmap='coolwarm',
    annot=True,
)
plt.title('Correlation Heatmap')
plt.show()