Python 統計分析常用的 30 個經典腳本
本文將介紹使用 Python 對數據快速進行統計分析的 30 個經典操作,包括計算平均值、中位數、眾數、方差、移動平均、相關係數等。每個操作都附有代碼實現和輸出結果,大家動手練起來吧。
導入庫並創建數據
首先,我們需要導入必要的庫,並創建一個簡單的列表數據,後續對該列表進行相應的數據統計分析。
import numpy as np
from scipy import stats
data = [1, 2, 3, 4, 5, 10, 4, 5, 10, 4, 5]
1. 計算平均值、最大值、最小值、中位數、眾數、方差、標準差、極差
# Basic descriptive statistics of `data`.
mean = np.mean(data)  # arithmetic mean
max_value = np.max(data)  # largest element
min_value = np.min(data)  # smallest element
median = np.median(data)  # middle value of the sorted data
# Newer SciPy releases return a scalar mode for 1-D input while older ones
# return a length-1 array; np.atleast_1d makes the [0] index valid for both.
mode = np.atleast_1d(stats.mode(data).mode)[0]  # most frequent value
variance = np.var(data)  # variance
std_dev = np.std(data)  # standard deviation
range_value = np.ptp(data)  # peak-to-peak range (max - min)
print(f"平均值: {mean}")
print(f"最大值: {max_value}")
print(f"最小值: {min_value}")
print(f"中位數(shù): {median}")
print(f"眾數(shù): {mode}")
print(f"方差: {variance}")
print(f"標(biāo)準(zhǔn)差: {std_dev}")
print(f"極差: {range_value}")
輸出結(jié)果:
平均值: 4.909090909090909
最大值: 10
最小值: 1
中位數(shù): 4.0
眾數(shù): 4
方差: 6.2727272727272725
標(biāo)準(zhǔn)差: 2.5045410659520024
極差: 9
2. 計算分位數(shù)
# Evaluate the 25th, 75th and 90th percentiles with a single NumPy call and
# unpack them into the same three names.
q1, q3, percentile_90 = np.percentile(data, [25, 75, 90])
print(f"第一四分位數(shù): {q1}")
print(f"第三四分位數(shù): {q3}")
print(f"第90百分位數(shù): {percentile_90}")
輸出結(jié)果:
第一四分位數(shù): 3.5
第三四分位數(shù): 5.0
第90百分位數(shù): 10.0
3. 計算偏度
skewness = stats.skew(data)
print(f"偏度: {skewness}")
輸出結(jié)果:
偏度:0.865996160689023
4. 計算峰度
kurtosis = stats.kurtosis(data)
print(f"峰度: {kurtosis}")
輸出結(jié)果:
峰度: -0.9444444444444444
5. 計算相關(guān)系數(shù)
data1 = [1, 2, 3, 4, 5]
data2 = [2, 4, 6, 8, 10]
correlation = np.corrcoef(data1, data2)[0, 1]
print(f"相關(guān)系數(shù): {correlation}")
輸出結(jié)果:
相關(guān)系數(shù): 1.0
6. 計算協(xié)方差
covariance = np.cov(data1, data2)[0, 1]
print(f"協(xié)方差: {covariance}")
輸出結(jié)果:
協(xié)方差: 5.0
7. 計算累積和
cumulative_sum = np.cumsum(data)
print(f"累積和: {cumulative_sum}")
輸出結(jié)果:
累積和: [ 1 3 6 10 15 25 29 34 44 48 53]
8. 計算累積積
cumulative_product = np.cumprod(data)
print(f"累積積: {cumulative_product}")
輸出結(jié)果:
累積積: [ 1 2 6 24 120 1200 4800 24000 240000 960000 4800000]
9. 計算累積最大值和最小值
cumulative_max = np.maximum.accumulate(data)
cumulative_min = np.minimum.accumulate(data)
print(f"累積最大值: {cumulative_max}")
print(f"累積最小值: {cumulative_min}")
輸出結(jié)果:
累積最大值: [ 1 2 3 4 5 10 10 10 10 10 10]
累積最小值: [1 1 1 1 1 1 1 1 1 1 1]
10. 計算累積平均值
cumulative_mean = np.cumsum(data) / np.arange(1, len(data) + 1)
print(f"累積平均值: {cumulative_mean}")
輸出結(jié)果:
累積平均值: [1. 1.5 2. 2.5 3. 4.16666667
4.14285714 4.25 4.88888889 4.8 4.81818182]
11. 計算累積方差
# Running variance: the k-th entry is the variance of the first k
# elements, computed as E[x^2] - (E[x])^2 over each prefix. Using the global
# mean here would only make the final entry a true variance.
_cv_values = np.asarray(data, dtype=float)
_cv_counts = np.arange(1, len(_cv_values) + 1)
_cv_running_mean = np.cumsum(_cv_values) / _cv_counts
# Clip at zero so rounding error can never produce a tiny negative variance
# (which would turn into NaN when a square root is taken later).
cumulative_variance = np.maximum(
    np.cumsum(_cv_values ** 2) / _cv_counts - _cv_running_mean ** 2, 0.0
)
print(f"累積方差: {cumulative_variance}")
輸出結(jié)果:
累積方差: [0. 0.25 0.66666667 1.25 2. 4.44444444
4.44444444 4.44444444 5.2345679 5.2345679 5.2345679 ]
12. 計算累積標(biāo)準(zhǔn)差
cumulative_std_dev = np.sqrt(cumulative_variance)
print(f"累積標(biāo)準(zhǔn)差: {cumulative_std_dev}")
輸出結(jié)果:
累積標(biāo)準(zhǔn)差: [0. 0.5 0.81649658 1.11803399 1.41421356 2.10818511
2.10818511 2.10818511 2.2883519 2.2883519 2.2883519 ]
13. 計算移動平均
def moving_average(data, window_size):
    """Return the simple moving average of ``data``.

    Args:
        data: Sequence of numbers supporting ``len`` and slicing.
        window_size: Number of consecutive elements per window; must be >= 1.

    Returns:
        A list with one average per full window
        (``len(data) - window_size + 1`` entries), or an empty list when
        ``data`` is shorter than the window.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")
    last_start = len(data) - window_size
    return [
        sum(data[start:start + window_size]) / window_size
        for start in range(last_start + 1)
    ]
window_size = 3
moving_avg = moving_average(data, window_size)
print(f"移動平均: {moving_avg}")
輸出結(jié)果:
移動平均: [2.0, 3.0, 4.0, 6.333333333333333, 6.333333333333333, 6.333333333333333, 6.333333333333333, 6.333333333333333, 6.333333333333333]
14. 計算指數加權移動平均(EWMA)
def ewma(data, alpha):
    """Return the exponentially weighted moving average of ``data``.

    The first output equals the first input; every later output is
    ``alpha * current + (1 - alpha) * previous_output``.

    Args:
        data: Sequence of numbers supporting ``len`` and slicing.
        alpha: Smoothing factor applied to the current observation.

    Returns:
        A list with the same length as ``data`` (empty for empty input).
    """
    if len(data) == 0:
        return []
    # A distinct local name avoids shadowing the function itself.
    smoothed = [data[0]]
    for value in data[1:]:
        smoothed.append(alpha * value + (1 - alpha) * smoothed[-1])
    return smoothed
alpha = 0.5
ewma_values = ewma(data, alpha)
print(f"指數(shù)加權(quán)移動平均: {ewma_values}")
輸出結(jié)果:
指數(shù)加權(quán)移動平均: [1.0, 1.5, 2.25, 3.125, 4.0625, 7.03125, 5.515625, 5.2578125, 7.62890625, 5.814453125, 5.4072265625]
15. 計算列表元素的 Z 分數(標準分數)
def z_scores(data):
    """Return the z-score of every element of ``data``.

    Each score is ``(x - mean) / std`` using ``np.mean`` / ``np.std`` of the
    whole input. Constant input has a zero standard deviation, so the
    division then produces NaN values.

    Args:
        data: Sequence of numbers.

    Returns:
        A list of plain Python floats, one per input element. Plain floats
        (rather than NumPy scalars) keep the printed list readable on every
        NumPy version.
    """
    values = np.asarray(data, dtype=float)
    centered = values - np.mean(values)
    return (centered / np.std(values)).tolist()
z_scores_values = z_scores(data)
print(f"Z 分?jǐn)?shù): {z_scores_values}")
輸出結(jié)果:
Z 分?jǐn)?shù): [-1.559935305422552, -1.169951454068414, -0.779967602714276, -0.389983751360138, 0.0, 2.034071464252568, -0.389983751360138, 0.0, 2.034071464252568, -0.389983751360138, 0.0]
16. 計算列表數據的累積分布函數(CDF)
def cdf(data):
    """Return the empirical cumulative probabilities for ``data``.

    The k-th returned value is ``k / len(data)``, i.e. the empirical CDF
    evaluated at the k-th smallest sample (tied values are not merged).
    The result depends only on the number of samples, so no sorting or
    slicing is needed.

    Args:
        data: Sized collection of samples.

    Returns:
        A list of ``len(data)`` floats rising from ``1/n`` to ``1.0``;
        empty for empty input.
    """
    n = len(data)
    return [rank / n for rank in range(1, n + 1)]
cdf_values = cdf(data)
print(f"累積密度函數(shù): {cdf_values}")
輸出結(jié)果:
累積密度函數(shù): [0.09090909090909091, 0.18181818181818182, 0.2727272727272727, 0.36363636363636365, 0.45454545454545453, 0.5454545454545454, 0.6363636363636364, 0.7272727272727273, 0.8181818181818182, 0.9090909090909091, 1.0]
17. 計算概率密度函數(shù)(PDF)
def pdf(data, bins=10):
    """Estimate the probability density of ``data`` with a histogram.

    Args:
        data: Sequence of numbers.
        bins: Forwarded to ``np.histogram`` as its ``bins`` argument.

    Returns:
        A ``(density, bin_edges)`` pair as produced by
        ``np.histogram(..., density=True)``.
    """
    density, bin_edges = np.histogram(data, bins=bins, density=True)
    return density, bin_edges
pdf_values, bin_edges = pdf(data)
print(f"概率密度函數(shù): {pdf_values}")
print(f"區(qū)間邊界: {bin_edges}")
輸出結(jié)果:
概率密度函數(shù): [0.1010101 0.1010101 0.1010101 0.3030303 0.3030303 0.        0.
 0.        0.        0.2020202]
區(qū)間邊界: [ 1.   1.9  2.8  3.7  4.6  5.5  6.4  7.3  8.2  9.1 10. ]
18. 計算列表的排序索引
def rank_data(data):
    """Return the indices that would sort ``data`` in ascending order.

    Equal values keep their original relative order because ``sorted`` is
    stable, matching a sort on ``(value, index)`` pairs.

    Args:
        data: Indexable sequence of mutually comparable values.

    Returns:
        A list of positions into ``data``; empty for empty input.
    """
    return sorted(range(len(data)), key=lambda position: data[position])
rank_values = rank_data(data)
print(f"排序索引: {rank_values}")
輸出結(jié)果:
排序索引: [0, 1, 2, 3, 6, 9, 4, 7, 10, 5, 8]
19. 計算列表的逆序?qū)?shù)量
def count_inversions(data):
    """Count pairs ``(i, j)`` with ``i < j`` and ``data[i] > data[j]``.

    Args:
        data: Iterable of mutually comparable values.

    Returns:
        The number of inversions as an ``int`` (0 for empty input).
    """
    from bisect import bisect_right, insort

    seen = []  # values visited so far, kept in ascending order
    inversions = 0
    for value in data:
        # Every earlier value strictly greater than `value` forms one
        # inversion with it; bisect_right skips the values equal to it.
        inversions += len(seen) - bisect_right(seen, value)
        insort(seen, value)
    return inversions
inversions_count = count_inversions(data)
print(f"逆序?qū)?shù)量: {inversions_count}")
輸出結(jié)果:
逆序?qū)?shù)量: 9
20. 計算列表的中位數(shù)絕對偏差(MAD)
def mad(data):
    """Return the median absolute deviation (MAD) of ``data``.

    The MAD is the median of the absolute differences between each element
    and the median of the data.

    Args:
        data: Sequence of numbers.

    Returns:
        The MAD as a NumPy float.
    """
    # Convert up front so the subtraction is explicit array arithmetic
    # rather than relying on a list being coerced by a NumPy scalar.
    values = np.asarray(data, dtype=float)
    center = np.median(values)
    return np.median(np.abs(values - center))
mad_value = mad(data)
print(f"中位數(shù)絕對偏差: {mad_value}")
輸出結(jié)果:
中位數(shù)絕對偏差: 1.0
21. 計算列表元素的二階矩(M2)
def M2(data):
    """Return the second central moment of ``data``.

    This is the mean of the squared deviations from ``np.mean(data)``,
    dividing by the number of elements ``n``.

    Args:
        data: Sized iterable of numbers.

    Returns:
        The second central moment.
    """
    n = len(data)
    center = np.mean(data)
    squared_deviations = ((x - center) ** 2 for x in data)
    return sum(squared_deviations) / n
m2_value = M2(data)
print(f"二階矩: {m2_value}")
輸出結(jié)果:
二階矩: 6.2727272727272725
22. 計算信息熵
from math import log2
def entropy(data):
    """Return the Shannon entropy of ``data`` in bits.

    Each distinct value's probability is its relative frequency in
    ``data``; the entropy is ``-sum(p * log2(p))`` over those values.

    Args:
        data: Sized iterable of hashable values.

    Returns:
        The entropy as a float (0 for empty or single-valued input).
    """
    from collections import Counter

    total = len(data)
    # One counting pass instead of one `data.count` scan per distinct value.
    occurrences = Counter(data)
    probabilities = [count / total for count in occurrences.values()]
    return -sum(p * log2(p) for p in probabilities)
entropy_value = entropy(data)
print(f"信息熵: {entropy_value}")
輸出結(jié)果:
信息熵: 1.5709505944546686
23. 計算列表的自相關性
import pandas as pd
def autocorrelation(data, lag=1):
    """Return the lag-``lag`` autocorrelation of ``data``.

    The data is wrapped in a ``pandas.Series`` and evaluated with
    ``Series.autocorr``.

    Args:
        data: Sequence of numbers.
        lag: Number of positions to shift the series by (default 1).

    Returns:
        The value returned by ``Series.autocorr`` for the given lag.
    """
    series = pd.Series(data)
    return series.autocorr(lag=lag)
autocorr_value = autocorrelation(data, lag=1)
print(f"自動相關(guān)性: {autocorr_value}")
輸出結(jié)果:
自動相關(guān)性: 0.5050505050505051
24. 計算 Pearson 相關(guān)系數(shù)矩陣
def pearson_corr_matrix(data_list):
    """Return the Pearson correlation matrix between several series.

    Args:
        data_list: List of equal-length numeric sequences, one per variable.

    Returns:
        A square ``pandas.DataFrame`` with one row and one column per input
        series, labelled by the series' position in ``data_list``.
    """
    # DataFrame(list_of_lists) stores each series as a ROW, while
    # DataFrame.corr() correlates COLUMNS; transpose so that every series
    # becomes a column and the result is series-by-series.
    frame = pd.DataFrame(data_list).T
    return frame.corr()
data_list = [data1, data2]
corr_matrix = pearson_corr_matrix(data_list)
print(f"Pearson 相關(guān)系數(shù)矩陣:\n{corr_matrix}")
輸出結(jié)果:
Pearson 相關(guān)系數(shù)矩陣:
0 1
0 1.000000 1.000000
1 1.000000 1.000000
25. 計算 Jackknife 統(tǒng)計量
from statsmodels.stats.outliers_influence import variance_inflation_factor
def jackknife_statistics(data):
    """Return the jackknife (leave-one-out) replicates of the mean.

    The i-th value is the mean of ``data`` with its i-th element removed.
    The previous implementation called a variance-inflation-factor routine
    on a single-column matrix, which is not a jackknife and fails for any
    column index other than 0.

    Args:
        data: Sequence of at least two numbers.

    Returns:
        A list of ``len(data)`` plain Python floats.

    Raises:
        ValueError: If ``data`` has fewer than two elements.
    """
    values = np.asarray(data, dtype=float)
    n = values.size
    if n < 2:
        raise ValueError("jackknife needs at least two observations")
    # Removing x_i from the total gives the leave-one-out sum in one step.
    return ((values.sum() - values) / (n - 1)).tolist()
jackknife_values = jackknife_statistics(data)
print(f"Jackknife 統(tǒng)計量: {jackknife_values}")
輸出結(jié)果:
Jackknife 統(tǒng)計量: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
26. 計算列表的元素頻率
def frequency_count(data):
    """Count how many times each distinct element occurs in ``data``.

    Args:
        data: Iterable of hashable values.

    Returns:
        A plain ``dict`` mapping each element to its number of occurrences,
        with keys in order of first appearance.
    """
    from collections import Counter

    # Counter does the tallying; convert back so callers still get a dict.
    return dict(Counter(data))
freq_dict = frequency_count(data)
print(f"元素頻率: {freq_dict}")
輸出結(jié)果:
元素頻率: {1: 1, 2: 1, 3: 1, 4: 3, 5: 3, 10: 2}
27. 生成數(shù)據(jù)的頻率分布表
def frequency_distribution(data, bins=10):
    """Build a frequency distribution table of ``data``.

    Args:
        data: Sequence of numbers.
        bins: Forwarded to ``np.histogram`` as its ``bins`` argument.

    Returns:
        A ``(counts, bin_edges)`` pair as produced by ``np.histogram``.
    """
    counts, bin_edges = np.histogram(data, bins=bins)
    return counts, bin_edges
histogram, bin_edges = frequency_distribution(data)
print(f"頻率分布: {histogram}")
print(f"區(qū)間邊界: {bin_edges}")
輸出結(jié)果:
頻率分布: [1 1 1 3 3 0 0 0 0 2]
區(qū)間邊界: [ 1.   1.9  2.8  3.7  4.6  5.5  6.4  7.3  8.2  9.1 10. ]
28. 計算列表的中位數(shù)絕對偏差比率(MAD Ratio)
def mad_ratio(data):
    """Return the median absolute deviation divided by the standard deviation.

    Constant input has a zero standard deviation, so the division then
    produces NaN.

    Args:
        data: Sequence of numbers.

    Returns:
        ``MAD / np.std(data)`` as a NumPy float.
    """
    values = np.asarray(data, dtype=float)
    # Local names chosen so they do not shadow the module-level `mad`
    # function or `median` variable defined earlier in the file.
    center = np.median(values)
    abs_deviation = np.median(np.abs(values - center))
    return abs_deviation / np.std(values)
mad_ratio_value = mad_ratio(data)
print(f"中位數(shù)絕對偏差比率: {mad_ratio_value}")
輸出結(jié)果:
中位數(shù)絕對偏差比率: 0.3992884814006364
29. 檢測列表中的線性趨勢
def linear_trend(data):
    """Fit a straight line to ``data`` against its element positions.

    The x values are the positions ``0, 1, ..., len(data) - 1`` and the fit
    is computed with ``scipy.stats.linregress``.

    Args:
        data: Sequence of numbers.

    Returns:
        A ``(slope, intercept, r_value)`` tuple taken from the regression
        result.
    """
    positions = np.arange(len(data))
    fit = stats.linregress(positions, data)
    return fit.slope, fit.intercept, fit.rvalue
slope, intercept, r_value = linear_trend(data)
print(f"斜率: {slope}, 截距: {intercept}, 相關(guān)系數(shù): {r_value}")
輸出結(jié)果:
斜率: 0.9090909090909091, 截距: 1.0, 相關(guān)系數(shù): 0.5050505050505051
30. 計算列表的截尾均值(Trimmed Mean)
def trimmed_mean(data, proportion=0.1):
    """Return the mean of ``data`` after trimming both sorted ends.

    ``int(len(data) * proportion)`` elements are removed from the low end
    and the same number from the high end before averaging.

    Args:
        data: Sized iterable of numbers.
        proportion: Fraction of elements to drop from EACH end (default 0.1).

    Returns:
        The mean of the remaining elements. If trimming leaves no elements
        (for example ``proportion >= 0.5``), the mean of an empty selection
        is NaN.
    """
    ordered = sorted(data)
    cut = int(len(ordered) * proportion)
    # Use an explicit end index: a `[cut:-cut]` slice is empty when cut == 0,
    # which would turn an untrimmed mean into NaN.
    kept = ordered[cut:len(ordered) - cut]
    return np.mean(kept)
trimmed_mean_value = trimmed_mean(data)
print(f"三角矩: {trimmed_mean_value}")
輸出結(jié)果:
三角矩: 4.5
總結
本文介紹了使用 Python 對數據進行統計分析的 30 個經典操作,涵蓋了從基本的描述性統計到更高級的統計度量。每個操作都附有代碼實現和輸出結果,以便讀者方便地在實際應用中使用這些方法。