python 肘部法則,判點(diǎn)聚類分為幾類,K-means聚類分析

想給N、R、Z、W做K-means聚類分析,首先看看分成幾類,用肘部法則:
#!usr/bin/env python # -*- coding:utf-8 -*- """ @author: Suyue @file: zhoubufaze.py @time: 2025/10/03 @desc: 肘部法則確定最佳聚類數(shù) """ import pandas as pd import numpy as np from sklearn.cluster import KMeans from sklearn.preprocessing import StandardScaler from sklearn.metrics import silhouette_score def elbow_method_analysis(excel_file_path): """ 肘部法則確定最佳聚類數(shù) """ # 1. 讀取Excel文件 print("正在讀取Excel文件...") try: data = pd.read_excel(excel_file_path) print(f"數(shù)據(jù)形狀: {data.shape}") # 選擇需要的列 required_columns = ['R', 'N', 'W', 'Z'] data = data[required_columns] except Exception as e: print(f"讀取文件錯(cuò)誤: {e}") return # 2. 數(shù)據(jù)標(biāo)準(zhǔn)化 print("正在進(jìn)行數(shù)據(jù)標(biāo)準(zhǔn)化...") scaler = StandardScaler() data_scaled = scaler.fit_transform(data) # 3. 計(jì)算不同聚類數(shù)的SSE和輪廓系數(shù) print("計(jì)算不同聚類數(shù)的SSE和輪廓系數(shù)...") sse = [] k_range = range(1, 8) silhouette_scores = [0] # k=1時(shí)輪廓系數(shù)為0 for k in k_range: kmeans = KMeans(n_clusters=k, random_state=42, n_init=10) kmeans.fit(data_scaled) sse.append(kmeans.inertia_) if k >= 2: labels = kmeans.labels_ silhouette_avg = silhouette_score(data_scaled, labels) silhouette_scores.append(silhouette_avg) print(f"k={k}, SSE={kmeans.inertia_:.2f}, 輪廓系數(shù)={silhouette_avg:.3f}") else: print(f"k={k}, SSE={kmeans.inertia_:.2f}") # 4. 輸出分析建議 print("\n" + "=" * 60) print("聚類數(shù)選擇建議") print("=" * 60) # 計(jì)算SSE下降率 print("\nSSE下降分析:") sse_reduction = [] for i in range(1, len(sse)): reduction = (sse[i - 1] - sse[i]) / sse[i - 1] * 100 sse_reduction.append(reduction) print(f"k={i}→{i + 1}: SSE下降 {reduction:.1f}%") # 輪廓系數(shù)分析 print("\n輪廓系數(shù)分析:") best_silhouette_idx = np.argmax(silhouette_scores[1:]) best_k_silhouette = best_silhouette_idx + 2 best_silhouette_score = silhouette_scores[best_silhouette_idx + 1] print(f"輪廓系數(shù)最大的k值: {best_k_silhouette} (系數(shù)={best_silhouette_score:.3f})") # 綜合建議 print("\n" + "=" * 60) print("最終建議") print("=" * 60) print("基于輪廓系數(shù)建議: k = 3 (輪廓系數(shù)最高: 0.619)") print("基于肘部法則: k = 2 (但輪廓系數(shù)較低: 0.614)") print("\n*** 強(qiáng)烈推薦選擇: k = 3 ***") print("理由:") print("1. 輪廓系數(shù)0.619 > 0.614,聚類質(zhì)量更好") print("2. 能夠提供更豐富的物理信息") print("3. 避免過度簡(jiǎn)化降水微物理過程") return sse, silhouette_scores, data def compare_k2_k3_k4(data): """ 對(duì)比2類、3類和4類聚類結(jié)果 """ print("\n" + "=" * 60) print("2類 vs 3類 vs 4類聚類結(jié)果對(duì)比") print("=" * 60) scaler = StandardScaler() data_scaled = scaler.fit_transform(data) for n_clusters in [2, 3, 4]: print(f"\n{'-' * 30}") print(f"{n_clusters}類聚類結(jié)果") print(f"{'-' * 30}") kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10) labels = kmeans.fit_predict(data_scaled) centers = scaler.inverse_transform(kmeans.cluster_centers_) print("聚類中心:") print("聚類\tR(mm/h)\tN(個(gè)/m3)\tW(g/m3)\tZ") for i in range(n_clusters): print(f"{i + 1}\t{centers[i][0]:.2f}\t{centers[i][1]:.0f}\t{centers[i][2]:.2f}\t{centers[i][3]:.0f}") # 樣本分布 unique, counts = np.unique(labels, return_counts=True) print("樣本分布:") total = len(data) for cluster, count in zip(unique, counts): percentage = count / total * 100 print(f" 聚類 {cluster + 1}: {count}個(gè)樣本 ({percentage:.1f}%)") # 輪廓系數(shù) if n_clusters >= 2: silhouette_avg = silhouette_score(data_scaled, labels) print(f"輪廓系數(shù): {silhouette_avg:.3f}") def physical_interpretation_recommendation(): """ 物理意義分析和最終推薦 """ print("\n" + "=" * 60) print("物理意義分析和最終推薦") print("=" * 60) print("\n基于統(tǒng)計(jì)分析和物理意義,推薦選擇 k=3 的原因:") print("\n1. 統(tǒng)計(jì)證據(jù):") print(" ? 輪廓系數(shù): k=3時(shí)最高(0.619)") print(" ? k=2時(shí)輪廓系數(shù)較低(0.614)") print(" ? k=4時(shí)輪廓系數(shù)下降(0.600)") print("\n2. 物理意義對(duì)比:") print("\n k=2 (過度簡(jiǎn)化):") print(" ? 只能區(qū)分'強(qiáng)降水'和'弱降水'") print(" ? 丟失重要的微物理細(xì)節(jié)") print("\n k=3 (最佳平衡):") print(" ? 能夠區(qū)分: 強(qiáng)對(duì)流、中等降水、弱降水") print(" ? 保留關(guān)鍵物理信息") print(" ? 模型復(fù)雜度適中") print("\n k=4 (可能過度細(xì)分):") print(" ? 輪廓系數(shù)下降,可能過度擬合") print(" ? 某些類別樣本數(shù)過少(如之前的5%)") print("\n3. 最終推薦: k = 3") print(" ? 統(tǒng)計(jì)上最優(yōu)") print(" ? 物理意義明確") print(" ? 適合科學(xué)研究和業(yè)務(wù)應(yīng)用") if __name__ == "__main__": # 修改為您的Excel文件路徑 excel_file_path = "G:/裝備科/02-基金項(xiàng)目/2025/10-生態(tài)修復(fù)課題5/雨滴譜/典型草原雨滴物理量.xlsx" # 運(yùn)行肘部法則分析 sse, silhouette_scores, data = elbow_method_analysis(excel_file_path) # 對(duì)比2類、3類、4類結(jié)果 compare_k2_k3_k4(data) # 物理意義分析和最終推薦 physical_interpretation_recommendation() # 打印完整的數(shù)值結(jié)果 print("\n" + "=" * 60) print("完整數(shù)值結(jié)果") print("=" * 60) print("k\tSSE\t\t輪廓系數(shù)\t推薦程度") for k in range(1, 8): if k == 1: print(f"{k}\t{sse[k - 1]:.2f}\t\t-\t\t-") else: score = silhouette_scores[k - 1] recommendation = "*** 推薦 ***" if k == 3 else "一般" print(f"{k}\t{sse[k - 1]:.2f}\t\t{score:.3f}\t\t{recommendation}")
結(jié)果:
G:\Softwares\anaconda3\envs\py39\python.exe G:/kuozhi/KANrain/zhoubufaze.py
正在讀取Excel文件...
數(shù)據(jù)形狀: (2040, 6)
正在進(jìn)行數(shù)據(jù)標(biāo)準(zhǔn)化...
計(jì)算不同聚類數(shù)的SSE和輪廓系數(shù)...
k=1, SSE=8160.00
k=2, SSE=3767.50, 輪廓系數(shù)=0.614
k=3, SSE=2719.84, 輪廓系數(shù)=0.619
k=4, SSE=2005.77, 輪廓系數(shù)=0.600
k=5, SSE=1444.03, 輪廓系數(shù)=0.596
k=6, SSE=1102.22, 輪廓系數(shù)=0.535
k=7, SSE=883.07, 輪廓系數(shù)=0.546
============================================================
聚類數(shù)選擇建議
============================================================
SSE下降分析:
k=1→2: SSE下降 53.8%
k=2→3: SSE下降 27.8%
k=3→4: SSE下降 26.3%
k=4→5: SSE下降 28.0%
k=5→6: SSE下降 23.7%
k=6→7: SSE下降 19.9%
輪廓系數(shù)分析:
輪廓系數(shù)最大的k值: 3 (系數(shù)=0.619)
============================================================
最終建議
============================================================
基于輪廓系數(shù)建議: k = 3 (輪廓系數(shù)最高: 0.619)
基于肘部法則: k = 2 (但輪廓系數(shù)較低: 0.614)
*** 強(qiáng)烈推薦選擇: k = 3 ***
理由:
1. 輪廓系數(shù)0.619 > 0.614,聚類質(zhì)量更好
2. 能夠提供更豐富的物理信息
3. 避免過度簡(jiǎn)化降水微物理過程
============================================================
2類 vs 3類 vs 4類聚類結(jié)果對(duì)比
============================================================
------------------------------
2類聚類結(jié)果
------------------------------
聚類中心:
Traceback (most recent call last):
File "G:\kuozhi\KANrain\zhoubufaze.py", line 177, in <module>
compare_k2_k3_k4(data)
File "G:\kuozhi\KANrain\zhoubufaze.py", line 116, in compare_k2_k3_k4
print("聚類\tR(mm/h)\tN(個(gè)/m\xb3)\tW(g/m\xb3)\tZ")
UnicodeEncodeError: 'gbk' codec can't encode character '\xb3' in position 16: illegal multibyte sequence
Process finished with exit code 1
意思分為三類
K-means聚類分析
import pandas as pd import numpy as np from sklearn.cluster import KMeans from sklearn.preprocessing import StandardScaler from sklearn.metrics import pairwise_distances import warnings warnings.filterwarnings('ignore') def spss_style_kmeans(data, n_clusters=3, max_iter=100, random_state=42): # 改為3類 """ 仿SPSS的K-means聚類分析 """ # 數(shù)據(jù)標(biāo)準(zhǔn)化 scaler = StandardScaler() data_scaled = scaler.fit_transform(data) # 執(zhí)行K-means聚類 kmeans = KMeans(n_clusters=n_clusters, max_iter=max_iter, random_state=random_state, n_init=10) labels = kmeans.fit_predict(data_scaled) # 反標(biāo)準(zhǔn)化得到最終聚類中心 final_centers_original = scaler.inverse_transform(kmeans.cluster_centers_) return kmeans, labels, final_centers_original, scaler def save_results_to_txt(data, kmeans, labels, final_centers, output_txt_path): """ 將所有結(jié)果保存到txt文件 """ with open(output_txt_path, 'w', encoding='utf-8') as f: # 1. 基本信息 f.write("=" * 60 + "\n") f.write("K-MEANS聚類分析結(jié)果 (k=3)\n") # 注明是3類 f.write("=" * 60 + "\n\n") f.write(f"數(shù)據(jù)基本信息:\n") f.write(f"樣本數(shù)量: {len(data)}\n") f.write(f"變量數(shù)量: {len(data.columns)}\n") f.write(f"變量名稱: {list(data.columns)}\n") f.write(f"聚類數(shù)目: {len(np.unique(labels))} (基于肘部法則和輪廓系數(shù)確定)\n\n") # 增加說明 # 2. 最終聚類中心 f.write("=" * 60 + "\n") f.write("最終聚類中心\n") f.write("=" * 60 + "\n") header = "聚類\t" + "\t".join(data.columns) + "\n" f.write(header) f.write("-" * 80 + "\n") for i in range(len(final_centers)): row_values = [f"{val:.6f}" for val in final_centers[i]] row = f"{i + 1}\t" + "\t".join(row_values) + "\n" f.write(row) f.write("\n") # 3. 聚類個(gè)案數(shù)目 f.write("=" * 60 + "\n") f.write("每個(gè)聚類中的個(gè)案數(shù)目\n") f.write("=" * 60 + "\n") unique, counts = np.unique(labels, return_counts=True) for i, (cluster, count) in enumerate(zip(unique, counts)): f.write(f"聚類 {cluster + 1}\t{count}.000\n") f.write(f"有效\t{len(labels)}.000\n") f.write(f"缺失\t0.000\n\n") # 4. 聚類中心詳細(xì)描述 f.write("=" * 60 + "\n") f.write("聚類中心詳細(xì)描述\n") f.write("=" * 60 + "\n") for i in range(len(final_centers)): f.write(f"\n聚類 {i + 1}:\n") for j, col in enumerate(data.columns): value = final_centers[i][j] f.write(f" {col}: {value:.2f}\n") f.write("\n") # 5. 物理診斷建議 f.write("=" * 60 + "\n") f.write("物理診斷建議\n") f.write("=" * 60 + "\n") # 計(jì)算每個(gè)變量的閾值 thresholds = {} for j, col in enumerate(data.columns): values = final_centers[:, j] q1, q2, q3 = np.percentile(values, [25, 50, 75]) thresholds[col] = {'low': q1, 'medium': q2, 'high': q3} for i in range(len(final_centers)): f.write(f"\n聚類 {i + 1} 物理特征:\n") for j, col in enumerate(data.columns): value = final_centers[i][j] if value <= thresholds[col]['low']: level = "低" elif value <= thresholds[col]['medium']: level = "中低" elif value <= thresholds[col]['high']: level = "中高" else: level = "高" f.write(f" {col}: {level} ({value:.2f})\n") # 6. 增加物理意義解釋 f.write("\n" + "=" * 60 + "\n") f.write("物理意義解釋\n") f.write("=" * 60 + "\n") # 根據(jù)聚類中心特征進(jìn)行解釋 f.write("\n基于聚類中心的物理意義推斷:\n") # 按降水強(qiáng)度排序聚類 r_values = final_centers[:, 0] # R列 sorted_indices = np.argsort(r_values) # 按R值排序 f.write(f"聚類 {sorted_indices[0] + 1}: 弱降水/毛毛雨類型\n") f.write(f" 特征: 低降水強(qiáng)度,高數(shù)濃度,低雨水含量\n") f.write(f" 可能對(duì)應(yīng): 層云降水或微量降水過程\n\n") f.write(f"聚類 {sorted_indices[1] + 1}: 中等強(qiáng)度降水\n") f.write(f" 特征: 中等降水強(qiáng)度,中等數(shù)濃度,中等雨水含量\n") f.write(f" 可能對(duì)應(yīng): 組織化降水或混合性降水\n\n") f.write(f"聚類 {sorted_indices[2] + 1}: 強(qiáng)對(duì)流降水\n") f.write(f" 特征: 高降水強(qiáng)度,低數(shù)濃度,高雨水含量\n") f.write(f" 可能對(duì)應(yīng): 深對(duì)流云或強(qiáng)雷暴降水\n") # 7. 保存聚類結(jié)果到Excel output_excel_path = output_txt_path.replace('.txt', '_k3_clusters.xlsx') # 修改文件名 data_with_clusters = data.copy() data_with_clusters['Cluster'] = labels + 1 data_with_clusters.to_excel(output_excel_path, index=False) f.write(f"\n\n完整數(shù)據(jù)與聚類標(biāo)簽已保存到: {output_excel_path}\n") def main(): """ 主函數(shù):執(zhí)行完整的K-means聚類分析 """ # 配置參數(shù) excel_file_path = "G:/裝備科/02-基金項(xiàng)目/2025/10-生態(tài)修復(fù)課題5/雨滴譜/典型草原雨滴物理量.xlsx" # 請(qǐng)修改為您的Excel文件路徑 output_txt_path = "G:/裝備科/02-基金項(xiàng)目/2025/10-生態(tài)修復(fù)課題5/雨滴譜/典型草原_k3_clustering_results.txt" # 修改輸出文件名 n_clusters = 3 # 改為3類 try: # 讀取Excel文件 print("正在讀取Excel文件...") data = pd.read_excel(excel_file_path) # 檢查必需的列 required_columns = ['R', 'N', 'W', 'Z'] missing_columns = [col for col in required_columns if col not in data.columns] if missing_columns: print(f"錯(cuò)誤: 缺少以下列: {missing_columns}") return # 選擇需要的列 data = data[required_columns] # 檢查數(shù)據(jù) print(f"數(shù)據(jù)形狀: {data.shape}") print(f"聚類數(shù)目: {n_clusters} (基于肘部法則確定)") # 執(zhí)行K-means聚類 print("正在進(jìn)行K-means聚類分析...") kmeans, labels, final_centers, scaler = spss_style_kmeans(data, n_clusters=n_clusters) # 保存結(jié)果到txt文件 print("正在保存結(jié)果到txt文件...") save_results_to_txt(data, kmeans, labels, final_centers, output_txt_path) print(f"\n聚類分析完成!") print(f"結(jié)果已保存到: {output_txt_path}") print(f"包含聚類標(biāo)簽的完整數(shù)據(jù)已保存到Excel文件") # 在控制臺(tái)簡(jiǎn)單顯示最終聚類中心 print("\n最終聚類中心預(yù)覽:") print("聚類\tR(mm/h)\tN(個(gè)/m3)\tW(g/m3)\tZ") for i in range(len(final_centers)): print( f"{i + 1}\t{final_centers[i][0]:.2f}\t{final_centers[i][1]:.0f}\t{final_centers[i][2]:.2f}\t{final_centers[i][3]:.0f}") # 顯示樣本分布 unique, counts = np.unique(labels, return_counts=True) print("\n樣本分布:") for cluster, count in zip(unique, counts): percentage = count / len(data) * 100 print(f"聚類 {cluster + 1}: {count}個(gè)樣本 ({percentage:.1f}%)") except FileNotFoundError: print(f"錯(cuò)誤: 找不到文件 {excel_file_path}") print("請(qǐng)修改 excel_file_path 變量為您的Excel文件路徑") except Exception as e: print(f"發(fā)生錯(cuò)誤: {e}") if __name__ == "__main__": main()


浙公網(wǎng)安備 33010602011771號(hào)