How to Win at League of Legends (LoL)? Which Features Correlate Most with Winning
# Import packages and data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_style('darkgrid')
df = pd.read_csv('/home/kesci/input/lol8974/high_diamond_ranked_10min.csv')
df.head()
# Check for missing values and data types
df.info()
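# The info() output already shows the non-null counts; as a small optional
# extra, this prints a single-number confirmation that no cells are missing:
print(df.isnull().sum().sum())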
df_clean = df.copy()
# Drop some unnecessary columns. For example, blueFirstBlood/redFirstBlood,
# blueEliteMonsters/redEliteMonsters, and blueDeaths/redKills carry duplicated information
cols = ['gameId', 'redFirstBlood', 'redKills', 'redEliteMonsters', 'redDragons', 'redTotalMinionsKilled',
        'redTotalJungleMinionsKilled', 'redGoldDiff', 'redExperienceDiff', 'redCSPerMin', 'redGoldPerMin', 'redHeralds',
        'blueGoldDiff', 'blueExperienceDiff', 'blueCSPerMin', 'blueGoldPerMin', 'blueTotalMinionsKilled']
df_clean = df_clean.drop(cols, axis=1)
df_clean.info()
# Next, let's examine the relationships between the blue team's features
g = sns.PairGrid(data=df_clean, vars=['blueKills', 'blueAssists', 'blueWardsPlaced', 'blueTotalGold'],
                 hue='blueWins', height=3, palette='Set1')  # 'size' was renamed 'height' in newer seaborn
g.map_diag(plt.hist)
g.map_offdiag(plt.scatter)
g.add_legend();
# We can see that many features are highly correlated; let's look at the correlation matrix
plt.figure(figsize=(16, 12))
sns.heatmap(df_clean.drop('blueWins', axis=1).corr(), cmap='YlGnBu', annot=True, fmt='.2f', vmin=0);
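# If you prefer to list the strongly correlated pairs programmatically instead
# of reading them off the heatmap, a minimal sketch (the 0.8 cutoff is an
# arbitrary choice for illustration, not a value from the original analysis):
corr_abs = df_clean.drop('blueWins', axis=1).corr().abs()
upper = corr_abs.where(np.triu(np.ones(corr_abs.shape), k=1).astype(bool))  # keep each pair once
pairs = upper.stack()
print(pairs[pairs > 0.8].sort_values(ascending=False))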
# Based on the correlation matrix, let's prune the dataset a bit to avoid collinearity
cols = ['blueAvgLevel', 'redWardsPlaced', 'redWardsDestroyed', 'redDeaths', 'redAssists', 'redTowersDestroyed',
        'redTotalExperience', 'redTotalGold', 'redAvgLevel']
df_clean = df_clean.drop(cols, axis=1)
# Next, drop the columns that are only weakly correlated with blueWins
corr_list = df_clean[df_clean.columns[1:]].apply(lambda x: x.corr(df_clean['blueWins']))
cols = []
for col in corr_list.index:
    if corr_list[col] > 0.2 or corr_list[col] < -0.2:
        cols.append(col)
cols
df_clean = df_clean[cols]
df_clean.head()
df_clean.hist(alpha = 0.7, figsize=(12,10), bins=5);
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

X = df_clean
y = df['blueWins']
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Fit the model
clf_nb = GaussianNB()
clf_nb.fit(X_train, y_train)
pred_nb = clf_nb.predict(X_test)
# Get the accuracy score (y_true first, then y_pred)
acc_nb = accuracy_score(y_test, pred_nb)
print(acc_nb)
# Fit a decision tree model
from sklearn import tree
from sklearn.model_selection import GridSearchCV

dt = tree.DecisionTreeClassifier()  # avoid shadowing the `tree` module
# Search for the best parameters
grid = {'min_samples_split': [5, 10, 20, 50, 100]}
clf_tree = GridSearchCV(dt, grid, cv=5)
clf_tree.fit(X_train, y_train)
pred_tree = clf_tree.predict(X_test)
# Get the accuracy score
acc_tree = accuracy_score(y_test, pred_tree)
print(acc_tree)
# Fit a random forest model
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
# Search for the best parameters
grid = {'n_estimators': [100, 200, 300, 400, 500], 'max_depth': [2, 5, 10]}
clf_rf = GridSearchCV(rf, grid, cv=5)
clf_rf.fit(X_train, y_train)
pred_rf = clf_rf.predict(X_test)
# Get the accuracy score
acc_rf = accuracy_score(y_test, pred_rf)
print(acc_rf)
# Fit a logistic regression model
from sklearn.linear_model import LogisticRegression

lm = LogisticRegression()
lm.fit(X_train, y_train)
# Get the accuracy score
pred_lm = lm.predict(X_test)
acc_lm = accuracy_score(y_test, pred_lm)
print(acc_lm)
# Fit a k-nearest neighbors model
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
# Search for the best parameters
grid = {"n_neighbors": np.arange(1, 100)}
clf_knn = GridSearchCV(knn, grid, cv=5)
clf_knn.fit(X_train, y_train)
# Get the accuracy score
pred_knn = clf_knn.predict(X_test)
acc_knn = accuracy_score(y_test, pred_knn)
print(acc_knn)
# Collect the accuracy scores of all five models into one table
data_dict = {'Naive Bayes': [acc_nb], 'DT': [acc_tree], 'Random Forest': [acc_rf],
             'Logistic Regression': [acc_lm], 'K_nearest Neighbors': [acc_knn]}
df_c = pd.DataFrame.from_dict(data_dict, orient='index', columns=['Accuracy Score'])
print(df_c)
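# A visual comparison can be easier to scan than the printed table; a small
# optional sketch using the df_c table built above:
df_c.sort_values('Accuracy Score').plot(kind='barh', legend=False)
plt.xlabel('Accuracy Score');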
# Recall and precision
from sklearn.metrics import recall_score, precision_score

# Logistic regression metrics (note: sklearn scorers take y_true first)
recall_lm = recall_score(y_test, pred_lm, average=None)
precision_lm = precision_score(y_test, pred_lm, average=None)
print('precision score for logistic regression: {}\nrecall score for logistic regression: {}'.format(precision_lm, recall_lm))
# Random forest metrics
recall_rf = recall_score(y_test, pred_rf, average=None)
precision_rf = precision_score(y_test, pred_rf, average=None)
print('precision score for random forest: {}\nrecall score for random forest: {}'.format(precision_rf, recall_rf))
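# Precision and recall both summarize the confusion matrix; printing the matrix
# itself makes the trade-off concrete (a small optional check, shown here for
# the logistic regression model):
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, pred_lm))  # rows = true class, columns = predicted class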
df_clean.columns
lm.coef_
np.exp(lm.coef_)
# Combine the coefficients and their odds ratios (exp(coef)) into one table
coef_data = np.concatenate((lm.coef_, np.exp(lm.coef_)), axis=0)
coef_df = pd.DataFrame(data=coef_data, columns=df_clean.columns).T.reset_index().rename(columns={'index': 'Var', 0: 'coef', 1: 'oddRatio'})
coef_df.sort_values(by='coef', ascending=False)
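# How to read the oddRatio column: the model was fit on MinMax-scaled features,
# so each coefficient is the change in the log-odds of a blue win as a feature
# moves from its minimum to its maximum, and exp(coef) is the corresponding
# multiplier on the odds. A small illustration using the table above:
top = coef_df.sort_values(by='coef', ascending=False).iloc[0]
print('moving {} from min to max multiplies the odds of a blue win by {:.2f}'.format(top['Var'], top['oddRatio']))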
# Use PCA to visualize the results
X = df_clean
y = df['blueWins']
# PCA is sensitive to scale, so standardize the dataset first
from sklearn import preprocessing

# Standardize the features
X = preprocessing.StandardScaler().fit_transform(X)
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
components = pca.fit_transform(X)
print(pca.explained_variance_ratio_)
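# Two components will not capture all of the variance; summing the ratios above
# shows how much of it this 2-D view retains (a small optional check):
print('variance captured by 2 components: {:.1%}'.format(pca.explained_variance_ratio_.sum()))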
# Build a DataFrame for visualization
df_vis = pd.DataFrame(data=components, columns=['pc1', 'pc2'])
df_vis = pd.concat([df_vis, df['blueWins']], axis=1)
X = df_vis[['pc1', 'pc2']]
y = df_vis['blueWins']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Refit the logistic regression on the PCA-projected data
lm.fit(X_train, y_train)
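# The post does not report the accuracy on the two PCA components; if you want
# to know how much the 2-D projection gives up, a quick optional check:
pred_pca = lm.predict(X_test)
print(accuracy_score(y_test, pred_pca))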
# Visualization helper: plot a classifier's decision boundary in PCA space
from matplotlib.colors import ListedColormap

def DecisionBoundary(clf):
    X = df_vis[['pc1', 'pc2']]
    y = df_vis['blueWins']
    h = .02
    # Create color maps
    cmap_light = ListedColormap(['#FFAAAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#0000FF'])
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max] x [y_min, y_max].
    x_min, x_max = X.iloc[:, 0].min() - 1, X.iloc[:, 0].max() + 1
    y_min, y_max = X.iloc[:, 1].min() - 1, X.iloc[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure(figsize=(8, 8))
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    # Also plot the training points
    plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y, cmap=cmap_bold)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.show()
DecisionBoundary(lm)
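# The same helper works for any classifier refit on the two PCA components;
# for example, a k-nearest-neighbors model (k=50 here is an arbitrary choice
# for illustration, not a tuned value):
from sklearn.neighbors import KNeighborsClassifier
knn_vis = KNeighborsClassifier(n_neighbors=50)
knn_vis.fit(X_train, y_train)
DecisionBoundary(knn_vis)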
Source: ITPUB Blog, http://blog.itpub.net/69977871/viewspace-2700669/. Please credit the source when reprinting; otherwise legal liability may be pursued.