
from collections import defaultdict

# Create the two feature columns, initially all False.
dataset["HomeLastWin"] = False
dataset["VisitorLastWin"] = False

# Maps a team name to whether that team won the most recent game seen so far.
# defaultdict(int) yields 0 (falsy) for a team that has not appeared yet.
won_last = defaultdict(int)
for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    # Record the state *before* this game so the feature never leaks the
    # current result. `DataFrame.ix` was removed in pandas 1.0, so write the
    # individual cells with `.at`; bool() keeps the column dtype boolean.
    dataset.at[index, "HomeLastWin"] = bool(won_last[home_team])
    dataset.at[index, "VisitorLastWin"] = bool(won_last[visitor_team])
    # Update the tracker with the outcome of the current game.
    won_last[home_team] = row["HomeWin"]
    won_last[visitor_team] = not row["HomeWin"]
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Feature matrix built from the two "won last game" columns.
X_previouswins = dataset[["HomeLastWin", "VisitorLastWin"]].values

# Cross-validate a decision tree on those features and report mean accuracy.
# `y_true` and `np` are expected to be defined earlier in the file.
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(
    clf,
    X_previouswins,
    y_true,
    scoring="accuracy",
)
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
2 根据上个赛季的比赛情况:这里需要用到上个赛季的数据,数据的爬取方式已在上一篇中着重介绍。
根据主客场两支球队上次交锋的胜负情况进行比较,将此作为一个特征:
# Maps an alphabetically sorted (team, team) pair to the winner of the most
# recent game between those two teams. defaultdict(int) yields 0 for a pair
# that has not met yet, which never equals a team name.
last_match_winner = defaultdict(int)
dataset["HomeTeamWonLast"] = 0
for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    # Sort the names so the key is the same regardless of which side is the
    # home team; this avoids duplicate entries for the same pairing.
    teams = tuple(sorted([home_team, visitor_team]))
    # `DataFrame.ix` was removed in pandas 1.0; set the single cell with `.at`.
    dataset.at[index, "HomeTeamWonLast"] = (
        1 if last_match_winner[teams] == home_team else 0
    )
    # Remember who won this game for the next meeting of the two teams.
    winner = home_team if row["HomeWin"] else visitor_team
    last_match_winner[teams] = winner
# Cross-validate a decision tree on the ranking and last-meeting features.
# NOTE(review): the "HomeTeamRanksHigher" column is not created in this
# excerpt; presumably it is added earlier in the file.
clf = DecisionTreeClassifier(random_state=14)
X_lastwinner = dataset[["HomeTeamRanksHigher", "HomeTeamWonLast"]].values
scores = cross_val_score(
    clf,
    X_lastwinner,
    y_true,
    scoring='accuracy',
)
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
为了防止决策树出现过拟合,还可以采用随机森林,它能够减小方差。
这里方差的大小与数据集(训练集)的划分有关;而偏差(偏误)与训练集无关,与算法本身有关:例如,算法事先假设数据服从正态分布,当这一假设不成立时,就会导致较高的误差。
from sklearn.ensemble import RandomForestClassifier

# Evaluate a random forest on the X_teams feature matrix (X_teams, y_true and
# np are defined outside this excerpt).
# The forest must be bound to `clf`, the name passed to cross_val_score;
# binding it to `cls` left the previous decision tree being scored instead.
clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

# Repeat with X_homehigher and X_teams stacked column-wise.
X_all = np.hstack([X_homehigher, X_teams])
clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_all, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
还可以通过GridSearchCV选取最佳的参数
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid searched for the random forest.
# 'sqrt' replaces 'auto': for classifiers the two are equivalent, and 'auto'
# was removed in scikit-learn 1.3.
parameter_space = {
    "max_features": [2, 10, 'sqrt'],
    "n_estimators": [100],
    "criterion": ['gini', 'entropy'],
    'min_samples_leaf': [2, 4, 6],
}
clf = RandomForestClassifier(random_state=14)
grid = GridSearchCV(clf, parameter_space)
grid.fit(X_all, y_true)
# best_score_ is the mean cross-validated score of the best parameter set.
print("Accuracy: {0:.1f}%".format(grid.best_score_ * 100))
print(grid.best_estimator_)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='entropy', max_depth=None, max_features=2,
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=6, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=14, verbose=0,
warm_start=False)
以上内容参考 Robert Layton 的《Python数据挖掘入门与实践》,有兴趣可以看看这本书。

