1. 配置
導入模塊。
查看代碼
#general
import io
# data
import numpy as np
import pandas as pd
# machine learning
import keras
# data visualization
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import seaborn as sns
# 博客園引流
from Keras備忘錄 import build_model
加載數據集
查看代碼
# Read the Chicago taxi training CSV from the given URL into a DataFrame.
chicago_taxi_dataset = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/chicago_taxi_train.csv")
讀取數據集
Read dataset
# Keep only the columns used in this exercise. The explicit copy makes
# training_df independent of chicago_taxi_dataset, so adding columns to it
# later does not write into a slice of the raw frame (SettingWithCopyWarning).
training_df = chicago_taxi_dataset[['TRIP_MILES', 'TRIP_SECONDS', 'FARE', 'COMPANY', 'PAYMENT_TYPE', 'TIP_RATE']].copy()
print('Read dataset completed successfully.')
print('Total number of rows: {0}\n\n'.format(len(training_df.index)))
# Preview of the first 200 rows; presumably only rendered when this is the
# last expression of a notebook cell.
training_df.head(200)
2. 訓練和預測,結果可視化
定義繪圖函數(注意:此處plot_model函數對輸入特征為二維時的繪制方法似乎是錯的,有獎請讀者斧正)
可視化plotting functions
def make_plots(df, feature_names, label_name, model_output, sample_size=200):
    """Show the loss curve next to the fitted model drawn over sampled data.

    Args:
        df: DataFrame holding the feature and label columns.
        feature_names: List of feature column names. Exactly one name gives a
            2-D scatter subplot; anything else gives a 3-D "surface" subplot.
        label_name: Name of the label column.
        model_output: Tuple ``(weights, bias, epochs, rmse)``.
        sample_size: Maximum number of rows drawn at random for plotting.
    """
    # DataFrame.sample raises when n exceeds the number of rows, so clamp it.
    rows_to_plot = min(sample_size, len(df))
    random_sample = df.sample(n=rows_to_plot).copy()
    # reset_index returns a new frame; the result must be kept to take effect.
    random_sample = random_sample.reset_index(drop=True)

    weights, bias, epochs, rmse = model_output

    is_2d_plot = len(feature_names) == 1
    model_plot_type = "scatter" if is_2d_plot else "surface"
    fig = make_subplots(rows=1, cols=2,
                        subplot_titles=("Loss Curve", "Model Plot"),
                        specs=[[{"type": "scatter"}, {"type": model_plot_type}]])

    plot_data(random_sample, feature_names, label_name, fig)
    plot_model(random_sample, feature_names, weights, bias, fig)
    plot_loss_curve(epochs, rmse, fig)

    fig.show()
def plot_loss_curve(epochs, rmse, fig):
    """Draw the RMSE-per-epoch curve into the subplot at row 1, column 1.

    Args:
        epochs: Sequence of epoch indices used as x values.
        rmse: RMSE values used as y values; must provide ``min()`` and
            ``max()`` methods, which are called to set the y-axis range.
        fig: Subplot figure that receives the trace.
    """
    curve = px.line(x=epochs, y=rmse)
    curve.update_traces(line_color='#ff0000', line_width=3)

    # add_trace replaces the deprecated append_trace and matches plot_model.
    fig.add_trace(curve.data[0], row=1, col=1)
    fig.update_xaxes(title_text="Epoch", row=1, col=1)
    # Start the y axis slightly below the best RMSE so the curve is not clipped.
    fig.update_yaxes(title_text="Root Mean Squared Error", row=1, col=1,
                     range=[rmse.min() * 0.8, rmse.max()])
def plot_data(df, features, label, fig):
    """Scatter the sampled rows into the subplot at row 1, column 2.

    Args:
        df: DataFrame with the feature and label columns.
        features: List of feature column names. One name gives a 2-D scatter
            of feature vs. label; otherwise the first two features and the
            label are drawn as a 3-D scatter.
        label: Name of the label column.
        fig: Subplot figure that receives the trace.
    """
    if len(features) == 1:
        scatter = px.scatter(df, x=features[0], y=label)
        # add_trace replaces the deprecated append_trace and matches plot_model.
        fig.add_trace(scatter.data[0], row=1, col=2)
        fig.update_xaxes(title_text=features[0], row=1, col=2)
        fig.update_yaxes(title_text=label, row=1, col=2)
    else:
        scatter = px.scatter_3d(df, x=features[0], y=features[1], z=label)
        fig.add_trace(scatter.data[0], row=1, col=2)
        fig.update_layout(scene1=dict(xaxis_title=features[0],
                                      yaxis_title=features[1],
                                      zaxis_title=label))
def plot_model(df, features, weights, bias, fig):
    """Draw the fitted linear model into the subplot at row 1, column 2.

    A ``FARE_PREDICTED`` column is added to ``df`` (in place), computed as
    ``bias[0] + sum(weights[i][0] * df[features[i]])``.

    Args:
        df: DataFrame with the feature columns; it is modified in place.
        features: List of feature column names. One name draws a line;
            otherwise a plane over the first two features is drawn.
        weights: Indexable as ``weights[i][0]`` for feature ``i``.
        bias: Indexable as ``bias[0]``.
        fig: Subplot figure that receives the trace.
    """
    df['FARE_PREDICTED'] = bias[0]
    for index, feature in enumerate(features):
        df['FARE_PREDICTED'] = df['FARE_PREDICTED'] + weights[index][0] * df[feature]

    if len(features) == 1:
        model = px.line(df, x=features[0], y='FARE_PREDICTED')
        model.update_traces(line_color='#ff0000', line_width=3)
    else:
        x_name, y_name = features[0], features[1]
        x_min, x_max = df[x_name].min(), df[x_name].max()
        y_min, y_max = df[y_name].min(), df[y_name].max()

        # Three grid points per axis: minimum, true midpoint, maximum.
        x = [x_min, (x_min + x_max) / 2, x_max]
        y = [y_min, (y_min + y_max) / 2, y_max]

        # Surface expects z[i][j] to be the height at (x[j], y[i]), so evaluate
        # the model on every grid point instead of repeating one row.
        w_x, w_y, b = weights[0][0], weights[1][0], bias[0]
        z = [[float(b + w_x * x_val + w_y * y_val) for x_val in x] for y_val in y]

        light_yellow = [[0, '#89CFF0'], [1, '#FFDB58']]
        model = go.Figure(data=go.Surface(x=x, y=y, z=z,
                                          colorscale=light_yellow))

    fig.add_trace(model.data[0], row=1, col=2)
def model_info(feature_names, label_name, model_output):
    """Build a text report of the learned weights, bias and model equation.

    Args:
        feature_names: Feature names, in the same order as the weights.
        label_name: Name of the label; used as the left side of the equation.
        model_output: Sequence whose item 0 holds the weights (read as
            ``weights[i][0]``) and item 1 holds the bias (read as ``bias[0]``).

    Returns:
        A string made of a banner, one line per weight, the bias line, a
        blank line, and the equation line.
    """
    weights, bias = model_output[0], model_output[1]

    rule = "-" * 80
    title = "|" + "MODEL INFO".center(78) + "|"
    banner = "\n".join([rule, title, rule])

    weight_lines = [
        "Weight for feature[{}]: {:.3f}\n".format(name, weights[position][0])
        for position, name in enumerate(feature_names)
    ]
    equation_terms = [
        "{:.3f} * {} + ".format(weights[position][0], name)
        for position, name in enumerate(feature_names)
    ]

    info = "".join(weight_lines) + "Bias: {:.3f}\n".format(bias[0])
    equation = label_name + " = " + "".join(equation_terms) + "{:.3f}\n".format(bias[0])

    return banner + "\n" + info + "\n" + equation
# Status message emitted after the plotting helper definitions.
print("SUCCESS: defining plotting functions complete.")
定義訓練函數
訓練
def train_model(model, df, features, label, epochs, batch_size):
    """Fit the model and return its parameters with the per-epoch RMSE.

    Args:
        model: Object providing ``fit(x, y, batch_size, epochs)`` and
            ``get_weights()``.
        df: Not used by this function.
        features: Feature values passed to ``model.fit`` as ``x``.
        label: Label values passed to ``model.fit`` as ``y``.
        epochs: Number of epochs passed to ``model.fit``.
        batch_size: Batch size passed to ``model.fit``.

    Returns:
        Tuple ``(weights, bias, epoch_list, rmse)``: items 0 and 1 of
        ``model.get_weights()``, the ``epoch`` attribute of the fit history,
        and the ``"root_mean_squared_error"`` column of the history as a
        pandas Series.
    """
    fit_history = model.fit(
        x=features,
        y=label,
        batch_size=batch_size,
        epochs=epochs,
    )

    # Learned parameters: the first array holds the weights, the second the bias.
    learned_weight = model.get_weights()[0]
    learned_bias = model.get_weights()[1]

    # Epoch indices are kept apart from the per-epoch metric values.
    epoch_list = fit_history.epoch

    # One RMSE value per epoch, to follow the progress of training.
    rmse_per_epoch = pd.DataFrame(fit_history.history)["root_mean_squared_error"]

    return learned_weight, learned_bias, epoch_list, rmse_per_epoch
定義實驗函數,也可直接作為主函數。
實驗
def run_experiment(df, feature_names, label_name, learning_rate, epochs, batch_size):
    """Build, train, report and plot a model for the given features and label.

    Args:
        df: DataFrame containing the feature and label columns.
        feature_names: List of feature column names.
        label_name: Name of the label column.
        learning_rate: Learning rate passed to ``build_model``.
        epochs: Number of training epochs.
        batch_size: Training batch size.

    Returns:
        The model object after training.
    """
    print('INFO: starting training experiment with features={} and label={}\n'.format(feature_names, label_name))

    # Pull the raw arrays out of the frame before handing them to training.
    feature_values = df.loc[:, feature_names].values
    label_values = df[label_name].values

    model = build_model(learning_rate, len(feature_names))
    training_result = train_model(model, df, feature_values, label_values, epochs, batch_size)

    print('\nSUCCESS: training experiment complete\n')
    report = model_info(feature_names, label_name, training_result)
    print('{}'.format(report))

    make_plots(df, feature_names, label_name, training_result)
    return model
3. 開展實驗
先用一種特征:
單特征
# Hyperparameters for the single-feature experiment.
learning_rate = 0.001
epochs = 10
batch_size = 50

# Input column(s) and the column the model is trained to predict.
features = ['TRIP_MILES']
label = 'FARE'

model_1 = run_experiment(
    df=training_df,
    feature_names=features,
    label_name=label,
    learning_rate=learning_rate,
    epochs=epochs,
    batch_size=batch_size,
)
此時可以修改學習率到1或原來的一成,或將批次大小改為原來的十倍,查看超參數調整的影響。
再嘗試用兩種特征'TRIP_MILES'和'TRIP_MINUTES'(注意:若使用兩種特征,則它們大小最好在同一量級,所以此處將訓練集原始的特征'TRIP_SECONDS'轉換成分鐘數作為新特征訓練),只需修改上述程序的部分:
修改部分
# Express the trip duration in minutes. assign() returns a new frame, so this
# does not write into a slice of the raw dataset.
training_df = training_df.assign(TRIP_MINUTES=training_df['TRIP_SECONDS'] / 60)
features = ['TRIP_MILES', 'TRIP_MINUTES']
# Train with both features, reusing the hyperparameters and label set for the
# single-feature run. The prediction step later in the script reads the
# trained model from the name model_2.
model_2 = run_experiment(training_df, features, label, learning_rate, epochs, batch_size)
訓練完后,采用訓練集的隨機部分樣本(非標準操作,僅供學習模型預測步驟使用)做出預測。先定義預測函數。
預測函數
def format_currency(x):
    """Return x as a dollar string with two decimal places, e.g. ``$3.14``."""
    return f"${x:.2f}"
def build_batch(df, batch_size):
    """Return ``batch_size`` randomly sampled rows of df, indexed 0..batch_size-1.

    The input frame is left untouched; the result is a separate copy.
    """
    sampled_rows = df.sample(n=batch_size).copy()
    # Positional labels let callers address rows with batch.at[i, column].
    return sampled_rows.set_index(np.arange(batch_size))
def predict_fare(model, df, features, label, batch_size=50):
    """Predict the label for a random batch and tabulate the results.

    Works for any number of features. For backward compatibility the first
    feature is reported as its raw value and every further feature is
    formatted with two decimals.

    Args:
        model: Object providing ``predict_on_batch(x=...)``; each prediction
            is read as ``result[i][0]``.
        df: DataFrame containing the feature and label columns.
        features: List of feature column names.
        label: Name of the label column.
        batch_size: Number of rows to sample and predict.

    Returns:
        DataFrame with columns ``PREDICTED_FARE``, ``OBSERVED_FARE``,
        ``L1_LOSS`` followed by one column per feature.
    """
    batch = build_batch(df, batch_size)
    predicted_values = model.predict_on_batch(x=batch.loc[:, features].values)

    data = {"PREDICTED_FARE": [], "OBSERVED_FARE": [], "L1_LOSS": []}
    for feature in features:
        data[feature] = []

    for i in range(batch_size):
        predicted = predicted_values[i][0]
        observed = batch.at[i, label]
        data["PREDICTED_FARE"].append(format_currency(predicted))
        data["OBSERVED_FARE"].append(format_currency(observed))
        data["L1_LOSS"].append(format_currency(abs(observed - predicted)))

        for position, feature in enumerate(features):
            value = batch.at[i, feature]
            if position == 0:
                data[feature].append(value)
            else:
                data[feature].append("{:.2f}".format(value))

    return pd.DataFrame(data)
def show_predictions(output):
    """Print a PREDICTIONS banner followed by the given output."""
    rule = "-" * 80
    title = "|" + "PREDICTIONS".center(78) + "|"
    print("\n".join((rule, title, rule)))
    print(output)
然后作出預測:
# Predict on a random batch of training rows and print the resulting table.
# NOTE(review): model_2 is presumably the model returned by run_experiment for
# the two-feature run — confirm it is defined before this point.
output = predict_fare(model_2, training_df, features, label)
show_predictions(output)
4. 鳴謝
Google開發者中心。
浙公網安備 33010602011771號