PyTorch 實現二分類問題
In [1]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
In [2]:
# Load the HR dataset from a CSV file in the notebook's working directory
# and preview the first 10 rows.
data = pd.read_csv('./HR.csv')
data.head(10)
| | satisfaction_level | last_evaluation | number_project | average_montly_hours | time_spend_company | Work_accident | left | promotion_last_5years | part | salary |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.38 | 0.53 | 2 | 157 | 3 | 0 | 1 | 0 | sales | low |
| 1 | 0.80 | 0.86 | 5 | 262 | 6 | 0 | 1 | 0 | sales | medium |
| 2 | 0.11 | 0.88 | 7 | 272 | 4 | 0 | 1 | 0 | sales | medium |
| 3 | 0.72 | 0.87 | 5 | 223 | 5 | 0 | 1 | 0 | sales | low |
| 4 | 0.37 | 0.52 | 2 | 159 | 3 | 0 | 1 | 0 | sales | low |
| 5 | 0.41 | 0.50 | 2 | 153 | 3 | 0 | 1 | 0 | sales | low |
| 6 | 0.10 | 0.77 | 6 | 247 | 4 | 0 | 1 | 0 | sales | low |
| 7 | 0.92 | 0.85 | 5 | 259 | 5 | 0 | 1 | 0 | sales | low |
| 8 | 0.89 | 1.00 | 5 | 224 | 5 | 0 | 1 | 0 | sales | low |
| 9 | 0.42 | 0.53 | 2 | 142 | 3 | 0 | 1 | 0 | sales | low |
In [3]:
# Print the per-column dtype and non-null count summary of the frame.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 14999 entries, 0 to 14998 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 satisfaction_level 14999 non-null float64 1 last_evaluation 14999 non-null float64 2 number_project 14999 non-null int64 3 average_montly_hours 14999 non-null int64 4 time_spend_company 14999 non-null int64 5 Work_accident 14999 non-null int64 6 left 14999 non-null int64 7 promotion_last_5years 14999 non-null int64 8 part 14999 non-null object 9 salary 14999 non-null object dtypes: float64(2), int64(6), object(2) memory usage: 1.1+ MB
In [4]:
# List the distinct values found in the `part` column.
data.part.unique()
Out[4]:
array(['sales', 'accounting', 'hr', 'technical', 'support', 'management',
'IT', 'product_mng', 'marketing', 'RandD'], dtype=object)
In [5]:
# One-hot encode the two text columns (`part`, `salary`): append one indicator
# column per distinct value, then remove the original text columns.
part_dummies = pd.get_dummies(data['part'])
salary_dummies = pd.get_dummies(data['salary'])
data = (
    data
    .join(part_dummies)
    .join(salary_dummies)
    .drop(columns=['part', 'salary'])
)
data
| | satisfaction_level | last_evaluation | number_project | average_montly_hours | time_spend_company | Work_accident | left | promotion_last_5years | IT | RandD | ... | hr | management | marketing | product_mng | sales | support | technical | high | low | medium |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.38 | 0.53 | 2 | 157 | 3 | 0 | 1 | 0 | False | False | ... | False | False | False | False | True | False | False | False | True | False |
| 1 | 0.80 | 0.86 | 5 | 262 | 6 | 0 | 1 | 0 | False | False | ... | False | False | False | False | True | False | False | False | False | True |
| 2 | 0.11 | 0.88 | 7 | 272 | 4 | 0 | 1 | 0 | False | False | ... | False | False | False | False | True | False | False | False | False | True |
| 3 | 0.72 | 0.87 | 5 | 223 | 5 | 0 | 1 | 0 | False | False | ... | False | False | False | False | True | False | False | False | True | False |
| 4 | 0.37 | 0.52 | 2 | 159 | 3 | 0 | 1 | 0 | False | False | ... | False | False | False | False | True | False | False | False | True | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14994 | 0.40 | 0.57 | 2 | 151 | 3 | 0 | 1 | 0 | False | False | ... | False | False | False | False | False | True | False | False | True | False |
| 14995 | 0.37 | 0.48 | 2 | 160 | 3 | 0 | 1 | 0 | False | False | ... | False | False | False | False | False | True | False | False | True | False |
| 14996 | 0.37 | 0.53 | 2 | 143 | 3 | 0 | 1 | 0 | False | False | ... | False | False | False | False | False | True | False | False | True | False |
| 14997 | 0.11 | 0.96 | 6 | 280 | 4 | 0 | 1 | 0 | False | False | ... | False | False | False | False | False | True | False | False | True | False |
| 14998 | 0.37 | 0.52 | 2 | 158 | 3 | 0 | 1 | 0 | False | False | ... | False | False | False | False | False | True | False | False | True | False |
14999 rows × 21 columns
In [6]:
# Count how many rows carry each value of the `left` column.
data.left.value_counts()
Out[6]:
left 0 11428 1 3571 Name: count, dtype: int64
In [7]:
# Target vector: the `left` column as an (N, 1) NumPy array, plus a float32
# tensor version of the same values.
Y_data = data['left'].to_numpy().reshape(-1, 1)
Y = torch.from_numpy(Y_data).float()
In [8]:
# Feature matrix: every column except `left`, cast to float64 in NumPy,
# plus a float32 tensor version of the same values.
data = data.drop(columns=['left'])
X_data = data.to_numpy().astype(np.float64)
X = torch.from_numpy(X_data).float()
In [9]:
class HRModel(nn.Module):
    """Three-layer fully connected binary classifier.

    Two hidden linear layers with SELU activations feed a single-unit output
    layer followed by a sigmoid, so the output is a value in (0, 1) per sample
    and can be passed to ``nn.BCELoss``.

    Args:
        in_features: Number of input features per sample. Defaults to 20,
            the value the notebook originally hard-coded.
        hidden: Width of both hidden layers. Defaults to 64.
    """

    def __init__(self, in_features=20, hidden=64):
        super().__init__()
        self.lin_1 = nn.Linear(in_features, hidden)
        self.lin_2 = nn.Linear(hidden, hidden)
        self.lin_3 = nn.Linear(hidden, 1)
        self.activate = nn.SELU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, input):
        """Map a ``(batch, in_features)`` tensor to ``(batch, 1)`` sigmoid outputs."""
        x = self.lin_1(input)
        x = self.activate(x)
        x = self.lin_2(x)
        x = self.activate(x)
        x = self.lin_3(x)
        return self.sigmoid(x)
In [10]:
# Seed PyTorch's global RNG before the model is built so that the random
# weight initialisation (and anything else drawing from the default torch
# generator afterwards) is repeatable across Restart & Run All.
torch.manual_seed(42)

# Optimiser learning rate.
lr = 0.001
model = HRModel()
opt = torch.optim.Adam(model.parameters(), lr=lr)
# Mini-batch size used by the data loaders.
batch_size = 64
# Number of whole batches in the full frame.
# NOTE(review): not referenced by any other cell visible in this notebook.
steps = len(data) // batch_size
# range(501) lets the "every 50 epochs" report include epoch 500.
epochs = 501
# Binary cross-entropy on probabilities; expects the model's sigmoid output.
loss_fn = nn.BCELoss()
In [11]:
# Split the NumPy feature/target arrays into train and test parts.
# - random_state pins the split so results are reproducible between runs.
# - stratify keeps the ratio of `left` values the same in both parts
#   (the labels are imbalanced, see the value_counts output).
train_x, test_x, train_y, test_y = (
    torch.from_numpy(arr).float()
    for arr in train_test_split(
        X_data,
        Y_data,
        random_state=42,
        stratify=Y_data.ravel(),
    )
)

# Training batches are reshuffled every epoch.
train_ds = TensorDataset(train_x, train_y)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

# Evaluation does not depend on batch order, so the test loader is not shuffled.
test_ds = TensorDataset(test_x, test_y)
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False)
In [12]:
def accuracy(out, yb):
    """Return the fraction of predictions that match the labels.

    Args:
        out: Tensor of predicted probabilities; values above 0.5 count as
            the positive class.
        yb: Tensor of ground-truth labels with the same number of elements
            as ``out``.

    Returns:
        The mean of the element-wise matches as a NumPy float.

    Raises:
        ValueError: If ``out`` and ``yb`` hold a different number of elements.
    """
    # detach() replaces the discouraged `.data`; cpu() keeps `.numpy()` valid
    # for tensors that live on another device.
    predicted = out.detach().cpu().numpy().reshape(-1) > 0.5
    actual = yb.detach().cpu().numpy().reshape(-1)
    # Both sides are flattened so an (N, 1) vs (N,) pair is compared
    # element-wise instead of being broadcast to an N x N matrix.
    if predicted.shape != actual.shape:
        raise ValueError(
            f"out has {predicted.size} elements but yb has {actual.size}"
        )
    return (predicted == actual).mean()
In [13]:
%%time
for epoch in range(epochs):
model.train()
for xb, yb in train_dl:
pred = model(xb)
loss = loss_fn(pred, yb)
opt.zero_grad()
loss.backward()
opt.step()
if epoch%50 == 0:
model.eval()
with torch.no_grad():
valid_loss = sum([ loss_fn(model(x), y) for x, y in test_dl ])
acc_mean = np.mean([accuracy(model(x), y) for x, y in test_dl])
print('訓練次數:', epoch, ' 損失:', valid_loss/len(test_dl), ' 準確率:', acc_mean)
訓練次數: 0 損失: tensor(0.5407) 準確率: 0.7669770294380017 訓練次數: 50 損失: tensor(0.1423) 準確率: 0.9626588983050848 訓練次數: 100 損失: tensor(0.1460) 準確率: 0.9528462310437109 訓練次數: 150 損失: tensor(0.1282) 準確率: 0.9607911462979483 訓練次數: 200 損失: tensor(0.1150) 準確率: 0.9667986173059768 訓練次數: 250 損失: tensor(0.1369) 準確率: 0.9555084745762712 訓練次數: 300 損失: tensor(0.1175) 準確率: 0.96671498661909 訓練次數: 350 損失: tensor(0.1191) 準確率: 0.9638018510258698 訓練次數: 400 損失: tensor(0.1176) 準確率: 0.964861173059768 訓練次數: 450 損失: tensor(0.1346) 準確率: 0.9592161016949152 訓練次數: 500 損失: tensor(0.1246) 準確率: 0.9661016949152542 CPU times: total: 1min 48s Wall time: 1min 38s

浙公網安備 33010602011771號