Mid-Term Hackathon: Tree Models
In [1]:
import pandas as pd
import numpy as np
import os; print(os.listdir())
import warnings; warnings.filterwarnings('ignore')
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
['submission_sample.csv', 'test', 'MTH.ipynb', 'MidTermHack.ipynb', 'train', '.ipynb_checkpoints', 'Untitled.ipynb']
Mid-Term Hackathon
Task
We need to determine the tumor type from the tumor's measurements and summary statistics, and we must use tree-based methods.
Loading the Data
- The feature sets differ between the files, but they all share the common columns ID and Category
- We will need to check for missing values to see exactly which data we have for each ID
- We merge the data with concat
In [2]:
# Merge our data: the five train/test part files are concatenated column-wise
train = pd.concat([pd.read_csv("./train/train" + str(i) + ".csv") for i in range(5)], axis=1)
test = pd.concat([pd.read_csv("./test/test" + str(i) + ".csv") for i in range(5)], axis=1)
train.columns
Out[2]:
Index(['ID', 'Category', 'radius_mean', 'radius_std', 'radius_max', 'texture_mean', 'texture_std', 'IT', 'Category', 'area_std', 'area_max', 'smoothness_mean', 'smoothness_std', 'smoothness_max', 'compactness_mean', 'compactness_std', 'compactness_max', 'concavity_mean', 'concavity_std', 'concavity_max', 'ID', 'Category', 'concave_points_mean', 'concave_points_std', 'concave_points_max', 'symmetry_mean', 'symmetry_std', 'symmetry_max', '1D', 'Category', 'texture_max', 'perimeter_mean', 'perimeter_std', 'perimeter_max', 'area_mean', 'ID', 'Category', 'fractal_dimension_mean', 'fractal_dimension_std', 'fractal_dimension_max'], dtype='object')
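The column listing above shows repeated Category columns and garbled ID variants ('IT', '1D') coming from the separate part files. A small sanity-check sketch (my addition, not part of the original notebook) to confirm which names are duplicated before they are dropped later:
# Count how often each column name occurs; names appearing more than once
# come from the per-file ID/Category columns
name_counts = pd.Series(train.columns).value_counts()
print(name_counts[name_counts > 1])
# Garbled ID-like variants that will be dropped before modelling
print([c for c in train.columns if c in ('ID', '1D', 'IT')])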
Missing Values
- Let's check whether there are any gaps in the data
In [3]:
# We have no missing values
print('missing train:', train.isna().sum().sum())
print('missing test:', test.isna().sum().sum())
missing train: 0
missing test: 0
Dropping Columns
- Clean up the dirty columns and define the target variable Category (int64)
In [4]:
# Drop the target and the (partly garbled) ID columns; take the first of the
# duplicated 'Category' columns as the target and cast it to int
X, y = train.drop(['Category', 'ID', '1D', 'IT'], axis=1), train['Category'].iloc[:, 0].astype(int)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X.describe()
Out[4]:
 | radius_mean | radius_std | radius_max | texture_mean | texture_std | area_std | area_max | smoothness_mean | smoothness_std | smoothness_max | ... | symmetry_std | symmetry_max | texture_max | perimeter_mean | perimeter_std | perimeter_max | area_mean | fractal_dimension_mean | fractal_dimension_std | fractal_dimension_max |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
count | 284.000000 | 284.000000 | 284.000000 | 284.000000 | 284.000000 | 284.000000 | 284.000000 | 284.000000 | 284.000000 | 284.000000 | ... | 284.000000 | 284.000000 | 284.000000 | 284.000000 | 284.000000 | 284.000000 | 284.000000 | 284.000000 | 284.000000 | 284.000000 |
mean | 14.148996 | 19.495528 | 92.026021 | 658.107394 | 0.096198 | 0.419343 | 1.251892 | 2.919105 | 42.442504 | 0.007101 | ... | 0.252722 | 0.269386 | 0.103987 | 0.087984 | 0.049109 | 0.180905 | 0.062776 | 0.114157 | 0.289535 | 0.083963 |
std | 3.571511 | 4.234565 | 24.513772 | 357.663036 | 0.014897 | 0.309239 | 0.578635 | 2.235951 | 53.575721 | 0.002999 | ... | 0.157284 | 0.204846 | 0.050364 | 0.076533 | 0.038963 | 0.027561 | 0.007116 | 0.066556 | 0.062742 | 0.018969 |
min | 6.981000 | 10.720000 | 43.790000 | 143.500000 | 0.062510 | 0.114400 | 0.360200 | 0.771400 | 7.254000 | 0.001713 | ... | 0.027290 | 0.000000 | 0.019380 | 0.000000 | 0.000000 | 0.106000 | 0.050250 | 0.000000 | 0.156500 | 0.055040 |
25% | 11.667500 | 16.947500 | 74.967500 | 416.950000 | 0.085140 | 0.238025 | 0.849125 | 1.665500 | 18.117500 | 0.005181 | ... | 0.154075 | 0.120425 | 0.066757 | 0.029555 | 0.020850 | 0.160575 | 0.057495 | 0.065430 | 0.251075 | 0.071843 |
50% | 13.505000 | 19.075000 | 87.265000 | 559.200000 | 0.095410 | 0.331250 | 1.177500 | 2.296000 | 24.610000 | 0.006464 | ... | 0.211450 | 0.230600 | 0.095160 | 0.065830 | 0.035125 | 0.178250 | 0.061735 | 0.096315 | 0.279650 | 0.079505 |
75% | 16.085000 | 21.592500 | 105.925000 | 799.100000 | 0.105400 | 0.508425 | 1.508500 | 3.363750 | 48.442500 | 0.008279 | ... | 0.330150 | 0.374775 | 0.130000 | 0.122225 | 0.070445 | 0.196400 | 0.066455 | 0.161600 | 0.314800 | 0.091895 |
max | 28.110000 | 39.280000 | 188.500000 | 2501.000000 | 0.163400 | 2.873000 | 4.885000 | 21.980000 | 542.200000 | 0.021770 | ... | 1.058000 | 1.105000 | 0.283200 | 0.426400 | 0.182300 | 0.304000 | 0.095750 | 0.291000 | 0.555800 | 0.207500 |
8 rows × 30 columns
Class Distribution of the Target
- Let's look at the distribution of the target label
In [5]:
y.value_counts()
Out[5]:
0    178
1    106
Name: Category, dtype: int64
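The classes are moderately imbalanced (178 samples of class 0 against 106 of class 1). As a side note, the splits used below neither stratify nor fix a random seed; a minimal sketch of a stratified, reproducible split (my addition, not used in the original notebook) would be:
# Keep the ~63/37 class ratio in both subsets and fix the seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                    stratify=y, random_state=42)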
EDA
- Pick the features we expect to be useful
- Then look at their correlations; the DataFrame's .bar style gives an intuitive view
In [6]:
best_features_ever = ['radius_max', 'perimeter_mean', 'perimeter_std', 'smoothness_mean',
                      'concave_points_max', 'compactness_std', 'symmetry_mean',
                      'symmetry_max', 'compactness_mean', 'concavity_std']
corr = train.loc[:, best_features_ever].corr()
# Zero out the diagonal so the bar styling highlights only cross-correlations
corr = corr * abs(np.eye(len(corr)) - 1)
corr.style\
    .format("{:.3}")\
    .bar(align='mid', color=['#d65f5f', '#5fba7d'])
Out[6]:
 | radius_max | perimeter_mean | perimeter_std | smoothness_mean | concave_points_max | compactness_std | symmetry_mean | symmetry_max | compactness_mean | concavity_std |
---|---|---|---|---|---|---|---|---|---|---
radius_max | 0.0 | 0.754 | 0.85 | 0.71 | 0.938 | 0.311 | 0.126 | 0.586 | 0.201 | -0.0481 |
perimeter_mean | 0.754 | 0.0 | 0.937 | 0.662 | 0.717 | 0.714 | 0.464 | 0.877 | 0.599 | 0.366 |
perimeter_std | 0.85 | 0.937 | 0.0 | 0.693 | 0.802 | 0.518 | 0.449 | 0.753 | 0.444 | 0.233 |
smoothness_mean | 0.71 | 0.662 | 0.693 | 0.0 | 0.744 | 0.399 | 0.108 | 0.384 | 0.305 | 0.148 |
concave_points_max | 0.938 | 0.717 | 0.802 | 0.744 | 0.0 | 0.272 | 0.195 | 0.566 | 0.164 | -0.0531 |
compactness_std | 0.311 | 0.714 | 0.518 | 0.399 | 0.272 | 0.0 | 0.243 | 0.701 | 0.875 | 0.669 |
symmetry_mean | 0.126 | 0.464 | 0.449 | 0.108 | 0.195 | 0.243 | 0.0 | 0.545 | 0.251 | 0.253 |
symmetry_max | 0.586 | 0.877 | 0.753 | 0.384 | 0.566 | 0.701 | 0.545 | 0.0 | 0.595 | 0.338 |
compactness_mean | 0.201 | 0.599 | 0.444 | 0.305 | 0.164 | 0.875 | 0.251 | 0.595 | 0.0 | 0.795 |
concavity_std | -0.0481 | 0.366 | 0.233 | 0.148 | -0.0531 | 0.669 | 0.253 | 0.338 | 0.795 | 0.0 |
In [7]:
# Pairwise scatter plots for a subset of the selected features, colored by class
data = pd.concat([train.loc[:, best_features_ever], pd.Series(y, name='y')], axis=1)
g = sns.PairGrid(data[[data.columns[1], data.columns[2], data.columns[3],
                       data.columns[4], data.columns[5], 'y']], hue='y',
                 height=3)
g = g.map_diag(sns.histplot, edgecolor='k')
g = g.map_offdiag(plt.scatter, s=20, linewidths=0.5, edgecolor='k')
In [9]:
# Class-conditional distribution of each selected feature
# (sns.distplot is deprecated in recent seaborn releases; sns.histplot/kdeplot are the modern equivalents)
import warnings; warnings.filterwarnings('ignore')
fig, ax = plt.subplots(2, 5, figsize=(12, 4))
axes = ax.ravel()
for i, feature in enumerate(best_features_ever):
    sns.distplot(data[data["y"] == 1][feature], label="1", hist_kws=dict(edgecolor=None, linewidth=1), ax=axes[i], color='r')
    sns.distplot(data[data["y"] == 0][feature], label="0", hist_kws=dict(edgecolor=None, linewidth=1), ax=axes[i])
    axes[i].legend()
plt.tight_layout()
Model: Baseline
- For the baseline model we use a decision tree with its default settings
- We split the data into training and test subsets with a 1/10 test ratio
In [31]:
# Baseline: a single decision tree with default hyperparameters on a 90/10 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
f1_score(dt.predict(X_test), y_test)
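Because the test split holds only 29 samples and no random seed is fixed, this baseline F1 varies noticeably between runs. A minimal sketch, not in the original notebook, of a steadier estimate via 10-fold cross-validation on the same data:
from sklearn.model_selection import cross_val_score

# Mean and spread of the F1 score of a default decision tree over 10 folds
cv_f1 = cross_val_score(DecisionTreeClassifier(), X, y, cv=10, scoring='f1')
print(cv_f1.mean(), cv_f1.std())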
- We will use GridSearchCV with 10 folds; from parameter_grid we determine which combination of parameters gives the best model
In [ ]:
def optimize_model(model, param_grid):
    # 10-fold grid search over param_grid on the scaled full training set;
    # prints the best CV score and parameters, returns the refitted best estimator
    grid_search = GridSearchCV(model,
                               param_grid=param_grid,
                               cv=10)
    grid_search.fit(X_scaled, y)
    print("Best Score: {}".format(grid_search.best_score_))
    print("Best params: {}".format(grid_search.best_params_))
    return grid_search.best_estimator_
Model: RandomForest
- Now we try a model that combines random subspace selection (RSS) with bootstrap aggregating (bagging): the random forest (see the short sketch below)
- Again we use GridSearchCV with 10 folds; from parameter_grid we determine which combination of parameters gives the best model
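To make the "RSS + bagging" idea concrete, here is a minimal illustrative sketch (my addition, not the notebook's method) using scikit-learn's BaggingClassifier: each tree is trained on a bootstrap sample of the rows and a random subset of the columns, which approximates what a random forest does internally (a forest re-samples features at every split rather than once per tree):
from sklearn.ensemble import BaggingClassifier

# 25 default decision trees, each fitted on a bootstrap sample of the rows
# and a random half of the features; predictions are combined by voting
bag = BaggingClassifier(n_estimators=25, max_samples=1.0, max_features=0.5,
                        bootstrap=True, random_state=0)
bag.fit(X_train, y_train)
print(f1_score(bag.predict(X_test), y_test))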
In [10]:
# Grid over the number of trees, tree depth and feature subsample size.
# Note: np.logspace(0, 2, 10) yields float depths; recent scikit-learn versions
# validate max_depth as an integer, so a cast such as .astype(int) may be needed there.
parameter_grid = {'n_estimators': [20, 25, 30],
                  'max_depth': np.logspace(0, 2, 10),
                  'max_features': list(range(1, 11))}
rf = optimize_model(RandomForestClassifier(), parameter_grid)
Best Score: 0.9614532019704433
Best params: {'max_depth': 7.742636826811269, 'max_features': 2, 'n_estimators': 30}
In [11]:
# Refit a random forest on the scaled training split
# (note: these hyperparameters differ slightly from the grid-search optimum above)
rf_test = RandomForestClassifier(**{'max_depth': 7.742636826811269,
                                    'max_features': 10, 'n_estimators': 20, 'warm_start': True})
scaler = StandardScaler()
rf_test.fit(scaler.fit_transform(X_train), y_train)
Out[11]:
RandomForestClassifier(max_depth=7.742636826811269, max_features=10, n_estimators=20, warm_start=True)
In [37]:
# Threshold the predicted probability of class 1 at 0.5 and evaluate with F1
y_pred = (rf_test.predict_proba(scaler.transform(X_test)) > 0.5).astype(int)[:, 1]
f1_score(y_pred, y_test)
Out[37]:
1.0
In [38]:
confusion_matrix(y_pred, y_test)
Out[38]:
array([[21,  0],
       [ 0,  8]])
Model: Gradient Boosting
- Next we consider gradient boosting
In [39]:
# Note: np.logspace(0.001, 1, 10) spans roughly 1.0 to 10, so every learning rate
# searched here is >= 1; np.logspace(-3, 0, 10) would cover the more usual sub-1 range
parameter_grid = {'n_estimators': [30, 50],
                  'max_depth': np.logspace(0, 2, 10),
                  'max_features': list(range(1, 11)),
                  'learning_rate': np.logspace(0.001, 1, 10)}
gb = optimize_model(GradientBoostingClassifier(),
                    parameter_grid)
Best Score: 0.9720443349753694
Best params: {'learning_rate': 1.0023052380778996, 'max_depth': 2.7825594022071245, 'max_features': 9, 'n_estimators': 50}
In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
In [41]:
# Refit gradient boosting on a fresh training split
# (max_features and n_estimators again differ slightly from the grid-search optimum)
gb_test = GradientBoostingClassifier(**{'learning_rate': 1.0023052380778996,
                                        'max_depth': 2.7825594022071245,
                                        'max_features': 7, 'n_estimators': 30})
scaler = StandardScaler()
gb_test.fit(scaler.fit_transform(X_train), y_train)
Out[41]:
GradientBoostingClassifier(learning_rate=1.0023052380778996, max_depth=2.7825594022071245, max_features=7, n_estimators=30)
In [12]:
y_pred = (gb_test.predict_proba(scaler.transform(X_test)) > 0.5).astype(int)[:, 1]
f1_score(y_pred, y_test)
Out[12]:
0.9508196721311476
In [13]:
confusion_matrix(y_pred, y_test)
Out[13]:
array([[78,  5],
       [ 1, 58]])
In [14]:
from sklearn.feature_selection import SelectKBest, chi2
In [15]:
# Univariate feature selection: score each feature with the chi-squared test
# against the target and keep the names of the 10 highest-scoring ones
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']
best_chi2_feature_names = list(featureScores.nlargest(10, 'Score')['Specs'])
In [16]:
best_chi2_feature_names
Out[16]:
['concave_points_max', 'texture_mean', 'smoothness_std', 'concave_points_std', 'radius_max', 'concavity_max', 'smoothness_mean', 'radius_mean', 'concave_points_mean', 'radius_std']
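chi2 only accepts non-negative inputs, which works here because all of these measurements are non-negative. As a hedged side note, if scaled or signed features were used instead, the selection could be wrapped with the MinMaxScaler that is imported at the top but otherwise unused; a minimal sketch:
from sklearn.pipeline import Pipeline

# Rescale every feature to [0, 1] first so chi2's non-negativity requirement
# always holds, then keep the 10 best-scoring features
selector = Pipeline([('minmax', MinMaxScaler()),
                     ('kbest', SelectKBest(score_func=chi2, k=10))])
X_selected = selector.fit_transform(X, y)
print(X_selected.shape)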
In [17]:
X_train, X_test, y_train, y_test = train_test_split(X.loc[:, best_chi2_feature_names], y, test_size=0.1)
In [18]:
gb_test = GradientBoostingClassifier(**{'learning_rate': 1.0023052380778996,
'max_depth': 2.7825594022071245,
'max_features': 7, 'n_estimators': 30})
scaler = StandardScaler()
gb_test.fit(scaler.fit_transform(X_train), y_train)
Out[18]:
GradientBoostingClassifier(learning_rate=1.0023052380778996, max_depth=2.7825594022071245, max_features=7, n_estimators=30)
In [19]:
y_pred = (gb_test.predict_proba(scaler.transform(X_test)) > 0.5).astype(int)[:, 1]
f1_score(y_pred, y_test)
Out[19]:
0.9473684210526316
In [20]:
confusion_matrix(y_pred, y_test)
Out[20]:
array([[19,  1],
       [ 0,  9]])
feature_importances_
Using the feature importances reported by the gradient boosting model
In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
In [22]:
gb_test = GradientBoostingClassifier(**{'learning_rate': 1.0023052380778996,
'max_depth': 2.7825594022071245,
'max_features': 7, 'n_estimators': 30})
scaler = StandardScaler()
gb_test.fit(scaler.fit_transform(X_train), y_train)
Out[22]:
GradientBoostingClassifier(learning_rate=1.0023052380778996, max_depth=2.7825594022071245, max_features=7, n_estimators=30)
In [23]:
# Rank the features by the fitted model's impurity-based importances and keep the top 10
feat_importances = pd.Series(gb_test.feature_importances_, index=X.columns)
best_tree_feature_names = list(feat_importances.nlargest(10).index)
In [24]:
best_tree_feature_names
Out[24]:
['concave_points_std', 'perimeter_std', 'area_max', 'smoothness_std', 'symmetry_max', 'concavity_max', 'radius_std', 'fractal_dimension_max', 'symmetry_std', 'radius_max']
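For a quick visual check of how strongly these ten features dominate, a small plotting sketch (my addition, not part of the original notebook):
# Horizontal bar chart of the ten largest impurity-based importances
feat_importances.nlargest(10).sort_values().plot(kind='barh', figsize=(6, 4))
plt.xlabel('importance')
plt.tight_layout()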
In [25]:
# Note: a 50% hold-out is used here, unlike the 10% splits earlier
X_train, X_test, y_train, y_test = train_test_split(X.loc[:, best_tree_feature_names], y, test_size=0.5)
In [26]:
gb_test = GradientBoostingClassifier(**{'learning_rate': 1.0023052380778996,
'max_depth': 2.7825594022071245,
'max_features': 7, 'n_estimators': 30})
scaler = StandardScaler()
gb_test.fit(scaler.fit_transform(X_train), y_train)
Out[26]:
GradientBoostingClassifier(learning_rate=1.0023052380778996, max_depth=2.7825594022071245, max_features=7, n_estimators=30)
In [27]:
y_pred = (gb_test.predict_proba(scaler.transform(X_test)) > 0.5).astype(int)[:, 1]
f1_score(y_pred, y_test)
Out[27]:
0.923076923076923
In [28]:
confusion_matrix(y_pred, y_test)
Out[28]:
array([[86,  5],
       [ 3, 48]])