ml4 13 Completeness
In [2]:
import matplotlib.pyplot as plt
from sklearn import datasets
import numpy as np
from itertools import cycle, islice

n_samples = 1500

# two isotropic blobs
dataset = datasets.make_blobs(n_samples=n_samples, centers=2, center_box=(-7.0, 7.5),
                              cluster_std=[1.4, 1.7],
                              random_state=42)

# one more blob, stretched by a linear transformation (anisotropic cluster)
X_2, _ = datasets.make_blobs(n_samples=n_samples, random_state=170, centers=[[-4, -3]], cluster_std=[1.9])
transformation = [[1.2, -0.8], [-0.4, 1.7]]
X_2 = np.dot(X_2, transformation)

# combine the samples and assign label 2 to the transformed blob
X, y = np.concatenate((dataset[0], X_2)), np.concatenate((dataset[1], np.array([2] * len(X_2))))
In [3]:
# Visualize the data, colouring points by the predicted cluster labels
# (expects X and y_pred to be defined in the global scope)
def plot_scatter():
    plt.rcParams['figure.figsize'] = 3, 3
    colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a',
                                         '#f781bf', '#a65628', '#984ea3',
                                         '#999999', '#e41a1c', '#dede00']),
                                  int(max(y_pred) + 1))))
    plt.scatter(X[:, 0], X[:, 1], s=10, ec='k', alpha=0.25, color=colors[y_pred])
Clustering Example
Completeness can be computed using the implementation from the sklearn library.
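Before calling the library function, it helps to recall what the metric measures: completeness is 1 when all members of a given true class end up in the same cluster, and is defined as 1 − H(K|C)/H(K), where C are the true classes and K the predicted clusters. The sketch below (the helper name completeness_manual is ours, an illustrative re-derivation rather than the sklearn source) computes the value from the contingency table and should agree with sklearn.metrics.completeness_score up to floating-point error.

import numpy as np

def completeness_manual(labels_true, labels_pred):
    # illustrative re-implementation of completeness = 1 - H(K|C) / H(K)
    labels_true = np.asarray(labels_true)
    labels_pred = np.asarray(labels_pred)
    n = len(labels_true)
    classes, c_idx = np.unique(labels_true, return_inverse=True)
    clusters, k_idx = np.unique(labels_pred, return_inverse=True)
    # contingency table: rows = true classes, columns = predicted clusters
    cont = np.zeros((len(classes), len(clusters)))
    np.add.at(cont, (c_idx, k_idx), 1)
    p_ck = cont / n                        # joint distribution P(C, K)
    p_c = p_ck.sum(axis=1, keepdims=True)  # P(C)
    p_k = p_ck.sum(axis=0)                 # P(K)
    # H(K): entropy of the cluster assignment
    h_k = -np.sum(p_k[p_k > 0] * np.log(p_k[p_k > 0]))
    # H(K|C): conditional entropy of clusters given classes
    nz = p_ck > 0
    h_k_given_c = -np.sum(p_ck[nz] * np.log((p_ck / p_c)[nz]))
    return 1.0 if h_k == 0 else 1.0 - h_k_given_c / h_k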
In [4]:
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import completeness_score

# first, obtain the predicted clusters with a clustering algorithm
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X)
y_pred = kmeans.labels_

# now compute the completeness
completeness_score(labels_true=y, labels_pred=y_pred)
Out[4]:
0.7859676398774584
In [5]:
plot_scatter()
In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture

# standardize the features before fitting the mixture model
model = StandardScaler()
X = model.fit_transform(X)

# Gaussian Mixture
em_gm = GaussianMixture(n_components=3,
                        random_state=42)
em_gm.fit(X)
y_pred = em_gm.predict(X)

completeness = completeness_score(labels_true=y,
                                  labels_pred=y_pred)
round(completeness, 3)
Out[6]:
0.933
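Completeness is usually read together with homogeneity and their harmonic mean, the V-measure. As a side note (assuming y and y_pred from the cell above are still in scope), sklearn can return all three at once via homogeneity_completeness_v_measure:

from sklearn.metrics import homogeneity_completeness_v_measure

# returns (homogeneity, completeness, v_measure) for the same pair of labelings
hom, com, v = homogeneity_completeness_v_measure(y, y_pred)
print(f'homogeneity={hom:.3f}, completeness={com:.3f}, v_measure={v:.3f}')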
Task 4.13.2
Compare the clustering results of the K-means, GaussianMixture, AgglomerativeClustering and DBSCAN algorithms on the original dataset using the completeness score. Initialize the algorithms with the following parameters:
K-means – n_clusters=3, random_state=42
GaussianMixture – n_components=3, random_state=42
AgglomerativeClustering – n_clusters=3
DBSCAN – eps=0.9, min_samples=35
In [7]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# X was already standardized above; re-applying the scaler leaves it unchanged
model = StandardScaler()
X = model.fit_transform(X)

# K-Means
kmeans = KMeans(n_clusters=3,
                random_state=42)
kmeans.fit(X)
y_pred = kmeans.labels_
completeness = completeness_score(labels_true=y,
                                  labels_pred=y_pred)
print('kmeans', completeness)

# Gaussian Mixture
em_gm = GaussianMixture(n_components=3,
                        random_state=42)
em_gm.fit(X)
y_pred = em_gm.predict(X)
completeness = completeness_score(labels_true=y,
                                  labels_pred=y_pred)
print('gmm', completeness)

# Agglomerative Clustering
ac = AgglomerativeClustering(n_clusters=3)
ac.fit(X)
y_pred = ac.labels_.astype(int)  # np.int is removed in recent NumPy versions
completeness = completeness_score(labels_true=y,
                                  labels_pred=y_pred)
print('agglomerative', completeness)

# DBSCAN
dbscan = DBSCAN(eps=0.9, min_samples=35)
dbscan.fit(X)
y_pred = dbscan.labels_.astype(int)
completeness = completeness_score(labels_true=y,
                                  labels_pred=y_pred)
print('dbscan', completeness)
kmeans 0.7828224102025245
gmm 0.9325740421656737
agglomerative 0.9058386997451113
dbscan 0.08342237034907717
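The four blocks above differ only in the estimator being fitted, so the comparison can also be written as a loop over a dictionary of models. A minimal sketch (the dict name models is ours); note that GaussianMixture exposes no labels_ attribute, so its cluster labels come from predict(X):

models = {
    'kmeans': KMeans(n_clusters=3, random_state=42),
    'gmm': GaussianMixture(n_components=3, random_state=42),
    'agglomerative': AgglomerativeClustering(n_clusters=3),
    'dbscan': DBSCAN(eps=0.9, min_samples=35),
}

for name, model in models.items():
    model.fit(X)
    # fall back to predict() for estimators without a labels_ attribute
    labels = model.labels_ if hasattr(model, 'labels_') else model.predict(X)
    print(name, round(completeness_score(labels_true=y, labels_pred=labels), 3))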