#25编程题
# 题目:基于 Scikit-learn 实现乳腺癌数据集预测 pipeline
# 要求:补全代码,完成从加载数据、划分数据集、数据归一化到模型网格搜索寻找最优 $K$ 值的完整流程。
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
# 1. Load the breast cancer dataset (feature matrix X, label vector y).
data = load_breast_cancer()
X, y = data.data, data.target

# 2. Split the data: 20% is held out for testing. Shuffling is disabled on
#    purpose so the split matches the example in the reference material.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

# 3. Standardize the features (mean/variance normalization).
scaler = StandardScaler()
# Compute mean and standard deviation on the training set only, then
# transform the training set with them.
X_train_scaled = scaler.fit_transform(X_train)
# Transform the test set with the SAME training-set statistics; calling
# fit_transform here would let test data influence the preprocessing.
X_test_scaled = scaler.transform(X_test)

# 4. Grid-search space: K from 1 to 10, weights in {uniform, distance}.
param_grid = [
    {
        'n_neighbors': list(range(1, 11)),
        'weights': ['uniform', 'distance'],
    }
]

# 5. Run the grid search with 5-fold cross-validation over a
#    KNeighborsClassifier, fitting on the standardized training data.
knn_clf = KNeighborsClassifier()
grid_search = GridSearchCV(estimator=knn_clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

# 6. Report the best hyper-parameters, the best cross-validation score,
#    and the score of the selected model on the held-out test set.
print("最优参数:", grid_search.best_params_)
print("最优得分:", grid_search.best_score_)
print("测试集准确率:", grid_search.score(X_test_scaled, y_test))