{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Environment Setup and Utils"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# # %%bash\n",
"# rm -rf /kaggle/working/nejm-brain-to-text/\n",
"# git clone https://github.com/ZH-CEN/nejm-brain-to-text.git\n",
"# cp /kaggle/input/brain-to-text-baseline-model/t15_copyTask.pkl /kaggle/working/nejm-brain-to-text/data/t15_copyTask.pkl\n",
"\n",
"# ln -s /kaggle/input/brain-to-text-25/t15_pretrained_rnn_baseline/t15_pretrained_rnn_baseline /kaggle/working/nejm-brain-to-text/data\n",
"# ln -s /kaggle/input/brain-to-text-25/t15_copyTask_neuralData/hdf5_data_final /kaggle/working/nejm-brain-to-text/data\n",
"# ln -s /kaggle/input/rnn-pretagged-data /kaggle/working/nejm-brain-to-text/data/concatenated_data\n",
"\n",
"# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126\n",
"\n",
"# pip install \\\n",
"#     jupyter==1.1.1 \\\n",
"#     \"numpy>=1.26.0,<2.1.0\" \\\n",
"#     pandas==2.3.0 \\\n",
"#     matplotlib==3.10.1 \\\n",
"#     scipy==1.15.2 \\\n",
"#     scikit-learn==1.6.1 \\\n",
"#     lightgbm==4.3.0 \\\n",
"#     tqdm==4.67.1 \\\n",
"#     g2p_en==2.1.0 \\\n",
"#     h5py==3.13.0 \\\n",
"#     omegaconf==2.3.0 \\\n",
"#     editdistance==0.8.1 \\\n",
"#     huggingface-hub==0.33.1 \\\n",
"#     transformers==4.53.0 \\\n",
"#     tokenizers==0.21.2 \\\n",
"#     accelerate==1.8.1 \\\n",
"#     bitsandbytes==0.46.0 \\\n",
"#     seaborn==0.13.2\n",
"# cd /kaggle/working/nejm-brain-to-text/\n",
"# pip install -e ."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"==================================================\n",
"🔧 LightGBM GPU environment check\n",
"==================================================\n",
"❌ No NVIDIA GPU or driver detected\n",
"\n",
"❌ CUDA toolkit not installed\n"
]
}
],
"source": [
"# 🚀 LightGBM GPU support check and configuration\n",
"\n",
"print(\"=\"*50)\n",
"print(\"🔧 LightGBM GPU environment check\")\n",
"print(\"=\"*50)\n",
"\n",
"# Check CUDA and the GPU driver\n",
"import subprocess\n",
"import sys\n",
"\n",
"def run_command(command):\n",
"    \"\"\"Run a shell command and return (stdout, success).\"\"\"\n",
"    try:\n",
"        result = subprocess.run(command, shell=True, capture_output=True, text=True, timeout=10)\n",
"        return result.stdout.strip(), result.returncode == 0\n",
"    except Exception as e:\n",
"        return str(e), False\n",
"\n",
"# Check for an NVIDIA GPU\n",
"nvidia_output, nvidia_success = run_command(\"nvidia-smi --query-gpu=name,memory.total,driver_version --format=csv,noheader,nounits\")\n",
"if nvidia_success:\n",
"    print(\"✅ NVIDIA GPU detected:\")\n",
"    for line in nvidia_output.split('\\n'):\n",
"        if line.strip():\n",
"            print(f\"   {line}\")\n",
"else:\n",
"    print(\"❌ No NVIDIA GPU or driver detected\")\n",
"\n",
"# Check the CUDA version\n",
"cuda_output, cuda_success = run_command(\"nvcc --version\")\n",
"if cuda_success:\n",
"    print(\"\\n✅ CUDA toolkit:\")\n",
"    # Extract the CUDA release line\n",
"    for line in cuda_output.split('\\n'):\n",
"        if 'release' in line:\n",
"            print(f\"   {line.strip()}\")\n",
"else:\n",
"    print(\"\\n❌ CUDA toolkit not installed\")\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# %cd /kaggle/working/nejm-brain-to-text\n",
"import numpy as np\n",
"import os\n",
"import pickle\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib\n",
"from g2p_en import G2p\n",
"import pandas as pd\n",
"from nejm_b2txt_utils.general_utils import *\n",
"matplotlib.rcParams['pdf.fonttype'] = 42\n",
"matplotlib.rcParams['ps.fonttype'] = 42\n",
"matplotlib.rcParams['font.family'] = 'sans-serif'\n",
"matplotlib.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans', 'Arial Unicode MS', 'sans-serif']\n",
"matplotlib.rcParams['axes.unicode_minus'] = False\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\IPython\\core\\magics\\osm.py:417: UserWarning: This is now an optional IPython functionality, setting dhist requires you to install the `pickleshare` library.\n",
"  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"f:\\BRAIN-TO-TEXT\\nejm-brain-to-text\\model_training\n"
]
}
],
"source": [
"%cd ../model_training/\n",
"from data_augmentations import gauss_smooth"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"LOGIT_TO_PHONEME = [\n",
"    'BLANK',\n",
"    'AA', 'AE', 'AH', 'AO', 'AW',\n",
"    'AY', 'B', 'CH', 'D', 'DH',\n",
"    'EH', 'ER', 'EY', 'F', 'G',\n",
"    'HH', 'IH', 'IY', 'JH', 'K',\n",
"    'L', 'M', 'N', 'NG', 'OW',\n",
"    'OY', 'P', 'R', 'S', 'SH',\n",
"    'T', 'TH', 'UH', 'UW', 'V',\n",
"    'W', 'Y', 'Z', 'ZH',\n",
"    ' | ',\n",
"]\n",
"# Global configuration\n",
"BALANCE_CONFIG = {\n",
"    'enable_balance': True,         # whether to apply data balancing\n",
"    'undersample_labels': [0, 40],  # labels to undersample (high-frequency labels such as BLANK)\n",
"    'oversample_threshold': 0.5,    # oversampling threshold (fraction of the mean count)\n",
"    'random_state': 42              # random seed\n",
"}\n",
"# Global PCA configuration\n",
"PCA_CONFIG = {\n",
"    'enable_pca': True,          # whether to apply PCA\n",
"    'n_components': None,        # None = choose automatically, or set an explicit number\n",
"    'variance_threshold': 0.95,  # keep 95% of the variance\n",
"    'sample_size': 15000,        # number of samples used to fit the PCA\n",
"}\n",
"\n",
"# Global PCA objects (fitted exactly once)\n",
"GLOBAL_PCA = {\n",
"    'scaler': None,\n",
"    'pca': None,\n",
"    'is_fitted': False,\n",
"    'n_components': None\n",
"}\n",
"# Data directory and parameters [PCA initialization]\n",
"data_dir = '../data/concatenated_data'\n",
"MAX_SAMPLES_PER_FILE = -1  # max samples per file; -1 means no limit"
]
},
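{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check for the label mapping above: the sketch below (an illustrative addition, not part of the original pipeline) greedily decodes a `(T, 41)` logit matrix into a phoneme string with `LOGIT_TO_PHONEME`, collapsing repeats and dropping blanks in CTC fashion. The random matrix is only a stand-in for real RNN logits."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch: greedy CTC-style decoding of RNN logits.\n",
"# Assumes index 0 is the CTC blank, as in LOGIT_TO_PHONEME above.\n",
"import numpy as np\n",
"\n",
"def greedy_ctc_decode(logits):\n",
"    \"\"\"Collapse repeats, drop blanks, and map indices to phonemes.\"\"\"\n",
"    ids = logits.argmax(axis=1)\n",
"    decoded, prev = [], -1\n",
"    for i in ids:\n",
"        if i != prev and i != 0:  # skip repeated frames and BLANK\n",
"            decoded.append(LOGIT_TO_PHONEME[i])\n",
"        prev = i\n",
"    return ' '.join(decoded)\n",
"\n",
"demo_logits = np.random.randn(12, 41)  # fake logits, just to exercise the function\n",
"print(greedy_ctc_decode(demo_logits))"
]
},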
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Loading Workflow"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2️⃣ Data Loading and PCA Dimensionality Reduction"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# 🚀 Memory-friendly data loading - batched loading strategy + PCA [sampling is still missing here]\n",
"\n",
"import os\n",
"import numpy as np\n",
"import gc\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.preprocessing import StandardScaler\n",
"import joblib\n",
"import matplotlib.pyplot as plt\n",
"\n",
"\n",
"def load_data_batch(data_dir, data_type, max_samples_per_file=5000):\n",
"    \"\"\"\n",
"    Load data of the given split in batches.\n",
"    \n",
"    Args:\n",
"        data_dir: data directory\n",
"        data_type: 'train', 'val', or 'test'\n",
"        max_samples_per_file: max samples to load per file (-1 = no limit)\n",
"    \n",
"    Returns:\n",
"        generator over (trials, filename) batches\n",
"    \"\"\"\n",
"    files = [f for f in os.listdir(data_dir) if f.endswith('.npz') and data_type in f]\n",
"    \n",
"    for file_idx, f in enumerate(files):\n",
"        print(f\"   Loading file {file_idx+1}/{len(files)}: {f}\")\n",
"        \n",
"        data = np.load(os.path.join(data_dir, f), allow_pickle=True)\n",
"        trials = data['neural_logits_concatenated']\n",
"        \n",
"        # Cap the number of samples per file\n",
"        if len(trials) > max_samples_per_file and max_samples_per_file != -1:\n",
"            trials = trials[:max_samples_per_file]\n",
"            print(f\"      Capped sample count at: {max_samples_per_file}\")\n",
"        \n",
"        yield trials, f\n",
"        \n",
"        # Free memory\n",
"        del data, trials\n",
"        gc.collect()\n",
"\n",
"def extract_features_labels_batch(trials_batch):\n",
"    \"\"\"\n",
"    Extract features and labels from a batch of trials.\n",
"    \"\"\"\n",
"    features = []\n",
"    labels = []\n",
"    \n",
"    for trial in trials_batch:\n",
"        if trial.shape[0] > 0:\n",
"            for t in range(trial.shape[0]):\n",
"                neural_features = trial[t, :7168]  # first 7168 dims: neural features\n",
"                rnn_logits = trial[t, 7168:]       # last 41 dims: RNN output logits\n",
"                phoneme_label = np.argmax(rnn_logits)\n",
"                features.append(neural_features)\n",
"                labels.append(phoneme_label)\n",
"    \n",
"    return np.array(features), np.array(labels)\n",
"\n",
"def fit_global_pca(data_dir, config):\n",
"    \"\"\"\n",
"    Fit the global PCA on training data (runs only once).\n",
"    \"\"\"\n",
"    if GLOBAL_PCA['is_fitted'] or not config['enable_pca']:\n",
"        print(\"🔧 PCA already fitted or disabled; skipping fit\")\n",
"        return\n",
"    \n",
"    print(f\"\\n🔧 Fitting the global PCA reducer...\")\n",
"    print(f\"   Config: {config}\")\n",
"    \n",
"    # Collect training samples\n",
"    sample_features = []\n",
"    collected_samples = 0\n",
"    \n",
"    for trials_batch, filename in load_data_batch(data_dir, 'train', 5000):\n",
"        features, labels = extract_features_labels_batch(trials_batch)\n",
"        sample_features.append(features)\n",
"        collected_samples += features.shape[0]\n",
"        \n",
"        if collected_samples >= config['sample_size']:\n",
"            break\n",
"    \n",
"    if sample_features:\n",
"        # Merge the sampled data\n",
"        X_sample = np.vstack(sample_features)[:config['sample_size']]\n",
"        print(f\"   Actual sample count: {X_sample.shape[0]}\")\n",
"        print(f\"   Original feature count: {X_sample.shape[1]}\")\n",
"        \n",
"        # Standardize\n",
"        GLOBAL_PCA['scaler'] = StandardScaler()\n",
"        X_sample_scaled = GLOBAL_PCA['scaler'].fit_transform(X_sample)\n",
"        \n",
"        # Choose the number of PCA components\n",
"        if config['n_components'] is None:\n",
"            print(f\"   🔍 Selecting the number of PCA components automatically...\")\n",
"            pca_full = PCA()\n",
"            pca_full.fit(X_sample_scaled)\n",
"            \n",
"            cumsum_var = np.cumsum(pca_full.explained_variance_ratio_)\n",
"            optimal_components = np.argmax(cumsum_var >= config['variance_threshold']) + 1\n",
"            GLOBAL_PCA['n_components'] = min(optimal_components, X_sample.shape[1])\n",
"            \n",
"            print(f\"   Keeping {config['variance_threshold']*100}% of the variance needs: {optimal_components} components\")\n",
"            print(f\"   Selected component count: {GLOBAL_PCA['n_components']}\")\n",
"        else:\n",
"            GLOBAL_PCA['n_components'] = config['n_components']\n",
"            print(f\"   Using the specified component count: {GLOBAL_PCA['n_components']}\")\n",
"        \n",
"        # Fit the final PCA\n",
"        GLOBAL_PCA['pca'] = PCA(n_components=GLOBAL_PCA['n_components'], random_state=42)\n",
"        GLOBAL_PCA['pca'].fit(X_sample_scaled)\n",
"        GLOBAL_PCA['is_fitted'] = True\n",
"        \n",
"        # Save the model\n",
"        pca_path = \"global_pca_model.joblib\"\n",
"        joblib.dump({\n",
"            'scaler': GLOBAL_PCA['scaler'], \n",
"            'pca': GLOBAL_PCA['pca'],\n",
"            'n_components': GLOBAL_PCA['n_components']\n",
"        }, pca_path)\n",
"        \n",
"        print(f\"   ✅ Global PCA fit complete!\")\n",
"        print(f\"      Dimensionality: {X_sample.shape[1]} → {GLOBAL_PCA['n_components']}\")\n",
"        print(f\"      Reduction ratio: {GLOBAL_PCA['n_components']/X_sample.shape[1]:.2%}\")\n",
"        print(f\"      Retained variance: {GLOBAL_PCA['pca'].explained_variance_ratio_.sum():.4f}\")\n",
"        print(f\"      Model saved to: {pca_path}\")\n",
"        \n",
"        # Free the sampled data\n",
"        del sample_features, X_sample, X_sample_scaled\n",
"        gc.collect()\n",
"    else:\n",
"        print(\"❌ Could not collect samples for the PCA fit\")\n",
"\n",
"def apply_pca_transform(features):\n",
"    \"\"\"\n",
"    Apply the global PCA transform.\n",
"    \"\"\"\n",
"    if not PCA_CONFIG['enable_pca'] or not GLOBAL_PCA['is_fitted']:\n",
"        return features\n",
"    \n",
"    # Standardize + PCA transform\n",
"    features_scaled = GLOBAL_PCA['scaler'].transform(features)\n",
"    features_pca = GLOBAL_PCA['pca'].transform(features_scaled)\n",
"    return features_pca\n",
"\n"
]
},
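{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal usage sketch of the loader (an illustrative addition, assuming the `*_concatenated.npz` files referenced throughout are present under `data_dir`): pull one batch from the generator, split it into features and labels, and confirm the 7168-dim neural features / 41-dim logits layout."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch: inspect one batch from the loader.\n",
"# Assumes data_dir points at the *_concatenated.npz files used throughout.\n",
"for trials_batch, filename in load_data_batch(data_dir, 'val', max_samples_per_file=100):\n",
"    X_batch, y_batch = extract_features_labels_batch(trials_batch)\n",
"    print(filename, X_batch.shape, y_batch.shape)  # features are 7168-dim before PCA\n",
"    break  # one batch is enough for a sanity check"
]
},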
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 📊 Data Balancing Strategy - Label Distribution Analysis and Sampling Optimization"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# [Core sampling implementation]\n",
"from collections import Counter\n",
"import random\n",
"import numpy as np\n",
"from sklearn.utils import resample\n",
"\n",
"def balance_dataset(X, y, config=BALANCE_CONFIG):\n",
"    \"\"\"\n",
"    Balance the dataset: undersampling + oversampling.\n",
"    \n",
"    Args:\n",
"        X: feature data\n",
"        y: label data\n",
"        config: balancing configuration\n",
"    \n",
"    Returns:\n",
"        X_balanced, y_balanced: the balanced data\n",
"    \"\"\"\n",
"    if not config['enable_balance']:\n",
"        print(\"🔕 Data balancing disabled; returning the original data\")\n",
"        return X, y\n",
"    \n",
"    print(f\"\\n⚖️ Starting data balancing...\")\n",
"    print(f\"   Original data: {X.shape[0]:,} samples\")\n",
"    \n",
"    # Analyze the current distribution (mean over labels 1-39 only)\n",
"    label_counts = Counter(y)\n",
"    counts_exclude_0_40 = [label_counts.get(i, 0) for i in range(1, 40)]  # labels 1-39\n",
"    mean_count = np.mean(counts_exclude_0_40)  # mean over labels 1-39 only\n",
"    \n",
"    print(f\"   Mean sample count (labels 1-39): {mean_count:.0f}\")\n",
"    print(f\"   Labels to undersample: {config['undersample_labels']}\")\n",
"    print(f\"   Oversampling threshold: {config['oversample_threshold']} * mean\")\n",
"    \n",
"    # Accumulators for the balanced data\n",
"    X_balanced = []\n",
"    y_balanced = []\n",
"    \n",
"    random.seed(config['random_state'])\n",
"    np.random.seed(config['random_state'])\n",
"    \n",
"    for label in range(41):\n",
"        # All samples with the current label\n",
"        label_mask = (y == label)\n",
"        X_label = X[label_mask]\n",
"        y_label = y[label_mask]\n",
"        current_count = len(y_label)\n",
"        \n",
"        if current_count == 0:\n",
"            continue\n",
"        \n",
"        # Decide the sampling strategy\n",
"        if label in config['undersample_labels']:\n",
"            # Undersample down to the mean\n",
"            target_count = int(mean_count)\n",
"            if current_count > target_count:\n",
"                # Undersample\n",
"                indices = np.random.choice(current_count, target_count, replace=False)\n",
"                X_resampled = X_label[indices]\n",
"                y_resampled = y_label[indices]\n",
"                print(f\"   📉 Label {label}: {current_count} → {target_count} (undersampled)\")\n",
"            else:\n",
"                X_resampled = X_label\n",
"                y_resampled = y_label\n",
"                print(f\"   ➡️ Label {label}: {current_count} (no undersampling needed)\")\n",
"        \n",
"        elif current_count < mean_count * config['oversample_threshold']:\n",
"            # Oversample up to the threshold\n",
"            target_count = int(mean_count * config['oversample_threshold'])\n",
"            if current_count < target_count:\n",
"                # Oversample\n",
"                X_resampled, y_resampled = resample(\n",
"                    X_label, y_label, \n",
"                    n_samples=target_count, \n",
"                    random_state=config['random_state']\n",
"                )\n",
"                print(f\"   📈 Label {label}: {current_count} → {target_count} (oversampled)\")\n",
"            else:\n",
"                X_resampled = X_label\n",
"                y_resampled = y_label\n",
"                print(f\"   ➡️ Label {label}: {current_count} (no oversampling needed)\")\n",
"        else:\n",
"            # Leave unchanged\n",
"            X_resampled = X_label\n",
"            y_resampled = y_label\n",
"            print(f\"   ✅ Label {label}: {current_count} (already balanced)\")\n",
"        \n",
"        X_balanced.append(X_resampled)\n",
"        y_balanced.append(y_resampled)\n",
"    \n",
"    # Merge all balanced data\n",
"    X_balanced = np.vstack(X_balanced)\n",
"    y_balanced = np.hstack(y_balanced)\n",
"    \n",
"    # Shuffle\n",
"    shuffle_indices = np.random.permutation(len(y_balanced))\n",
"    X_balanced = X_balanced[shuffle_indices]\n",
"    y_balanced = y_balanced[shuffle_indices]\n",
"    \n",
"    print(f\"   ✅ Balancing complete: {X_balanced.shape[0]:,} samples\")\n",
"    print(f\"   Data change: {X.shape[0]:,} → {X_balanced.shape[0]:,} ({X_balanced.shape[0]/X.shape[0]:.2f}x)\")\n",
"    \n",
"    return X_balanced, y_balanced\n"
]
},
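{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small synthetic demo of `balance_dataset` (an illustrative addition; all counts below are fabricated): labels 0 and 40 are overrepresented and should be undersampled to the mean of labels 1-39, label 2 is rare and should be oversampled to half that mean, and everything else should pass through unchanged."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative demo of balance_dataset on fabricated counts.\n",
"from collections import Counter\n",
"import numpy as np\n",
"\n",
"rng = np.random.default_rng(0)\n",
"counts = {label: 100 for label in range(1, 40)}\n",
"counts[2] = 10      # rare class → should be oversampled\n",
"counts[0] = 3000    # BLANK → should be undersampled\n",
"counts[40] = 2000   # word separator → should be undersampled\n",
"y_demo = np.concatenate([np.full(n, label) for label, n in counts.items()])\n",
"X_demo = rng.normal(size=(len(y_demo), 8))  # 8 fake feature dimensions\n",
"\n",
"X_bal, y_bal = balance_dataset(X_demo, y_demo)\n",
"print(sorted(Counter(y_bal).items())[:5])  # peek at the first few balanced counts"
]
},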
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 🔄 Memory-Friendly Data Loader with Integrated Data Balancing"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 🧪 Data Balancing Effectiveness Test"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 🚀 Improved Smart Data Processing Pipeline"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"🚀 Creating the smart data processing pipeline...\n",
"✅ Pipeline created; ready to run step 1...\n"
]
}
],
"source": [
"# 🚀 Improved smart data processing pipeline [batched training is still not handled]\n",
"# Flow: analyze distribution → decide sampling ratios → fit PCA (undersampling only) → process data (undersample + oversample + PCA)\n",
"\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from collections import Counter\n",
"from sklearn.utils import resample\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.preprocessing import StandardScaler\n",
"import joblib\n",
"import random\n",
"import gc\n",
"\n",
"class SmartDataPipeline:\n",
"    \"\"\"\n",
"    Smart data processing pipeline.\n",
"    Step 1: analyze the data distribution and decide the sampling strategy\n",
"    Step 2: fit the PCA parameters on undersampled data only\n",
"    Step 3: apply full sampling + PCA reduction when processing data\n",
"    \"\"\"\n",
"    \n",
"    def __init__(self, data_dir, random_state=42):\n",
"        self.data_dir = data_dir\n",
"        self.random_state = random_state\n",
"        \n",
"        # Step 1: distribution analysis results\n",
"        self.distribution_analysis = None\n",
"        self.sampling_strategy = None\n",
"        \n",
"        # Step 2: PCA parameters (fitted on undersampled data)\n",
"        self.pca_scaler = None\n",
"        self.pca_model = None\n",
"        self.pca_components = None\n",
"        self.pca_fitted = False\n",
"        \n",
"        # Configuration\n",
"        self.undersample_labels = [0, 40]   # labels to undersample\n",
"        self.oversample_threshold = 0.5     # oversampling threshold (fraction of the mean)\n",
"        self.pca_variance_threshold = 0.95  # fraction of variance PCA keeps\n",
"        self.pca_sample_size = 15000        # number of samples for the PCA fit\n",
"    \n",
"    def step1_analyze_distribution(self, max_samples=100000):\n",
"        \"\"\"\n",
"        Step 1: analyze the data distribution and decide the sampling strategy.\n",
"        \"\"\"\n",
"        print(\"🔍 Step 1: analyzing the data distribution...\")\n",
"        \n",
"        # Analyze the validation-set distribution (a proxy for the overall distribution)\n",
"        all_labels = []\n",
"        for trials_batch, filename in load_data_batch(self.data_dir, 'val', 5000):\n",
"            _, labels = extract_features_labels_batch(trials_batch)\n",
"            all_labels.extend(labels.tolist())\n",
"            if len(all_labels) >= max_samples:\n",
"                break\n",
"        \n",
"        # Summary statistics\n",
"        label_counts = Counter(all_labels)\n",
"        \n",
"        # Mean over labels 1-39 (excluding 0 and 40)\n",
"        counts_1_39 = [label_counts.get(i, 0) for i in range(1, 40)]\n",
"        target_mean = np.mean(counts_1_39)\n",
"        \n",
"        # Build the sampling strategy\n",
"        sampling_strategy = {}\n",
"        for label in range(41):\n",
"            current_count = label_counts.get(label, 0)\n",
"            \n",
"            if label in self.undersample_labels:\n",
"                # Undersample down to the mean\n",
"                target_count = int(target_mean)\n",
"                action = 'undersample' if current_count > target_count else 'keep'\n",
"            elif current_count < target_mean * self.oversample_threshold:\n",
"                # Oversample up to the threshold\n",
"                target_count = int(target_mean * self.oversample_threshold)\n",
"                action = 'oversample' if current_count < target_count else 'keep'\n",
"            else:\n",
"                # Leave unchanged\n",
"                target_count = current_count\n",
"                action = 'keep'\n",
"            \n",
"            sampling_strategy[label] = {\n",
"                'current_count': current_count,\n",
"                'target_count': target_count,\n",
"                'action': action\n",
"            }\n",
"        \n",
"        self.distribution_analysis = {\n",
"            'label_counts': label_counts,\n",
"            'target_mean': target_mean,\n",
"            'total_samples': len(all_labels)\n",
"        }\n",
"        self.sampling_strategy = sampling_strategy\n",
"        \n",
"        print(f\"   ✅ Analysis complete: {len(all_labels):,} samples\")\n",
"        print(f\"   📊 Mean over labels 1-39: {target_mean:.0f}\")\n",
"        print(f\"   📉 Labels to undersample: {self.undersample_labels} → {target_mean:.0f}\")\n",
"        print(f\"   📈 Oversampling threshold: {self.oversample_threshold} × mean = {target_mean * self.oversample_threshold:.0f}\")\n",
"        \n",
"        return self.distribution_analysis, self.sampling_strategy\n",
"\n",
"# Create the smart data processing pipeline\n",
"print(\"🚀 Creating the smart data processing pipeline...\")\n",
"pipeline = SmartDataPipeline(data_dir, random_state=42)\n",
"print(\"✅ Pipeline created; ready to run step 1...\")"
]
},
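{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before running step 1 on real files, here is a tiny dry run of its decision rules on fabricated counts (an illustrative addition): labels 0 and 40 sit far above the mean of labels 1-39 and get marked for undersampling, while a label below half that mean is marked for oversampling."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative walk-through of the step-1 rules on fabricated counts.\n",
"from collections import Counter\n",
"import numpy as np\n",
"\n",
"fake_counts = Counter({i: 450 for i in range(1, 40)})\n",
"fake_counts[2] = 80     # rare phoneme → below half the mean\n",
"fake_counts[0] = 50000  # BLANK → far above the mean\n",
"fake_counts[40] = 9000  # word separator → far above the mean\n",
"\n",
"mean_1_39 = np.mean([fake_counts.get(i, 0) for i in range(1, 40)])\n",
"print(f\"mean over labels 1-39: {mean_1_39:.1f}\")\n",
"for label in [0, 1, 2, 40]:\n",
"    count = fake_counts.get(label, 0)\n",
"    if label in (0, 40):\n",
"        action = 'undersample' if count > mean_1_39 else 'keep'\n",
"    elif count < mean_1_39 * 0.5:\n",
"        action = 'oversample'\n",
"    else:\n",
"        action = 'keep'\n",
"    print(f\"label {label:2d}: count={count:6d} → {action}\")"
]
},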
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"✅ Step-2 methods attached to the pipeline\n"
]
}
],
"source": [
"# Attach more smart-pipeline methods [pipeline completion]\n",
"\n",
"def step2_fit_pca_with_undersampling(self):\n",
"    \"\"\"\n",
"    Step 2: fit the PCA parameters on undersampled data only (no oversampling, so the PCA is not distorted by duplicated samples).\n",
"    \"\"\"\n",
"    if self.sampling_strategy is None:\n",
"        raise ValueError(\"Run step 1 first: step1_analyze_distribution()\")\n",
"    \n",
"    print(\"\\n🔧 Step 2: fitting PCA parameters (undersampling only, no oversampling)...\")\n",
"    \n",
"    # 🔍 First check whether a saved PCA model already exists\n",
"    pca_path = \"smart_pipeline_pca.joblib\"\n",
"    if os.path.exists(pca_path):\n",
"        print(f\"   📁 Found an existing PCA model file: {pca_path}\")\n",
"        try:\n",
"            # Load the saved PCA model\n",
"            pca_data = joblib.load(pca_path)\n",
"            \n",
"            self.pca_scaler = pca_data['scaler']\n",
"            self.pca_model = pca_data['pca']\n",
"            self.pca_components = pca_data['components']\n",
"            self.pca_fitted = True\n",
"            \n",
"            print(f\"   ✅ PCA model loaded!\")\n",
"            print(f\"      Dimensionality: 7168 → {self.pca_components}\")\n",
"            print(f\"      Reduction ratio: {self.pca_components/7168:.2%}\")\n",
"            print(f\"      Retained variance: {self.pca_model.explained_variance_ratio_.sum():.4f}\")\n",
"            print(f\"   💡 Skipping the PCA fit; using the saved model\")\n",
"            return\n",
"        \n",
"        except Exception as e:\n",
"            print(f\"   ⚠️ Failed to load the PCA model: {e}\")\n",
"            print(f\"   🔄 Refitting the PCA model...\")\n",
"    else:\n",
"        print(f\"   📄 No saved PCA model file found: {pca_path}\")\n",
"        print(f\"   🔄 Fitting the PCA model from scratch...\")\n",
"    \n",
"    # Collect samples for the PCA fit (undersampling only, no oversampling)\n",
"    pca_features = []\n",
"    collected_samples = 0\n",
"    \n",
"    for trials_batch, filename in load_data_batch(self.data_dir, 'train', 3000):\n",
"        features, labels = extract_features_labels_batch(trials_batch)\n",
"        \n",
"        # Apply the undersampling-only strategy to the current batch\n",
"        downsampled_features, downsampled_labels = self._apply_undersampling_only(features, labels)\n",
"        \n",
"        if downsampled_features.shape[0] > 0:\n",
"            pca_features.append(downsampled_features)\n",
"            collected_samples += downsampled_features.shape[0]\n",
"        \n",
"        if collected_samples >= self.pca_sample_size:\n",
"            break\n",
"    \n",
"    if pca_features:\n",
"        # Merge the samples\n",
"        X_pca_sample = np.vstack(pca_features)[:self.pca_sample_size]\n",
"        print(f\"   📦 PCA fit samples: {X_pca_sample.shape[0]:,} undersampled samples\")\n",
"        print(f\"   🔢 Original feature dimension: {X_pca_sample.shape[1]}\")\n",
"        \n",
"        # Standardize\n",
"        self.pca_scaler = StandardScaler()\n",
"        X_scaled = self.pca_scaler.fit_transform(X_pca_sample)\n",
"        \n",
"        # Choose the number of PCA components\n",
"        pca_full = PCA(random_state=self.random_state)\n",
"        pca_full.fit(X_scaled)\n",
"        cumsum_var = np.cumsum(pca_full.explained_variance_ratio_)\n",
"        optimal_components = np.argmax(cumsum_var >= self.pca_variance_threshold) + 1\n",
"        self.pca_components = min(optimal_components, X_pca_sample.shape[1])\n",
"        \n",
"        # Fit the final PCA\n",
"        self.pca_model = PCA(n_components=self.pca_components, random_state=self.random_state)\n",
"        self.pca_model.fit(X_scaled)\n",
"        self.pca_fitted = True\n",
"        \n",
"        # Save the PCA model\n",
"        pca_path = \"smart_pipeline_pca.joblib\"\n",
"        joblib.dump({\n",
"            'scaler': self.pca_scaler,\n",
"            'pca': self.pca_model,\n",
"            'components': self.pca_components\n",
"        }, pca_path)\n",
"        \n",
"        print(f\"   ✅ PCA fit complete!\")\n",
"        print(f\"      Dimensionality: {X_pca_sample.shape[1]} → {self.pca_components}\")\n",
"        print(f\"      Reduction ratio: {self.pca_components/X_pca_sample.shape[1]:.2%}\")\n",
"        print(f\"      Retained variance: {self.pca_model.explained_variance_ratio_.sum():.4f}\")\n",
"        print(f\"      Model saved to: {pca_path}\")\n",
"        \n",
"        # Free memory\n",
"        del pca_features, X_pca_sample, X_scaled\n",
"        gc.collect()\n",
"    else:\n",
"        raise ValueError(\"Could not collect samples for the PCA fit\")\n",
"\n",
"def _apply_undersampling_only(self, X, y):\n",
"    \"\"\"\n",
"    Apply the undersampling part of the strategy only (used for the PCA fit).\n",
"    \"\"\"\n",
"    X_result = []\n",
"    y_result = []\n",
"    \n",
"    np.random.seed(self.random_state)\n",
"    \n",
"    for label in range(41):\n",
"        label_mask = (y == label)\n",
"        X_label = X[label_mask]\n",
"        y_label = y[label_mask]\n",
"        current_count = len(y_label)\n",
"        \n",
"        if current_count == 0:\n",
"            continue\n",
"        \n",
"        strategy = self.sampling_strategy[label]\n",
"        \n",
"        if strategy['action'] == 'undersample' and current_count > strategy['target_count']:\n",
"            # Undersample\n",
"            indices = np.random.choice(current_count, strategy['target_count'], replace=False)\n",
"            X_resampled = X_label[indices]\n",
"            y_resampled = y_label[indices]\n",
"        else:\n",
"            # Keep as-is\n",
"            X_resampled = X_label\n",
"            y_resampled = y_label\n",
"        \n",
"        X_result.append(X_resampled)\n",
"        y_result.append(y_resampled)\n",
"    \n",
"    if X_result:\n",
"        return np.vstack(X_result), np.hstack(y_result)\n",
"    else:\n",
"        return np.array([]).reshape(0, X.shape[1]), np.array([])\n",
"\n",
"# Attach the methods to the class dynamically\n",
"SmartDataPipeline.step2_fit_pca_with_undersampling = step2_fit_pca_with_undersampling\n",
"SmartDataPipeline._apply_undersampling_only = _apply_undersampling_only\n",
"\n",
"print(\"✅ Step-2 methods attached to the pipeline\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"✅ All methods attached to the smart pipeline\n",
"\n",
"📋 Smart data pipeline status:\n",
"   🔍 Step 1 - distribution analysis: ❌ not done\n",
"   🔧 Step 2 - PCA fit: ❌ not done\n",
"\n",
"🎯 Usage flow:\n",
"   1. pipeline.step1_analyze_distribution()\n",
"   2. pipeline.step2_fit_pca_with_undersampling()\n",
"   3. pipeline.step3_process_data('train')  # training set\n",
"      pipeline.step3_process_data('val')    # validation set\n"
]
}
],
"source": [
"# Attach the remaining smart-pipeline methods\n",
"\n",
"def _apply_full_sampling(self, X, y):\n",
"    \"\"\"\n",
"    Apply the full sampling strategy (undersampling + oversampling).\n",
"    \"\"\"\n",
"    X_result = []\n",
"    y_result = []\n",
"    \n",
"    np.random.seed(self.random_state)\n",
"    \n",
"    for label in range(41):\n",
"        label_mask = (y == label)\n",
"        X_label = X[label_mask]\n",
"        y_label = y[label_mask]\n",
"        current_count = len(y_label)\n",
"        \n",
"        if current_count == 0:\n",
"            continue\n",
"        \n",
"        strategy = self.sampling_strategy[label]\n",
"        target_count = strategy['target_count']\n",
"        \n",
"        if strategy['action'] == 'undersample' and current_count > target_count:\n",
"            # Undersample\n",
"            indices = np.random.choice(current_count, target_count, replace=False)\n",
"            X_resampled = X_label[indices]\n",
"            y_resampled = y_label[indices]\n",
"        elif strategy['action'] == 'oversample' and current_count < target_count:\n",
"            # Oversample\n",
"            X_resampled, y_resampled = resample(\n",
"                X_label, y_label, \n",
"                n_samples=target_count, \n",
"                random_state=self.random_state\n",
"            )\n",
"        else:\n",
"            # Keep as-is\n",
"            X_resampled = X_label\n",
"            y_resampled = y_label\n",
"        \n",
"        X_result.append(X_resampled)\n",
"        y_result.append(y_resampled)\n",
"    \n",
"    if X_result:\n",
"        return np.vstack(X_result), np.hstack(y_result)\n",
"    else:\n",
"        return np.array([]).reshape(0, X.shape[1]), np.array([])\n",
"\n",
"def _apply_pca_transform(self, X):\n",
"    \"\"\"\n",
"    Apply the PCA transform.\n",
"    \"\"\"\n",
"    if not self.pca_fitted:\n",
"        return X\n",
"    \n",
"    X_scaled = self.pca_scaler.transform(X)\n",
"    X_pca = self.pca_model.transform(X_scaled)\n",
"    return X_pca\n",
"\n",
"def step3_process_data(self, data_type, apply_sampling=None):\n",
"    \"\"\"\n",
"    Step 3: process data (sampling + PCA reduction).\n",
"    \n",
"    Args:\n",
"        data_type: 'train', 'val', or 'test'\n",
"        apply_sampling: whether to apply the sampling strategy; None = apply for train, skip for val/test\n",
"    \"\"\"\n",
"    if not self.pca_fitted:\n",
"        raise ValueError(\"Run step 2 first: step2_fit_pca_with_undersampling()\")\n",
"    \n",
"    if apply_sampling is None:\n",
"        apply_sampling = (data_type == 'train')\n",
"    \n",
"    print(f\"\\n🔄 Step 3: processing {data_type} data...\")\n",
"    print(f\"   Sampling strategy: {'enabled' if apply_sampling else 'disabled'}\")\n",
"    \n",
"    all_features = []\n",
"    all_labels = []\n",
"    \n",
"    for trials_batch, filename in load_data_batch(self.data_dir, data_type, 3000):\n",
"        features, labels = extract_features_labels_batch(trials_batch)\n",
"        \n",
"        # Apply the sampling strategy\n",
"        if apply_sampling:\n",
"            features_sampled, labels_sampled = self._apply_full_sampling(features, labels)\n",
"        else:\n",
"            features_sampled, labels_sampled = features, labels\n",
"        \n",
"        # Apply the PCA reduction\n",
"        if features_sampled.shape[0] > 0:\n",
"            features_pca = self._apply_pca_transform(features_sampled)\n",
"            all_features.append(features_pca)\n",
"            all_labels.append(labels_sampled)\n",
"    \n",
"    if all_features:\n",
"        X = np.vstack(all_features)\n",
"        y = np.hstack(all_labels)\n",
"        \n",
"        # Shuffle\n",
"        shuffle_indices = np.random.permutation(len(y))\n",
"        X = X[shuffle_indices]\n",
"        y = y[shuffle_indices]\n",
"        \n",
"        print(f\"   ✅ Processing complete: {X.shape[0]:,} samples, {X.shape[1]} features\")\n",
"        \n",
"        # Free memory\n",
"        del all_features, all_labels\n",
"        gc.collect()\n",
"        \n",
"        return X, y\n",
"    else:\n",
"        return None, None\n",
"\n",
"def print_summary(self):\n",
"    \"\"\"\n",
"    Print a summary of the pipeline state.\n",
"    \"\"\"\n",
"    print(\"\\n📋 Smart data pipeline status:\")\n",
"    print(f\"   🔍 Step 1 - distribution analysis: {'✅ done' if self.distribution_analysis else '❌ not done'}\")\n",
"    print(f\"   🔧 Step 2 - PCA fit: {'✅ done' if self.pca_fitted else '❌ not done'}\")\n",
"    \n",
"    if self.distribution_analysis:\n",
"        target_mean = self.distribution_analysis['target_mean']\n",
"        print(f\"   📊 Mean over labels 1-39: {target_mean:.0f}\")\n",
"    \n",
"    if self.pca_fitted:\n",
"        print(f\"   🔬 PCA reduction: 7168 → {self.pca_components} ({self.pca_components/7168:.1%})\")\n",
"        print(f\"   📈 Retained variance: {self.pca_model.explained_variance_ratio_.sum():.4f}\")\n",
"    \n",
"    print(f\"\\n🎯 Usage flow:\")\n",
"    print(f\"   1. pipeline.step1_analyze_distribution()\")\n",
"    print(f\"   2. pipeline.step2_fit_pca_with_undersampling()\")\n",
"    print(f\"   3. pipeline.step3_process_data('train')  # training set\")\n",
"    print(f\"      pipeline.step3_process_data('val')    # validation set\")\n",
"\n",
"# Attach the remaining methods to the class dynamically\n",
"SmartDataPipeline._apply_full_sampling = _apply_full_sampling\n",
"SmartDataPipeline._apply_pca_transform = _apply_pca_transform\n",
"SmartDataPipeline.step3_process_data = step3_process_data\n",
"SmartDataPipeline.print_summary = print_summary\n",
"\n",
"print(\"✅ All methods attached to the smart pipeline\")\n",
"pipeline.print_summary()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 🔥 Run the Smart Data Processing Pipeline"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"🚀 Running the smart data processing pipeline...\n",
"============================================================\n",
"\n",
"============🔍 STEP 1: Analyze Data Distribution=============\n",
"🔍 Step 1: analyzing the data distribution...\n",
"   Loading file 1/41: t15.2023.08.13_val_concatenated.npz\n",
"   Loading file 2/41: t15.2023.08.18_val_concatenated.npz\n",
"   Loading file 3/41: t15.2023.08.20_val_concatenated.npz\n",
"   Loading file 4/41: t15.2023.08.25_val_concatenated.npz\n",
"   Loading file 5/41: t15.2023.08.27_val_concatenated.npz\n",
"   Loading file 6/41: t15.2023.09.01_val_concatenated.npz\n",
"   Loading file 7/41: t15.2023.09.03_val_concatenated.npz\n",
"   Loading file 8/41: t15.2023.09.24_val_concatenated.npz\n",
"   Loading file 9/41: t15.2023.09.29_val_concatenated.npz\n",
"   Loading file 10/41: t15.2023.10.01_val_concatenated.npz\n",
"   Loading file 11/41: t15.2023.10.06_val_concatenated.npz\n",
"   Loading file 12/41: t15.2023.10.08_val_concatenated.npz\n",
"   Loading file 13/41: t15.2023.10.13_val_concatenated.npz\n",
"   Loading file 14/41: t15.2023.10.15_val_concatenated.npz\n",
"   ✅ Analysis complete: 108,742 samples\n",
"   📊 Mean over labels 1-39: 455\n",
"   📉 Labels to undersample: [0, 40] → 455\n",
"   📈 Oversampling threshold: 0.5 × mean = 227\n",
"\n",
"📊 Sampling strategy summary:\n",
"   📉 Labels undersampled: 2\n",
"   📈 Labels oversampled: 11\n",
"   ✅ Left unchanged: 28\n",
"\n",
"✅ Step 1 complete!\n"
]
}
],
"source": [
"# 🔥 Run the smart data processing pipeline [decide the sampling strategy]\n",
"\n",
"print(\"🚀 Running the smart data processing pipeline...\")\n",
"print(\"=\" * 60)\n",
"\n",
"# Step 1: analyze the data distribution\n",
"print(\"\\n\" + \"🔍 STEP 1: Analyze Data Distribution\".center(60, \"=\"))\n",
"distribution, strategy = pipeline.step1_analyze_distribution()\n",
"\n",
"# Summarize the sampling strategy\n",
"print(f\"\\n📊 Sampling strategy summary:\")\n",
"undersample_count = sum(1 for s in strategy.values() if s['action'] == 'undersample')\n",
"oversample_count = sum(1 for s in strategy.values() if s['action'] == 'oversample')\n",
"keep_count = sum(1 for s in strategy.values() if s['action'] == 'keep')\n",
"\n",
"print(f\"   📉 Labels undersampled: {undersample_count}\")\n",
"print(f\"   📈 Labels oversampled: {oversample_count}\") \n",
"print(f\"   ✅ Left unchanged: {keep_count}\")\n",
"\n",
"print(\"\\n✅ Step 1 complete!\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"================🔧 STEP 2: Fit PCA Parameters================\n",
"\n",
"🔧 Step 2: fitting PCA parameters (undersampling only, no oversampling)...\n",
"   📁 Found an existing PCA model file: smart_pipeline_pca.joblib\n",
"   ✅ PCA model loaded!\n",
"      Dimensionality: 7168 → 1219\n",
"      Reduction ratio: 17.01%\n",
"      Retained variance: 0.9491\n",
"   💡 Skipping the PCA fit; using the saved model\n",
"\n",
"✅ Step 2 complete!\n",
"\n",
"📋 Smart data pipeline status:\n",
"   🔍 Step 1 - distribution analysis: ✅ done\n",
"   🔧 Step 2 - PCA fit: ✅ done\n",
"   📊 Mean over labels 1-39: 455\n",
"   🔬 PCA reduction: 7168 → 1219 (17.0%)\n",
"   📈 Retained variance: 0.9491\n",
"\n",
"🎯 Usage flow:\n",
"   1. pipeline.step1_analyze_distribution()\n",
"   2. pipeline.step2_fit_pca_with_undersampling()\n",
"   3. pipeline.step3_process_data('train')  # training set\n",
"      pipeline.step3_process_data('val')    # validation set\n"
]
}
],
"source": [
"# Step 2: fit the PCA parameters [decide the PCA strategy]\n",
"print(\"\\n\" + \"🔧 STEP 2: Fit PCA Parameters\".center(60, \"=\"))\n",
"pipeline.step2_fit_pca_with_undersampling()\n",
"\n",
"print(\"\\n✅ Step 2 complete!\")\n",
"pipeline.print_summary()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 🚀 Batched Training with the Smart Pipeline"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"🚀 Creating the smart batch trainer...\n",
"🎯 Smart batch trainer created\n",
"   🔧 LightGBM parameters configured: CPU mode\n",
"   💡 LR schedule: cosine annealing with warm restarts (from 0.08 down to 0.001)\n",
"   🔄 Restart parameters: T_0=50, T_mult=2\n",
"✅ Trainer created; ready to train!\n"
]
}
],
"source": [
"# 🚀 Batched training with the smart pipeline\n",
"\n",
"import lightgbm as lgb\n",
"import time\n",
"from collections import Counter\n",
"import matplotlib.pyplot as plt\n",
"\n",
"class SmartBatchTrainer:\n",
"    \"\"\"\n",
"    Smart batch trainer built on top of the smart data pipeline.\n",
"    \"\"\"\n",
"    \n",
"    def __init__(self, pipeline, params=None, min_learning_rate=1e-4, t_0=50, t_mult=2):\n",
"        self.pipeline = pipeline\n",
"        self.model = None\n",
"        self.training_history = []  # per-batch records for train_incremental(); train() replaces this with a single dict\n",
"        self.batch_count = 0\n",
"        self.min_learning_rate = min_learning_rate\n",
"        self.lr_history = []  # for visualization\n",
"        \n",
"        # Cosine annealing with warm restarts\n",
"        self.t_0 = t_0        # length of the first restart cycle\n",
"        self.t_mult = t_mult  # multiplier for each subsequent cycle\n",
"        \n",
"        # Default LightGBM parameters (CPU mode here; the GPU settings are commented out)\n",
"        self.params = params or {\n",
"            'objective': 'multiclass',\n",
"            'num_class': 41,\n",
"            'metric': 'multi_logloss',\n",
"            'boosting_type': 'gbdt',\n",
"            'device_type': 'cpu',\n",
"            # 'gpu_platform_id': 0,\n",
"            # 'gpu_device_id': 0,\n",
"            'max_bin': 255,\n",
"            'num_leaves': 127,\n",
"            'learning_rate': 0.08,  # default 0.08\n",
"            'feature_fraction': 0.8,\n",
"            'bagging_fraction': 0.8,\n",
"            'bagging_freq': 5,\n",
"            'min_data_in_leaf': 20,\n",
"            'lambda_l1': 0.1,\n",
"            'lambda_l2': 0.1,\n",
"            'verbose': -1,\n",
"            'num_threads': -1\n",
"        }\n",
"        \n",
"        self.initial_learning_rate = self.params.get('learning_rate', 0.08)\n",
"        \n",
"        print(f\"🎯 Smart batch trainer created\")\n",
"        print(f\"   🔧 LightGBM parameters configured: {self.params['device_type'].upper()} mode\")\n",
"        print(f\"   💡 LR schedule: cosine annealing with warm restarts (from {self.initial_learning_rate} down to {self.min_learning_rate})\")\n",
"        print(f\"   🔄 Restart parameters: T_0={self.t_0}, T_mult={self.t_mult}\")\n",
"    \n",
"    def prepare_validation_data(self):\n",
"        \"\"\"\n",
"        Prepare the validation data (PCA only, original distribution preserved).\n",
"        \"\"\"\n",
"        print(\"🔄 Preparing validation data...\")\n",
"        self.X_val, self.y_val = self.pipeline.step3_process_data('val', apply_sampling=False)\n",
"        if self.X_val is None:\n",
"            raise ValueError(\"Could not load the validation data\")\n",
"        val_counts = Counter(self.y_val)\n",
"        print(f\"   ✅ Validation data ready: {self.X_val.shape[0]:,} samples\")\n",
"        print(f\"   📊 Validation distribution (label 0: {val_counts.get(0, 0):,}, label 40: {val_counts.get(40, 0):,})\")\n",
"\n",
"        return lgb.Dataset(self.X_val, label=self.y_val, free_raw_data=False)\n",
"\n",
"    def get_training_batch_generator(self):\n",
"        \"\"\"\n",
"        Yield training batches (balanced sampling + PCA).\n",
"        \"\"\"\n",
"        print(\"🔄 Preparing the training batch generator...\")\n",
"        \n",
"        # Use the pipeline's batch generator\n",
"        for trials_batch, filename in load_data_batch(self.pipeline.data_dir, 'train', 2000):\n",
"            features, labels = extract_features_labels_batch(trials_batch)\n",
"            \n",
"            # Apply the full sampling strategy\n",
"            features_sampled, labels_sampled = self.pipeline._apply_full_sampling(features, labels)\n",
"            \n",
"            # Apply the PCA reduction\n",
"            if features_sampled.shape[0] > 0:\n",
"                features_pca = self.pipeline._apply_pca_transform(features_sampled)\n",
"                \n",
"                # Inspect the distribution of the current batch\n",
"                batch_counts = Counter(labels_sampled)\n",
"                \n",
"                print(f\"   📦 Batch: {filename}\")\n",
"                print(f\"      Samples: {features_pca.shape[0]:,}\")\n",
"                print(f\"      Balanced distribution: label 0={batch_counts.get(0,0)}, label 40={batch_counts.get(40,0)}\")\n",
"                \n",
"                yield lgb.Dataset(features_pca, label=labels_sampled), filename\n",
"    \n",
"    def prepare_full_data(self):\n",
"        \"\"\"\n",
"        Prepare all training and validation data in one go.\n",
"        \"\"\"\n",
"        print(\"🔄 Preparing full training and validation data...\")\n",
"        \n",
"        # 1. Validation data (original distribution preserved)\n",
"        X_val, y_val = self.pipeline.step3_process_data('val', apply_sampling=False)\n",
"        if X_val is None:\n",
"            raise ValueError(\"Could not load the validation data\")\n",
"        val_counts = Counter(y_val)\n",
"        print(f\"   ✅ Validation data ready: {X_val.shape[0]:,} samples\")\n",
"        print(f\"   📊 Validation distribution (label 0: {val_counts.get(0, 0):,}, label 40: {val_counts.get(40, 0):,})\")\n",
"        val_data = lgb.Dataset(X_val, label=y_val, free_raw_data=False)\n",
"        \n",
"        # 2. Training data (full sampling + PCA strategy applied)\n",
"        X_train, y_train = self.pipeline.step3_process_data('train', apply_sampling=True)\n",
"        if X_train is None:\n",
"            raise ValueError(\"Could not load the training data\")\n",
"        train_counts = Counter(y_train)\n",
"        print(f\"   ✅ Training data ready: {X_train.shape[0]:,} samples, {X_train.shape[1]} features\")\n",
"        print(f\"   📊 Training (post-sampling) distribution (label 0: {train_counts.get(0, 0):,}, label 40: {train_counts.get(40, 0):,})\")\n",
"        train_data = lgb.Dataset(X_train, label=y_train)\n",
"        \n",
"        return train_data, val_data, X_val, y_val\n",
"    \n",
"    def prepare_training_data(self):\n",
"        \"\"\"\n",
"        Prepare the training data (full sampling strategy + PCA).\n",
"        \"\"\"\n",
"        print(\"🔄 Preparing training data...\")\n",
"        # Training data (full sampling + PCA strategy applied)\n",
"        X_train, y_train = self.pipeline.step3_process_data('train', apply_sampling=True)\n",
"        if X_train is None:\n",
"            raise ValueError(\"Could not load the training data\")\n",
"        train_counts = Counter(y_train)\n",
"        print(f\"   ✅ Training data ready: {X_train.shape[0]:,} samples, {X_train.shape[1]} features\")\n",
"        print(f\"   📊 Training (post-sampling) distribution (label 0: {train_counts.get(0, 0):,}, label 40: {train_counts.get(40, 0):,})\")\n",
"        \n",
"        return lgb.Dataset(X_train, label=y_train, free_raw_data=False)\n",
"    \n",
"    # Cosine-annealing-with-warm-restarts scheduler\n",
"    def _cosine_annealing_with_warm_restarts(self, current_round):\n",
"        \"\"\"\n",
"        Cosine annealing with warm restarts (SGDR).\n",
"        \n",
"        Args:\n",
"            current_round: current boosting round\n",
"        \n",
"        Returns:\n",
"            the learning rate for this round\n",
"        \"\"\"\n",
"        eta_max = self.initial_learning_rate\n",
"        eta_min = self.min_learning_rate\n",
"        \n",
"        # Locate the position within the restart cycles\n",
"        t_cur = current_round\n",
"        t_i = self.t_0\n",
"        \n",
"        # Find the current restart cycle\n",
"        cycle = 0\n",
"        while t_cur >= t_i:\n",
"            t_cur -= t_i\n",
"            cycle += 1\n",
"            t_i *= self.t_mult\n",
"        \n",
"        # Position within the current cycle\n",
"        progress = t_cur / t_i\n",
"        \n",
"        # Compute the learning rate\n",
"        lr = eta_min + 0.5 * (eta_max - eta_min) * (1 + np.cos(np.pi * progress))\n",
"        \n",
"        return lr\n",
"    \n",
"    def train_incremental(self, num_boost_round=100, early_stopping_rounds=10):\n",
"        \"\"\"\n",
"        Incremental batched training.\n",
"        \"\"\"\n",
"        print(f\"\\n🚀 Starting smart batch training...\")\n",
"        print(f\"   📝 Boosting rounds (per batch): {num_boost_round}\")\n",
"        print(f\"   ⏹️ Early-stopping rounds: {early_stopping_rounds}\")\n",
"        print(\"=\" * 60)\n",
"        \n",
"        # Prepare the validation data\n",
"        val_data = self.prepare_validation_data()\n",
"        \n",
"        print(f\"\\n🔄 Starting incremental batch training...\")\n",
"        total_start_time = time.time()\n",
"        \n",
"        # ⭐️ New: T_max for the LR scheduler; each batch is treated as one full annealing cycle\n",
"        # (kept for reference; the warm-restart scheduler below actually uses t_0/t_mult)\n",
"        t_max_per_batch = num_boost_round\n",
"        \n",
"        for train_data, filename in self.get_training_batch_generator():\n",
"            self.batch_count += 1\n",
"            batch_start_time = time.time()\n",
"            self.last_batch_lr_history = []  # reset the per-batch LR history\n",
"            \n",
"            print(f\"\\n📈 Batch {self.batch_count}: {filename}\")\n",
"            \n",
"            # ⭐️ New: LR-scheduling callback plus a recording callback\n",
"            lr_scheduler_callback = lgb.reset_parameter(\n",
"                learning_rate=lambda current_round: self._cosine_annealing_with_warm_restarts(current_round)\n",
"            )\n",
"\n",
"            # This small callback records the LR each round for later visualization\n",
"            def record_lr_callback(env):\n",
"                self.last_batch_lr_history.append(env.model.params['learning_rate'])\n",
"\n",
"            # Combine all callbacks\n",
"            training_callbacks = [\n",
"                lgb.early_stopping(stopping_rounds=early_stopping_rounds, verbose=True),\n",
"                lgb.log_evaluation(period=10),  # log every 10 rounds\n",
"                lr_scheduler_callback,\n",
"                record_lr_callback\n",
"            ]\n",
"\n",
"            # Train on the current batch\n",
"            current_model_args = {\n",
"                'params': self.params,\n",
"                'train_set': train_data,\n",
"                'num_boost_round': num_boost_round,\n",
"                'valid_sets': [val_data],\n",
"                'valid_names': ['validation'],\n",
"                'callbacks': training_callbacks\n",
"            }\n",
"            \n",
"            if self.model is None:\n",
"                print(\"   🎯 Initial model training...\")\n",
"                self.model = lgb.train(**current_model_args)\n",
"            else:\n",
"                print(\"   ⚡ Incremental training...\")\n",
"                current_model_args['init_model'] = self.model\n",
"                self.model = lgb.train(**current_model_args)\n",
"\n",
"            # Record the training history\n",
"            batch_time = time.time() - batch_start_time\n",
"            \n",
"            # Evaluate the current model\n",
"            val_pred = self.model.predict(self.X_val)\n",
"            val_accuracy = (val_pred.argmax(axis=1) == self.y_val).mean()\n",
"            \n",
"            batch_info = {\n",
"                'batch': self.batch_count,\n",
"                'filename': filename,\n",
"                'time': batch_time,\n",
"                'val_accuracy': val_accuracy,\n",
"                'num_trees': self.model.num_trees(),\n",
"                'lr_history': self.last_batch_lr_history.copy()  # LR history of this batch\n",
"            }\n",
"            \n",
"            self.training_history.append(batch_info)\n",
"            \n",
"            print(f\"   ✅ Batch done: {batch_time:.1f}s\")\n",
"            print(f\"   📊 Validation accuracy: {val_accuracy:.4f}\")\n",
"            print(f\"   🌳 Trees in model: {self.model.num_trees()}\")\n",
"            \n",
"            model_path = f\"smart_batch_model_batch_{self.batch_count}.txt\"\n",
"            self.model.save_model(model_path)\n",
"            print(f\"   💾 Model saved: {model_path}\")\n",
"        \n",
"        total_time = time.time() - total_start_time\n",
"        print(f\"\\n🎉 Smart batch training finished!\")\n",
"        print(f\"   ⏱️ Total training time: {total_time:.1f}s\")\n",
"        print(f\"   📊 Batches processed: {self.batch_count}\")\n",
"        print(f\"   🌳 Trees in final model: {self.model.num_trees()}\")\n",
"        \n",
"        return self.model\n",
"    \n",
"    def train(self, num_boost_round=1000, early_stopping_rounds=50):\n",
"        \"\"\"\n",
"        One-shot training on the full dataset.\n",
"        \"\"\"\n",
"        print(f\"\\n🚀 Starting full-data training...\")\n",
"        print(f\"   📝 Boosting rounds: {num_boost_round}\")\n",
"        print(f\"   ⏹️ Early-stopping rounds: {early_stopping_rounds}\")\n",
"        print(\"=\" * 60)\n",
"        \n",
"        # Prepare the data\n",
"        train_data, val_data, X_val, y_val = self.prepare_full_data()\n",
"        \n",
"        start_time = time.time()\n",
"        \n",
"        # LR-scheduling and recording callbacks\n",
"        lr_scheduler_callback = lgb.reset_parameter(\n",
"            learning_rate=lambda current_round: self._cosine_annealing_with_warm_restarts(current_round)\n",
"        )\n",
"        def record_lr_callback(env):\n",
"            self.lr_history.append(env.model.params['learning_rate'])\n",
"        \n",
"        training_callbacks = [\n",
"            lgb.early_stopping(stopping_rounds=early_stopping_rounds, verbose=True),\n",
"            lgb.log_evaluation(period=1),  # log every round\n",
"            lr_scheduler_callback,\n",
"            record_lr_callback\n",
"        ]\n",
"        \n",
"        # Train the model\n",
"        print(\"\\n📈 Starting model training...\")\n",
"        self.model = lgb.train(\n",
"            params=self.params,\n",
"            train_set=train_data,\n",
"            num_boost_round=num_boost_round,\n",
"            valid_sets=[val_data],\n",
"            valid_names=['validation'],\n",
"            callbacks=training_callbacks\n",
"        )\n",
"        \n",
"        training_time = time.time() - start_time\n",
"        \n",
"        # Evaluate the model\n",
"        val_pred = self.model.predict(X_val)\n",
"        val_accuracy = (val_pred.argmax(axis=1) == y_val).mean()\n",
"        \n",
"        # Record the training history (a single dict for the one-shot run)\n",
"        self.training_history = {\n",
"            'time': training_time,\n",
"            'val_accuracy': val_accuracy,\n",
"            'num_trees': self.model.num_trees(),\n",
"            'lr_history': self.lr_history,\n",
"            'best_iteration': self.model.best_iteration\n",
"        }\n",
"        \n",
"        print(f\"\\n🎉 Full-data training finished!\")\n",
"        print(f\"   ⏱️ Total training time: {training_time:.1f}s\")\n",
"        print(f\"   🌳 Trees in final model: {self.model.num_trees()} (best iteration: {self.model.best_iteration})\")\n",
"        print(f\"   🎯 Final validation accuracy: {val_accuracy:.4f}\")\n",
"        \n",
"        # Save the model\n",
"        model_path = \"full_train_model.txt\"\n",
"        self.model.save_model(model_path)\n",
"        print(f\"   💾 Model saved: {model_path}\")\n",
"        \n",
"        return self.model\n",
"    \n",
"    def plot_training_progress(self):\n",
"        \"\"\"\n",
"        Plot the training progress (expects the per-batch history from train_incremental()).\n",
"        \"\"\"\n",
"        if not self.training_history:\n",
"            print(\"❌ No training history recorded\")\n",
"            return\n",
"        \n",
"        # ⭐️ Changed: add a learning-rate panel\n",
"        fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2, figsize=(15, 15))\n",
"        \n",
"        batches = [h['batch'] for h in self.training_history]\n",
"        accuracies = [h['val_accuracy'] for h in self.training_history]\n",
"        times = [h['time'] for h in self.training_history]\n",
"        trees = [h['num_trees'] for h in self.training_history]\n",
"        \n",
"        # 1. Validation accuracy\n",
"        ax1.plot(batches, accuracies, 'b-o', linewidth=2, markersize=6)\n",
"        ax1.set_xlabel('Training Batch')\n",
"        ax1.set_ylabel('Validation Accuracy')\n",
"        ax1.set_title('Validation Accuracy Progress')\n",
"        ax1.grid(True, alpha=0.3)\n",
"        ax1.set_ylim(0, 1)\n",
"        \n",
"        # 2. Per-batch training time\n",
"        ax2.bar(batches, times, color='green', alpha=0.7)\n",
"        ax2.set_xlabel('Training Batch')\n",
"        ax2.set_ylabel('Training Time (seconds)')\n",
"        ax2.set_title('Training Time per Batch')\n",
"        ax2.grid(True, alpha=0.3)\n",
"        \n",
"        # 3. Model tree-count growth\n",
"        ax3.plot(batches, trees, 'r-s', linewidth=2, markersize=6)\n",
"        ax3.set_xlabel('Training Batch')\n",
"        ax3.set_ylabel('Number of Trees')\n",
"        ax3.set_title('Model Complexity Growth')\n",
"        ax3.grid(True, alpha=0.3)\n",
"        \n",
"        # 4. Cumulative accuracy improvement\n",
"        ax4.plot(batches, [acc - accuracies[0] for acc in accuracies], 'purple', linewidth=2, marker='D')\n",
"        ax4.set_xlabel('Training Batch')\n",
"        ax4.set_ylabel('Accuracy Improvement')\n",
"        ax4.set_title('Cumulative Accuracy Improvement')\n",
"        ax4.grid(True, alpha=0.3)\n",
"        ax4.axhline(y=0, color='black', linestyle='--', alpha=0.5)\n",
"\n",
"        # ⭐️ New: 5. learning-rate curve of the last batch\n",
"        last_lr_history = self.training_history[-1]['lr_history']\n",
"        ax5.plot(range(len(last_lr_history)), last_lr_history, color='orange', marker='.')\n",
"        ax5.set_xlabel('Boosting Round in Last Batch')\n",
"        ax5.set_ylabel('Learning Rate')\n",
"        ax5.set_title(f'Cosine Annealing LR in Last Batch (Batch {batches[-1]})')\n",
"        ax5.grid(True, alpha=0.3)\n",
"        \n",
"        # Hide the sixth panel\n",
"        ax6.axis('off')\n",
"\n",
"        plt.tight_layout()\n",
"        plt.show()\n",
"        \n",
"        # Print summary statistics\n",
"        print(f\"\\n📈 Training progress statistics:\")\n",
"        print(f\"   🎯 Initial accuracy: {accuracies[0]:.4f}\")\n",
"        print(f\"   🎯 Final accuracy: {accuracies[-1]:.4f}\")\n",
"        print(f\"   📈 Accuracy improvement: {accuracies[-1] - accuracies[0]:.4f}\")\n",
"        print(f\"   ⏱️ Mean batch time: {np.mean(times):.1f}s\")\n",
"        print(f\"   🌳 Trees in final model: {trees[-1]}\")\n",
"\n",
"\n",
"print(\"🚀 Creating the smart batch trainer...\")\n",
"# The minimum learning rate can be passed at construction time\n",
"trainer = SmartBatchTrainer(pipeline, min_learning_rate=0.001) \n",
"print(\"✅ Trainer created; ready to train!\")"
]
},
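{
"cell_type": "markdown",
"metadata": {},
"source": [
"The warm-restart schedule is easier to trust once you see its shape. The sketch below (an illustrative addition) re-implements the same SGDR formula as `_cosine_annealing_with_warm_restarts` with the trainer's constants (eta_max=0.08, eta_min=0.001, T_0=50, T_mult=2); each restart cycle is twice as long as the previous one, so restarts land at rounds 50 and 150."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch: shape of cosine annealing with warm restarts (SGDR),\n",
"# using the same constants as the trainer (eta_max=0.08, eta_min=0.001, T_0=50, T_mult=2).\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n",
"def sgdr_lr(round_idx, eta_max=0.08, eta_min=0.001, t_0=50, t_mult=2):\n",
"    t_cur, t_i = round_idx, t_0\n",
"    while t_cur >= t_i:  # walk forward to the current restart cycle\n",
"        t_cur -= t_i\n",
"        t_i *= t_mult\n",
"    return eta_min + 0.5 * (eta_max - eta_min) * (1 + np.cos(np.pi * t_cur / t_i))\n",
"\n",
"rounds = np.arange(350)\n",
"plt.plot(rounds, [sgdr_lr(r) for r in rounds])\n",
"plt.xlabel('Boosting round'); plt.ylabel('Learning rate')\n",
"plt.title('SGDR schedule: restarts at rounds 50 and 150 (T_0=50, T_mult=2)')\n",
"plt.show()"
]
},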
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"📁 Found model file: smart_batch_model_batch_1.txt\n",
"✅ Model loaded!\n",
"   🌳 Number of trees: 27593\n",
"   🔧 Model type: LightGBM Booster\n",
"   📂 Source file: smart_batch_model_batch_1.txt\n",
"   💡 Hint: prepare the validation data before running the validation evaluation\n",
"\n",
"🎯 Trainer status update:\n",
"   ✅ Model: loaded (27593 trees)\n",
"   📊 Pipeline: configured\n",
"   🔬 PCA: fitted\n",
"\n",
"💡 You can now run:\n",
"   - model evaluation: evaluate_model_performance(trainer.model, pipeline, 'val')\n",
"   - predictions on new data\n",
"   - further training (if needed)\n"
]
}
],
"source": [
"# 🔄 Load a previously trained model from file\n",
"import lightgbm as lgb\n",
"import os\n",
"\n",
"model_path = \"smart_batch_model_batch_1.txt\"\n",
"\n",
"if os.path.exists(model_path):\n",
"    print(f\"📁 Found model file: {model_path}\")\n",
"    try:\n",
"        # Load the LightGBM model\n",
"        trainer.model = lgb.Booster(model_file=model_path)\n",
"        \n",
"        print(f\"✅ Model loaded!\")\n",
"        print(f\"   🌳 Number of trees: {trainer.model.num_trees()}\")\n",
"        print(f\"   🔧 Model type: LightGBM Booster\")\n",
"        print(f\"   📂 Source file: {model_path}\")\n",
"        \n",
"        # Check that the model is usable\n",
"        if hasattr(trainer, 'X_val') and trainer.X_val is not None:\n",
"            # If validation data is ready, run a quick smoke test\n",
"            test_pred = trainer.model.predict(trainer.X_val[:100])  # first 100 samples\n",
"            print(f\"   🧪 Model test: prediction shape {test_pred.shape} (100 samples × 41 classes)\")\n",
"        else:\n",
"            print(f\"   💡 Hint: prepare the validation data before running the validation evaluation\")\n",
"        \n",
"    except Exception as e:\n",
"        print(f\"❌ Model loading failed: {e}\")\n",
"        print(f\"   Check that the file is complete and correctly formatted\")\n",
"        trainer.model = None\n",
"        \n",
"else:\n",
"    print(f\"❌ Model file not found: {model_path}\")\n",
"    print(f\"   Check that the path is correct\")\n",
"    trainer.model = None\n",
"\n",
"# Report the current trainer state\n",
"if trainer.model is not None:\n",
"    print(f\"\\n🎯 Trainer status update:\")\n",
"    print(f\"   ✅ Model: loaded ({trainer.model.num_trees()} trees)\")\n",
"    print(f\"   📊 Pipeline: {'configured' if pipeline.pca_fitted else 'needs configuration'}\")\n",
"    print(f\"   🔬 PCA: {'fitted' if pipeline.pca_fitted else 'needs fitting'}\")\n",
"    \n",
"    if pipeline.pca_fitted:\n",
"        print(f\"\\n💡 You can now run:\")\n",
"        print(f\"   - model evaluation: evaluate_model_performance(trainer.model, pipeline, 'val')\")\n",
"        print(f\"   - predictions on new data\")\n",
"        print(f\"   - further training (if needed)\")\n",
"    else:\n",
"        print(f\"\\n⚠️ Note: PCA must be configured before the model can be used for prediction\")\n",
"else:\n",
"    print(f\"\\n❌ Model loading failed; trainer.model = None\")"
]
},
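{
"cell_type": "markdown",
"metadata": {},
"source": [
"`evaluate_model_performance` is referenced above but not defined in this section; as a stopgap, here is a minimal hand-rolled check (an illustrative sketch, assuming the pipeline's PCA is fitted and `trainer.model` holds the loaded Booster)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal hand-rolled accuracy check (illustrative; evaluate_model_performance\n",
"# is referenced above but not defined in this section).\n",
"# Assumes pipeline.pca_fitted is True and trainer.model holds a Booster.\n",
"import numpy as np\n",
"\n",
"X_val, y_val = pipeline.step3_process_data('val', apply_sampling=False)\n",
"probs = trainer.model.predict(X_val)  # (n_samples, 41) class probabilities\n",
"pred = probs.argmax(axis=1)\n",
"mask = (y_val != 0) & (y_val != 40)   # ignore BLANK and the word separator\n",
"print(f\"validation accuracy: {(pred == y_val).mean():.4f}\")\n",
"print(f\"accuracy on labels 1-39: {(pred[mask] == y_val[mask]).mean():.4f}\")"
]
},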
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# # Full-data training\n",
"\n",
"# print(\"🔥 Starting smart batch training!\")\n",
"# print(\"=\" * 80)\n",
"\n",
"# # Training parameters\n",
"# TRAINING_PARAMS = {\n",
"#     'num_boost_round': 500,        # boosting rounds per batch\n",
"#     'early_stopping_rounds': 15    # early-stopping rounds\n",
"# }\n",
"\n",
"# print(f\"📝 Training configuration:\")\n",
"# print(f\"   Boosting rounds: {TRAINING_PARAMS['num_boost_round']}\")\n",
"# print(f\"   Early-stopping rounds: {TRAINING_PARAMS['early_stopping_rounds']}\")\n",
"# print(f\"   Data balancing: enabled (undersample labels 0/40 + oversample minority classes)\")\n",
"# print(f\"   PCA reduction: 7168 → {pipeline.pca_components} features\")\n",
"\n",
"# print(f\"\\n🚀 Launching training...\")\n",
"\n",
"# # Start training\n",
"# model = trainer.train(\n",
"#     num_boost_round=TRAINING_PARAMS['num_boost_round'],\n",
"#     early_stopping_rounds=TRAINING_PARAMS['early_stopping_rounds']\n",
"# )"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"🔥 Starting smart batch training!\n",
"================================================================================\n",
"📝 Training configuration:\n",
"   Boosting rounds: 300\n",
"   Early-stopping rounds: 15\n",
"   Data balancing: enabled (undersample labels 0/40 + oversample minority classes)\n",
"   PCA reduction: 7168 → 1219 features\n",
"\n",
"🚀 Launching training...\n",
"\n",
"🚀 Starting smart batch training...\n",
"   📝 Boosting rounds (per batch): 300\n",
"   ⏹️ Early-stopping rounds: 15\n",
"============================================================\n",
"🔄 Preparing validation data...\n",
"\n",
"🔄 Step 3: processing val data...\n",
"   Sampling strategy: disabled\n",
"   Loading file 1/41: t15.2023.08.13_val_concatenated.npz\n",
"   Loading file 2/41: t15.2023.08.18_val_concatenated.npz\n",
"   Loading file 3/41: t15.2023.08.20_val_concatenated.npz\n",
"   Loading file 4/41: t15.2023.08.25_val_concatenated.npz\n",
"   Loading file 5/41: t15.2023.08.27_val_concatenated.npz\n",
"   Loading file 6/41: t15.2023.09.01_val_concatenated.npz\n",
"   Loading file 7/41: t15.2023.09.03_val_concatenated.npz\n",
"   Loading file 8/41: t15.2023.09.24_val_concatenated.npz\n",
"   Loading file 9/41: t15.2023.09.29_val_concatenated.npz\n",
"   Loading file 10/41: t15.2023.10.01_val_concatenated.npz\n",
"   Loading file 11/41: t15.2023.10.06_val_concatenated.npz\n",
"   Loading file 12/41: t15.2023.10.08_val_concatenated.npz\n",
"   Loading file 13/41: t15.2023.10.13_val_concatenated.npz\n",
"   Loading file 14/41: t15.2023.10.15_val_concatenated.npz\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[13], line 19\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m🚀 启动训练...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 18\u001b[0m \u001b[38;5;66;03m# 开始训练\u001b[39;00m\n\u001b[1;32m---> 19\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain_incremental\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 20\u001b[0m \u001b[43m \u001b[49m\u001b[43mnum_boost_round\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mTRAINING_PARAMS\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mnum_boost_round\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 21\u001b[0m \u001b[43m \u001b[49m\u001b[43mearly_stopping_rounds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mTRAINING_PARAMS\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mearly_stopping_rounds\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[0;32m 22\u001b[0m \u001b[43m)\u001b[49m\n",
"Cell \u001b[1;32mIn[12], line 178\u001b[0m, in \u001b[0;36mSmartBatchTrainer.train_incremental\u001b[1;34m(self, num_boost_round, early_stopping_rounds)\u001b[0m\n\u001b[0;32m 175\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m=\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m*\u001b[39m \u001b[38;5;241m60\u001b[39m)\n\u001b[0;32m 177\u001b[0m \u001b[38;5;66;03m# 准备验证数据\u001b[39;00m\n\u001b[1;32m--> 178\u001b[0m val_data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprepare_validation_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 180\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m🔄 开始分批增量训练...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 181\u001b[0m total_start_time \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n",
"Cell \u001b[1;32mIn[12], line 59\u001b[0m, in \u001b[0;36mSmartBatchTrainer.prepare_validation_data\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 55\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 56\u001b[0m \u001b[38;5;124;03m准备验证数据(仅PCA,保持原始分布)\u001b[39;00m\n\u001b[0;32m 57\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 58\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m🔄 准备验证数据...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m---> 59\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mX_val, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39my_val \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpipeline\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstep3_process_data\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mval\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mapply_sampling\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[0;32m 60\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mX_val \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 61\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m无法加载验证数据\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
"Cell \u001b[1;32mIn[9], line 80\u001b[0m, in \u001b[0;36mstep3_process_data\u001b[1;34m(self, data_type, apply_sampling)\u001b[0m\n\u001b[0;32m 77\u001b[0m all_features \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m 78\u001b[0m all_labels \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m---> 80\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m trials_batch, filename \u001b[38;5;129;01min\u001b[39;00m load_data_batch(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdata_dir, data_type, \u001b[38;5;241m3000\u001b[39m):\n\u001b[0;32m 81\u001b[0m features, labels \u001b[38;5;241m=\u001b[39m extract_features_labels_batch(trials_batch)\n\u001b[0;32m 83\u001b[0m \u001b[38;5;66;03m# 应用采样策略\u001b[39;00m\n",
"Cell \u001b[1;32mIn[5], line 30\u001b[0m, in \u001b[0;36mload_data_batch\u001b[1;34m(data_dir, data_type, max_samples_per_file)\u001b[0m\n\u001b[0;32m 27\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m 正在加载文件 \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile_idx\u001b[38;5;241m+\u001b[39m\u001b[38;5;241m1\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(files)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mf\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 29\u001b[0m data \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mload(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(data_dir, f), allow_pickle\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m---> 30\u001b[0m trials \u001b[38;5;241m=\u001b[39m \u001b[43mdata\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mneural_logits_concatenated\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[0;32m 32\u001b[0m \u001b[38;5;66;03m# 限制每个文件的样本数\u001b[39;00m\n\u001b[0;32m 33\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(trials) \u001b[38;5;241m>\u001b[39m max_samples_per_file \u001b[38;5;129;01mand\u001b[39;00m max_samples_per_file \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m:\n",
"File \u001b[1;32md:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\numpy\\lib\\_npyio_impl.py:258\u001b[0m, in \u001b[0;36mNpzFile.__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 256\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m magic \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mformat\u001b[39m\u001b[38;5;241m.\u001b[39mMAGIC_PREFIX:\n\u001b[0;32m 257\u001b[0m \u001b[38;5;28mbytes\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mzip\u001b[38;5;241m.\u001b[39mopen(key)\n\u001b[1;32m--> 258\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mformat\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_array\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mbytes\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 259\u001b[0m \u001b[43m \u001b[49m\u001b[43mallow_pickle\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mallow_pickle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 260\u001b[0m \u001b[43m \u001b[49m\u001b[43mpickle_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpickle_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 261\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_header_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_header_size\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 262\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 263\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mzip\u001b[38;5;241m.\u001b[39mread(key)\n",
"File \u001b[1;32md:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\numpy\\lib\\format.py:827\u001b[0m, in \u001b[0;36mread_array\u001b[1;34m(fp, allow_pickle, pickle_kwargs, max_header_size)\u001b[0m\n\u001b[0;32m 825\u001b[0m pickle_kwargs \u001b[38;5;241m=\u001b[39m {}\n\u001b[0;32m 826\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 827\u001b[0m array \u001b[38;5;241m=\u001b[39m pickle\u001b[38;5;241m.\u001b[39mload(fp, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mpickle_kwargs)\n\u001b[0;32m 828\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mUnicodeError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[0;32m 829\u001b[0m \u001b[38;5;66;03m# Friendlier error message\u001b[39;00m\n\u001b[0;32m 830\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mUnicodeError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnpickling a python object failed: \u001b[39m\u001b[38;5;132;01m%r\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 831\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYou may need to pass the encoding= option \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 832\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mto numpy.load\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m (err,)) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01merr\u001b[39;00m\n",
"File \u001b[1;32md:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\zipfile.py:890\u001b[0m, in \u001b[0;36mZipExtFile.peek\u001b[1;34m(self, n)\u001b[0m\n\u001b[0;32m 888\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Returns buffered bytes without advancing the position.\"\"\"\u001b[39;00m\n\u001b[0;32m 889\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m n \u001b[38;5;241m>\u001b[39m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_readbuffer) \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_offset:\n\u001b[1;32m--> 890\u001b[0m chunk \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mn\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 891\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(chunk) \u001b[38;5;241m>\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_offset:\n\u001b[0;32m 892\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_readbuffer \u001b[38;5;241m=\u001b[39m chunk \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_readbuffer[\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_offset:]\n",
"File \u001b[1;32md:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\zipfile.py:930\u001b[0m, in \u001b[0;36mZipExtFile.read\u001b[1;34m(self, n)\u001b[0m\n\u001b[0;32m 928\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_offset \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[0;32m 929\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m n \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_eof:\n\u001b[1;32m--> 930\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_read1\u001b[49m\u001b[43m(\u001b[49m\u001b[43mn\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 931\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m n \u001b[38;5;241m<\u001b[39m \u001b[38;5;28mlen\u001b[39m(data):\n\u001b[0;32m 932\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_readbuffer \u001b[38;5;241m=\u001b[39m data\n",
"File \u001b[1;32md:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\zipfile.py:1006\u001b[0m, in \u001b[0;36mZipExtFile._read1\u001b[1;34m(self, n)\u001b[0m\n\u001b[0;32m 1004\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compress_type \u001b[38;5;241m==\u001b[39m ZIP_DEFLATED:\n\u001b[0;32m 1005\u001b[0m n \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mmax\u001b[39m(n, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mMIN_READ_SIZE)\n\u001b[1;32m-> 1006\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_decompressor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecompress\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mn\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1007\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_eof \u001b[38;5;241m=\u001b[39m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_decompressor\u001b[38;5;241m.\u001b[39meof \u001b[38;5;129;01mor\u001b[39;00m\n\u001b[0;32m 1008\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compress_left \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m\n\u001b[0;32m 1009\u001b[0m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_decompressor\u001b[38;5;241m.\u001b[39munconsumed_tail)\n\u001b[0;32m 1010\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_eof:\n",
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"print(\"🔥 开始智能分批训练!\")\n",
"print(\"=\" * 80)\n",
"\n",
"# 训练参数\n",
"TRAINING_PARAMS = {\n",
"    'num_boost_round': 300,      # 每批次的提升轮数\n",
"    'early_stopping_rounds': 15  # 早停轮数\n",
"}\n",
"\n",
"print(f\"📝 训练配置:\")\n",
"print(f\" 训练轮数: {TRAINING_PARAMS['num_boost_round']}\")\n",
"print(f\" 早停轮数: {TRAINING_PARAMS['early_stopping_rounds']}\")\n",
"print(f\" 数据平衡: 启用(下采样标签0,40 + 过采样少数类)\")\n",
"print(f\" PCA降维: 7168 → {pipeline.pca_components} 特征\")\n",
"\n",
"print(f\"\\n🚀 启动训练...\")\n",
"\n",
"# 开始训练\n",
"model = trainer.train_incremental(\n",
"    num_boost_round=TRAINING_PARAMS['num_boost_round'],\n",
"    early_stopping_rounds=TRAINING_PARAMS['early_stopping_rounds']\n",
")"
]
},
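{
"cell_type": "markdown",
"metadata": {},
"source": [
"> Added note: the cell above delegates to `trainer.train_incremental`. A minimal, self-contained sketch of the underlying idea — continuing boosting across data batches via LightGBM's `init_model` — follows. `iter_batches` and its synthetic data are illustrative assumptions, not the notebook's actual pipeline."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: batch-wise incremental LightGBM training with init_model.\n",
"# iter_batches() is a hypothetical stand-in for the real batched data loader.\n",
"import lightgbm as lgb\n",
"import numpy as np\n",
"\n",
"def iter_batches(n_batches=3, n=256, d=8, n_classes=41):\n",
"    rng = np.random.default_rng(0)\n",
"    for _ in range(n_batches):\n",
"        yield rng.normal(size=(n, d)), rng.integers(0, n_classes, size=n)\n",
"\n",
"booster = None\n",
"for X_batch, y_batch in iter_batches():\n",
"    train_set = lgb.Dataset(X_batch, label=y_batch)\n",
"    booster = lgb.train(\n",
"        {'objective': 'multiclass', 'num_class': 41, 'verbosity': -1},\n",
"        train_set,\n",
"        num_boost_round=10,\n",
"        init_model=booster,  # continue from the trees built on earlier batches\n",
"        keep_training_booster=True\n",
"    )\n",
"print('total trees:', booster.num_trees())"
]
},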
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 📊 训练结果分析"
]
},
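{
"cell_type": "markdown",
"metadata": {},
"source": [
"> Added sketch: this heading has no analysis cell attached. One way to visualize the per-label results produced by `evaluate_model_performance` (defined in the evaluation section below) is a bar chart over its `label_accuracies` dict. The values used here are placeholders copied from the recorded validation output, not a live computation."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: plot per-label accuracy from a label_accuracies-style dict.\n",
"# Placeholder values are taken from the recorded evaluation output below.\n",
"import matplotlib.pyplot as plt\n",
"\n",
"label_accuracies = {0: 0.806, 6: 0.191, 16: 0.174, 40: 0.567}\n",
"labels, accs = zip(*sorted(label_accuracies.items()))\n",
"plt.bar([str(l) for l in labels], accs)\n",
"plt.xlabel('label')\n",
"plt.ylabel('accuracy')\n",
"plt.title('Per-label validation accuracy (placeholder values)')\n",
"plt.show()"
]
},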
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 🧪 模型性能评估"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"🧪 开始模型性能评估...\n",
"🧪 评估模型在val数据集上的性能...\n",
"\n",
"🔄 步骤3: 处理val数据...\n",
" 采样策略: 禁用\n",
" 正在加载文件 1/41: t15.2023.08.13_val_concatenated.npz\n",
" 正在加载文件 2/41: t15.2023.08.18_val_concatenated.npz\n",
" 正在加载文件 3/41: t15.2023.08.20_val_concatenated.npz\n",
" 正在加载文件 4/41: t15.2023.08.25_val_concatenated.npz\n",
" 正在加载文件 5/41: t15.2023.08.27_val_concatenated.npz\n",
" 正在加载文件 6/41: t15.2023.09.01_val_concatenated.npz\n",
" 正在加载文件 7/41: t15.2023.09.03_val_concatenated.npz\n",
" 正在加载文件 8/41: t15.2023.09.24_val_concatenated.npz\n",
" 正在加载文件 9/41: t15.2023.09.29_val_concatenated.npz\n",
" 正在加载文件 10/41: t15.2023.10.01_val_concatenated.npz\n",
" 正在加载文件 11/41: t15.2023.10.06_val_concatenated.npz\n",
" 正在加载文件 12/41: t15.2023.10.08_val_concatenated.npz\n",
" 正在加载文件 13/41: t15.2023.10.13_val_concatenated.npz\n",
" 正在加载文件 14/41: t15.2023.10.15_val_concatenated.npz\n",
" 正在加载文件 15/41: t15.2023.10.20_val_concatenated.npz\n",
" 正在加载文件 16/41: t15.2023.10.22_val_concatenated.npz\n",
" 正在加载文件 17/41: t15.2023.11.03_val_concatenated.npz\n",
" 正在加载文件 18/41: t15.2023.11.04_val_concatenated.npz\n",
" 正在加载文件 19/41: t15.2023.11.17_val_concatenated.npz\n",
" 正在加载文件 20/41: t15.2023.11.19_val_concatenated.npz\n",
" 正在加载文件 21/41: t15.2023.11.26_val_concatenated.npz\n",
" 正在加载文件 22/41: t15.2023.12.03_val_concatenated.npz\n",
" 正在加载文件 23/41: t15.2023.12.08_val_concatenated.npz\n",
" 正在加载文件 24/41: t15.2023.12.10_val_concatenated.npz\n",
" 正在加载文件 25/41: t15.2023.12.17_val_concatenated.npz\n",
" 正在加载文件 26/41: t15.2023.12.29_val_concatenated.npz\n",
" 正在加载文件 27/41: t15.2024.02.25_val_concatenated.npz\n",
" 正在加载文件 28/41: t15.2024.03.08_val_concatenated.npz\n",
" 正在加载文件 29/41: t15.2024.03.15_val_concatenated.npz\n",
" 正在加载文件 30/41: t15.2024.03.17_val_concatenated.npz\n",
" 正在加载文件 31/41: t15.2024.05.10_val_concatenated.npz\n",
" 正在加载文件 32/41: t15.2024.06.14_val_concatenated.npz\n",
" 正在加载文件 33/41: t15.2024.07.19_val_concatenated.npz\n",
" 正在加载文件 34/41: t15.2024.07.21_val_concatenated.npz\n",
" 正在加载文件 35/41: t15.2024.07.28_val_concatenated.npz\n",
" 正在加载文件 36/41: t15.2025.01.10_val_concatenated.npz\n",
" 正在加载文件 37/41: t15.2025.01.12_val_concatenated.npz\n",
" 正在加载文件 38/41: t15.2025.03.14_val_concatenated.npz\n",
" 正在加载文件 39/41: t15.2025.03.16_val_concatenated.npz\n",
" 正在加载文件 40/41: t15.2025.03.30_val_concatenated.npz\n",
" 正在加载文件 41/41: t15.2025.04.13_val_concatenated.npz\n",
" ✅ 处理完成: 321,773 样本, 1219 特征\n",
" 📊 数据集大小: 321,773 样本, 1219 特征\n",
" ⏱️ 预测时间: 377.15秒\n",
" 🎯 整体准确率: 0.6695\n",
"\n",
"📊 标签分布对比:\n",
"标签 | 真实数量 | 预测数量 | 准确率\n",
"----------------------------------------\n",
"   0 |  238,705 |  216,091 |   0.806\n",
"   1 |      707 |      241 |   0.008\n",
"   2 |      787 |      238 |   0.009\n",
"   3 |    4,019 |    1,203 |   0.023\n",
"   4 |      612 |      286 |   0.026\n",
"   5 |      280 |       13 |   0.007\n",
"   6 |    1,102 |    2,519 |   0.191\n",
"   7 |      708 |      391 |   0.020\n",
"   8 |      257 |       10 |   0.000\n",
"   9 |    2,072 |    1,548 |   0.037\n",
"  10 |    1,562 |    1,744 |   0.111\n",
"  11 |    1,012 |      600 |   0.039\n",
"  12 |      991 |      356 |   0.008\n",
"  13 |      621 |      200 |   0.011\n",
"  14 |      818 |      408 |   0.013\n",
"  15 |      451 |      202 |   0.007\n",
"  16 |      794 |    3,307 |   0.174\n",
"  17 |    2,698 |    1,757 |   0.039\n",
"  18 |    1,926 |    2,019 |   0.046\n",
"  19 |      274 |       11 |   0.000\n",
"  20 |    1,676 |    2,640 |   0.063\n",
"  21 |    2,425 |    3,135 |   0.091\n",
"  22 |    1,354 |    1,625 |   0.061\n",
"  23 |    2,440 |    1,076 |   0.017\n",
"  24 |      656 |      143 |   0.009\n",
"  25 |      474 |       99 |   0.004\n",
"  26 |      221 |        4 |   0.000\n",
"  27 |    1,102 |    1,437 |   0.083\n",
"  28 |    2,416 |    4,056 |   0.119\n",
"  29 |    3,002 |    2,222 |   0.046\n",
"  30 |      251 |       10 |   0.004\n",
"  31 |    4,039 |    8,386 |   0.122\n",
"  32 |      413 |      142 |   0.046\n",
"  33 |      185 |        1 |   0.000\n",
"  34 |    1,028 |    1,062 |   0.090\n",
"  35 |      820 |      162 |   0.017\n",
"  36 |    1,310 |      290 |   0.015\n",
"  37 |      609 |      841 |   0.128\n",
"  38 |    1,429 |    1,365 |   0.082\n",
"  39 |      102 |        3 |   0.000\n",
"  40 |   35,425 |   59,930 |   0.567\n",
"\n",
"🔍 关键标签性能分析:\n",
" 标签 0 (下采样目标): 准确率 0.8065, 样本数 238,705\n",
" 标签 40 (下采样目标): 准确率 0.5673, 样本数 35,425\n",
" 少数类平均准确率 (前5个): 0.0000\n",
"\n",
"📈 预测置信度分析:\n",
" 平均置信度: 0.6461\n",
" 置信度中位数: 0.6850\n",
" 高置信度预测 (>0.9): 104,643 / 321,773 (32.52%)\n",
"\n",
"============================================================\n",
"🎉 智能分批训练+数据平衡 评估完成!\n",
"✅ 实现了数据平衡和PCA降维的完整流程\n",
"✅ 使用了内存友好的分批训练策略\n",
"✅ 保持了验证集的原始分布以确保评估客观性\n"
]
}
],
"source": [
"# 🧪 模型性能评估\n",
"\n",
"from sklearn.metrics import classification_report, confusion_matrix\n",
"import numpy as np\n",
"import time\n",
"\n",
"def evaluate_model_performance(model, pipeline, data_type='val'):\n",
"    \"\"\"\n",
"    评估模型在指定数据集上的性能\n",
"    \"\"\"\n",
"    print(f\"🧪 评估模型在{data_type}数据集上的性能...\")\n",
"    \n",
"    # 加载数据\n",
"    X, y = pipeline.step3_process_data(data_type, apply_sampling=False)\n",
"    \n",
"    if X is None or y is None:\n",
"        print(f\"❌ 无法加载{data_type}数据\")\n",
"        return None\n",
"    \n",
"    print(f\" 📊 数据集大小: {X.shape[0]:,} 样本, {X.shape[1]} 特征\")\n",
"    \n",
"    # 预测\n",
"    start_time = time.time()\n",
"    y_pred_proba = model.predict(X)\n",
"    y_pred = y_pred_proba.argmax(axis=1)\n",
"    pred_time = time.time() - start_time\n",
"    \n",
"    # 计算性能指标\n",
"    accuracy = (y_pred == y).mean()\n",
"    \n",
"    print(f\" ⏱️ 预测时间: {pred_time:.2f}秒\")\n",
"    print(f\" 🎯 整体准确率: {accuracy:.4f}\")\n",
"    \n",
"    # 分析各类别性能\n",
"    from collections import Counter\n",
"    true_counts = Counter(y)\n",
"    pred_counts = Counter(y_pred)\n",
"    \n",
"    print(f\"\\n📊 标签分布对比:\")\n",
"    print(\"标签 | 真实数量 | 预测数量 | 准确率\")\n",
"    print(\"-\" * 40)\n",
"    \n",
"    label_accuracies = {}\n",
"    for label in range(41):\n",
"        if label in true_counts:\n",
"            label_mask = (y == label)\n",
"            if label_mask.sum() > 0:\n",
"                label_acc = (y_pred[label_mask] == label).mean()\n",
"                label_accuracies[label] = label_acc\n",
"                true_count = true_counts.get(label, 0)\n",
"                pred_count = pred_counts.get(label, 0)\n",
"                print(f\"{label:4d} | {true_count:8,} | {pred_count:8,} | {label_acc:7.3f}\")\n",
"    \n",
"    # 重点分析关键标签\n",
"    print(f\"\\n🔍 关键标签性能分析:\")\n",
"    key_labels = [0, 40]  # 下采样的标签\n",
"    for label in key_labels:\n",
"        if label in label_accuracies:\n",
"            acc = label_accuracies[label]\n",
"            count = true_counts.get(label, 0)\n",
"            print(f\" 标签 {label} (下采样目标): 准确率 {acc:.4f}, 样本数 {count:,}\")\n",
"    \n",
"    # 少数类性能\n",
"    minority_labels = [label for label, count in true_counts.items() \n",
"                       if count < 200 and label not in [0, 40]]\n",
"    if minority_labels:\n",
"        minority_accs = [label_accuracies.get(label, 0) for label in minority_labels[:5]]\n",
"        avg_minority_acc = np.mean(minority_accs) if minority_accs else 0\n",
"        print(f\" 少数类平均准确率 (前5个): {avg_minority_acc:.4f}\")\n",
"    \n",
"    # 置信度分析\n",
"    max_proba = y_pred_proba.max(axis=1)\n",
"    print(f\"\\n📈 预测置信度分析:\")\n",
"    print(f\" 平均置信度: {max_proba.mean():.4f}\")\n",
"    print(f\" 置信度中位数: {np.median(max_proba):.4f}\")\n",
"    print(f\" 高置信度预测 (>0.9): {(max_proba > 0.9).sum():,} / {len(max_proba):,} ({(max_proba > 0.9).mean():.2%})\")\n",
"    \n",
"    return {\n",
"        'accuracy': accuracy,\n",
"        'prediction_time': pred_time,\n",
"        'label_accuracies': label_accuracies,\n",
"        'confidence_stats': {\n",
"            'mean': max_proba.mean(),\n",
"            'median': np.median(max_proba),\n",
"            'high_confidence_ratio': (max_proba > 0.9).mean()\n",
"        }\n",
"    }\n",
"\n",
"# 评估模型性能\n",
"if trainer.model:\n",
"    print(\"🧪 开始模型性能评估...\")\n",
"    \n",
"    # 验证集评估\n",
"    val_results = evaluate_model_performance(trainer.model, pipeline, 'val')\n",
"    \n",
"    print(f\"\\n\" + \"=\"*60)\n",
"    print(\"🎉 智能分批训练+数据平衡 评估完成!\")\n",
"    print(f\"✅ 实现了数据平衡和PCA降维的完整流程\")\n",
"    print(f\"✅ 使用了内存友好的分批训练策略\")\n",
"    print(f\"✅ 保持了验证集的原始分布以确保评估客观性\")\n",
"else:\n",
"    print(\"❌ 模型尚未训练完成,请等待训练结束后运行此评估\")"
]
},
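{
"cell_type": "markdown",
"metadata": {},
"source": [
"> Added note: the evaluation cell imports `classification_report` and `confusion_matrix` but never calls them. A small sketch of how `classification_report` could summarize per-class precision/recall follows; the arrays here are random placeholders standing in for `y` and `y_pred` inside `evaluate_model_performance`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: per-class precision/recall with scikit-learn.\n",
"import numpy as np\n",
"from sklearn.metrics import classification_report\n",
"\n",
"rng = np.random.default_rng(0)\n",
"y_true = rng.integers(0, 41, size=1000)  # placeholder for the real labels\n",
"y_pred = rng.integers(0, 41, size=1000)  # placeholder for model predictions\n",
"print(classification_report(y_true, y_pred, zero_division=0))"
]
},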
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"🔍 开始在少量数据集上进行参数搜索...\n",
"================================================================================\n",
"📊 参数搜索配置:\n",
" 数据采样比例: 10.0%\n",
" 交叉验证折数: 3\n",
" 最大参数组合: 20\n",
" 参数空间大小: 729 种组合\n",
"\n",
"📦 准备少量数据集...\n"
]
},
{
"ename": "AttributeError",
"evalue": "'SmartDataPipeline' object has no attribute 'get_sample_data'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[19], line 37\u001b[0m\n\u001b[0;32m 34\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m📦 准备少量数据集...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 36\u001b[0m \u001b[38;5;66;03m# 使用已有的pipeline获取数据\u001b[39;00m\n\u001b[1;32m---> 37\u001b[0m sample_X, sample_y \u001b[38;5;241m=\u001b[39m \u001b[43mpipeline\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_sample_data\u001b[49m(ratio\u001b[38;5;241m=\u001b[39mPARAM_SEARCH_CONFIG[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124msample_ratio\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[0;32m 39\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m 样本数量: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(sample_X)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 40\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m 特征维度: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msample_X\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m1\u001b[39m]\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mif\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28mhasattr\u001b[39m(sample_X,\u001b[38;5;250m \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mshape\u001b[39m\u001b[38;5;124m'\u001b[39m)\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01melse\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mUnknown\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
"\u001b[1;31mAttributeError\u001b[0m: 'SmartDataPipeline' object has no attribute 'get_sample_data'"
]
}
],
"source": [
"print(\"🔍 开始在少量数据集上进行参数搜索...\")\n",
"print(\"=\" * 80)\n",
"\n",
"import itertools\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.metrics import accuracy_score\n",
"import lightgbm as lgb\n",
"import numpy as np\n",
"\n",
"# 参数搜索配置\n",
"PARAM_SEARCH_CONFIG = {\n",
"    'sample_ratio': 0.1,     # 使用10%的数据进行参数搜索\n",
"    'cv_folds': 3,           # 3折交叉验证\n",
"    'max_combinations': 20   # 最多测试20种参数组合\n",
"}\n",
"\n",
"# 定义参数搜索空间\n",
"param_grid = {\n",
"    'learning_rate': [0.05, 0.1, 0.15],\n",
"    'num_leaves': [31, 63, 127],\n",
"    'feature_fraction': [0.8, 0.9, 1.0],\n",
"    'bagging_fraction': [0.8, 0.9, 1.0],\n",
"    'max_depth': [6, 8, 10],\n",
"    'min_data_in_leaf': [10, 20, 30]\n",
"}\n",
"\n",
"print(f\"📊 参数搜索配置:\")\n",
"print(f\" 数据采样比例: {PARAM_SEARCH_CONFIG['sample_ratio']*100}%\")\n",
"print(f\" 交叉验证折数: {PARAM_SEARCH_CONFIG['cv_folds']}\")\n",
"print(f\" 最大参数组合: {PARAM_SEARCH_CONFIG['max_combinations']}\")\n",
"print(f\" 参数空间大小: {np.prod([len(v) for v in param_grid.values()])} 种组合\")\n",
"\n",
"# 获取少量数据用于参数搜索\n",
"print(f\"\\n📦 准备少量数据集...\")\n",
"\n",
"# 从验证集获取少量数据进行参数搜索\n",
"import os\n",
"data_dir = 'data/concatenated_data'\n",
"val_files = [f for f in os.listdir(data_dir) if f.endswith('_val_concatenated.npz')]\n",
"\n",
"# 只使用前几个文件进行快速参数搜索\n",
"sample_files = val_files[:max(1, int(len(val_files) * PARAM_SEARCH_CONFIG['sample_ratio']))]\n",
"print(f\" 使用文件数: {len(sample_files)} / {len(val_files)}\")\n",
"\n",
"# 加载样本数据\n",
"sample_X_list = []\n",
"sample_y_list = []\n",
"\n",
"for file in sample_files[:3]:  # 最多使用3个文件\n",
"    file_path = os.path.join(data_dir, file)\n",
"    try:\n",
"        data = np.load(file_path)\n",
"        features = data['features']\n",
"        labels = data['labels']\n",
"        \n",
"        # 进一步采样以减少数据量\n",
"        n_samples = min(2000, len(features))  # 每个文件最多2000样本\n",
"        indices = np.random.choice(len(features), n_samples, replace=False)\n",
"        \n",
"        sample_X_list.append(features[indices])\n",
"        sample_y_list.append(labels[indices])\n",
"        \n",
"        print(f\" 加载文件: {file} - {n_samples} 样本\")\n",
"    except Exception as e:\n",
"        print(f\" ⚠️ 加载失败: {file} - {e}\")\n",
"\n",
"if len(sample_X_list) == 0:\n",
"    raise ValueError(\"无法加载任何数据文件进行参数搜索\")\n",
"\n",
"# 合并数据\n",
"sample_X = np.vstack(sample_X_list)\n",
"sample_y = np.hstack(sample_y_list)\n",
"\n",
"# 应用PCA变换\n",
"if hasattr(pipeline, 'pca_components') and GLOBAL_PCA['is_fitted']:\n",
"    sample_X = apply_pca_transform(sample_X)\n",
"\n",
"print(f\" 总样本数量: {len(sample_X)}\")\n",
"print(f\" 特征维度: {sample_X.shape[1]}\")\n",
"print(f\" 标签分布: {np.bincount(sample_y)[:5]}... (前5个标签)\")\n",
"\n",
"# 生成参数组合\n",
"print(f\"\\n🎯 生成参数组合...\")\n",
"param_names = list(param_grid.keys())\n",
"param_values = list(param_grid.values())\n",
"\n",
"# 随机采样参数组合\n",
"np.random.seed(42)\n",
"all_combinations = list(itertools.product(*param_values))\n",
"np.random.shuffle(all_combinations)\n",
"selected_combinations = all_combinations[:PARAM_SEARCH_CONFIG['max_combinations']]\n",
"\n",
"print(f\" 实际测试组合数: {len(selected_combinations)}\")\n",
"\n",
"# 参数搜索函数\n",
"def evaluate_params(params_dict, X, y, cv_folds=3):\n",
"    \"\"\"评估参数组合的性能\"\"\"\n",
"    try:\n",
"        skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)\n",
"        scores = []\n",
"        \n",
"        for train_idx, val_idx in skf.split(X, y):\n",
"            X_train_fold, X_val_fold = X[train_idx], X[val_idx]\n",
"            y_train_fold, y_val_fold = y[train_idx], y[val_idx]\n",
"            \n",
"            # 创建LightGBM数据集\n",
"            train_data = lgb.Dataset(X_train_fold, label=y_train_fold)\n",
"            val_data = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data)\n",
"            \n",
"            # 训练模型\n",
"            model = lgb.train(\n",
"                params_dict,\n",
"                train_data,\n",
"                valid_sets=[val_data],\n",
"                num_boost_round=50,  # 少量轮数快速评估\n",
"                callbacks=[lgb.early_stopping(10), lgb.log_evaluation(0)]\n",
"            )\n",
"            \n",
"            # 预测和评估\n",
"            y_pred = model.predict(X_val_fold)\n",
"            y_pred_binary = (y_pred > 0.5).astype(int)\n",
"            score = accuracy_score(y_val_fold, y_pred_binary)\n",
"            scores.append(score)\n",
"        \n",
"        return np.mean(scores), np.std(scores)\n",
"    except Exception as e:\n",
"        print(f\" ⚠️ 参数组合评估失败: {e}\")\n",
"        return 0.0, 1.0\n",
"\n",
"# 开始参数搜索\n",
"print(f\"\\n🚀 开始参数搜索...\")\n",
"best_score = 0\n",
"best_params = None\n",
"best_std = 1.0\n",
"results = []\n",
"\n",
"for i, combination in enumerate(selected_combinations):\n",
"    params_dict = dict(zip(param_names, combination))\n",
"    \n",
"    # 添加固定参数\n",
"    params_dict.update({\n",
"        'objective': 'binary',\n",
"        'metric': 'binary_logloss',\n",
"        'boosting_type': 'gbdt',\n",
"        'verbosity': -1,\n",
"        'seed': 42\n",
"    })\n",
"    \n",
"    print(f\"\\n🔧 测试组合 {i+1}/{len(selected_combinations)}:\")\n",
"    print(f\" 参数: {params_dict}\")\n",
"    \n",
"    # 评估参数\n",
"    mean_score, std_score = evaluate_params(params_dict, sample_X, sample_y, PARAM_SEARCH_CONFIG['cv_folds'])\n",
"    \n",
"    results.append({\n",
"        'params': params_dict.copy(),\n",
"        'mean_score': mean_score,\n",
"        'std_score': std_score\n",
"    })\n",
"    \n",
"    print(f\" 性能: {mean_score:.4f} ± {std_score:.4f}\")\n",
"    \n",
"    # 更新最佳参数\n",
"    if mean_score > best_score:\n",
"        best_score = mean_score\n",
"        best_params = params_dict.copy()\n",
"        best_std = std_score\n",
"        print(f\" ✨ 新的最佳参数!\")\n",
"\n",
"print(f\"\\n🏆 参数搜索完成!\")\n",
"print(f\"=\" * 80)\n",
"print(f\"🎯 最佳参数组合:\")\n",
"for key, value in best_params.items():\n",
"    if key not in ['objective', 'metric', 'boosting_type', 'verbosity', 'seed']:\n",
"        print(f\" {key}: {value}\")\n",
"\n",
"print(f\"\\n📈 最佳性能: {best_score:.4f} ± {best_std:.4f}\")\n",
"\n",
"# 保存结果\n",
"print(f\"\\n💾 保存参数搜索结果...\")\n",
"BEST_PARAMS = best_params\n",
"PARAM_SEARCH_RESULTS = results\n",
"\n",
"print(f\" 最佳参数已保存到变量: BEST_PARAMS\")\n",
"print(f\" 所有结果已保存到变量: PARAM_SEARCH_RESULTS\")\n",
"print(f\" 共测试了 {len(results)} 种参数组合\")\n",
"\n",
"# 显示前5个最佳结果\n",
"print(f\"\\n🔝 Top 5 参数组合:\")\n",
"sorted_results = sorted(results, key=lambda x: x['mean_score'], reverse=True)\n",
"for i, result in enumerate(sorted_results[:5]):\n",
"    print(f\" {i+1}. 分数: {result['mean_score']:.4f} ± {result['std_score']:.4f}\")\n",
"    key_params = {k: v for k, v in result['params'].items() \n",
"                  if k not in ['objective', 'metric', 'boosting_type', 'verbosity', 'seed']}\n",
"    print(f\" 参数: {key_params}\")\n",
"\n",
"print(f\"\\n✅ 参数搜索完成!可以使用 BEST_PARAMS 进行后续训练\")"
]
},
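{
"cell_type": "markdown",
"metadata": {},
"source": [
"> Added note: the search above materializes the full Cartesian product and shuffles it. `sklearn.model_selection.ParameterSampler` expresses the same random subsetting directly, without building all 729 combinations first — a sketch:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: random sampling of a parameter grid with ParameterSampler.\n",
"from sklearn.model_selection import ParameterSampler\n",
"\n",
"param_grid = {\n",
"    'learning_rate': [0.05, 0.1, 0.15],\n",
"    'num_leaves': [31, 63, 127],\n",
"    'max_depth': [6, 8, 10],\n",
"}\n",
"for params in ParameterSampler(param_grid, n_iter=5, random_state=42):\n",
"    print(params)"
]
},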
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"🔍 重新开始参数搜索(修复版)...\n",
"================================================================================\n",
"📊 参数搜索配置:\n",
" 数据采样比例: 10.0%\n",
" 交叉验证折数: 3\n",
" 最大参数组合: 10\n",
" 参数空间大小: 48 种组合\n",
"\n",
"📦 准备少量数据集...\n"
]
},
{
"ename": "FileNotFoundError",
"evalue": "[WinError 3] 系统找不到指定的路径。: 'data/concatenated_data'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[20], line 38\u001b[0m\n\u001b[0;32m 36\u001b[0m \u001b[38;5;66;03m# 从验证集获取少量数据进行参数搜索\u001b[39;00m\n\u001b[0;32m 37\u001b[0m data_dir \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata/concatenated_data\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m---> 38\u001b[0m val_files \u001b[38;5;241m=\u001b[39m [f \u001b[38;5;28;01mfor\u001b[39;00m f \u001b[38;5;129;01min\u001b[39;00m \u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlistdir\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m f\u001b[38;5;241m.\u001b[39mendswith(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m_val_concatenated.npz\u001b[39m\u001b[38;5;124m'\u001b[39m)]\n\u001b[0;32m 40\u001b[0m \u001b[38;5;66;03m# 只使用前2个文件进行快速参数搜索\u001b[39;00m\n\u001b[0;32m 41\u001b[0m sample_files \u001b[38;5;241m=\u001b[39m val_files[:\u001b[38;5;241m2\u001b[39m]\n",
"\u001b[1;31mFileNotFoundError\u001b[0m: [WinError 3] 系统找不到指定的路径。: 'data/concatenated_data'"
]
}
],
"source": [
"print(\"🔍 重新开始参数搜索(修复版)...\")\n",
"print(\"=\" * 80)\n",
"\n",
"import itertools\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.metrics import accuracy_score\n",
"import lightgbm as lgb\n",
"import numpy as np\n",
"import os\n",
"\n",
"# 参数搜索配置\n",
"PARAM_SEARCH_CONFIG = {\n",
"    'sample_ratio': 0.1,     # 使用10%的数据进行参数搜索\n",
"    'cv_folds': 3,           # 3折交叉验证\n",
"    'max_combinations': 10   # 减少到10种参数组合以加快速度\n",
"}\n",
"\n",
"# 定义参数搜索空间(简化版)\n",
"param_grid = {\n",
"    'learning_rate': [0.05, 0.1, 0.15],\n",
"    'num_leaves': [31, 63],\n",
"    'feature_fraction': [0.8, 1.0],\n",
"    'bagging_fraction': [0.8, 1.0],\n",
"    'max_depth': [6, 8],\n",
"}\n",
"\n",
"print(f\"📊 参数搜索配置:\")\n",
"print(f\" 数据采样比例: {PARAM_SEARCH_CONFIG['sample_ratio']*100}%\")\n",
"print(f\" 交叉验证折数: {PARAM_SEARCH_CONFIG['cv_folds']}\")\n",
"print(f\" 最大参数组合: {PARAM_SEARCH_CONFIG['max_combinations']}\")\n",
"print(f\" 参数空间大小: {np.prod([len(v) for v in param_grid.values()])} 种组合\")\n",
"\n",
"# 获取少量数据用于参数搜索\n",
"print(f\"\\n📦 准备少量数据集...\")\n",
"\n",
"# 从验证集获取少量数据进行参数搜索\n",
"data_dir = 'data/concatenated_data'\n",
"val_files = [f for f in os.listdir(data_dir) if f.endswith('_val_concatenated.npz')]\n",
"\n",
"# 只使用前2个文件进行快速参数搜索\n",
"sample_files = val_files[:2]\n",
"print(f\" 使用文件数: {len(sample_files)} / {len(val_files)}\")\n",
"\n",
"# 加载样本数据\n",
"sample_X_list = []\n",
"sample_y_list = []\n",
"\n",
"for file in sample_files:\n",
"    file_path = os.path.join(data_dir, file)\n",
"    try:\n",
"        data = np.load(file_path)\n",
"        features = data['features']\n",
"        labels = data['labels']\n",
"        \n",
"        # 进一步采样以减少数据量\n",
"        n_samples = min(1000, len(features))  # 每个文件最多1000样本\n",
"        indices = np.random.choice(len(features), n_samples, replace=False)\n",
"        \n",
"        sample_X_list.append(features[indices])\n",
"        sample_y_list.append(labels[indices])\n",
"        \n",
"        print(f\" 加载文件: {file} - {n_samples} 样本\")\n",
"    except Exception as e:\n",
"        print(f\" ⚠️ 加载失败: {file} - {e}\")\n",
"\n",
"if len(sample_X_list) == 0:\n",
"    raise ValueError(\"无法加载任何数据文件进行参数搜索\")\n",
"\n",
"# 合并数据\n",
"sample_X = np.vstack(sample_X_list)\n",
"sample_y = np.hstack(sample_y_list)\n",
"\n",
"# 应用PCA变换(如果已经拟合)\n",
"if GLOBAL_PCA['is_fitted']:\n",
"    sample_X = apply_pca_transform(sample_X)\n",
"\n",
"print(f\" 总样本数量: {len(sample_X)}\")\n",
"print(f\" 特征维度: {sample_X.shape[1]}\")\n",
"\n",
"# 只保留前3个类别以简化问题\n",
"mask = sample_y < 3\n",
"sample_X = sample_X[mask]\n",
"sample_y = sample_y[mask]\n",
"\n",
"print(f\" 简化后样本数量: {len(sample_X)}\")\n",
"print(f\" 标签分布: {np.bincount(sample_y)}\")\n",
"\n",
"# 生成参数组合\n",
"print(f\"\\n🎯 生成参数组合...\")\n",
"param_names = list(param_grid.keys())\n",
"param_values = list(param_grid.values())\n",
"\n",
"# 随机采样参数组合\n",
"np.random.seed(42)\n",
"all_combinations = list(itertools.product(*param_values))\n",
"np.random.shuffle(all_combinations)\n",
"selected_combinations = all_combinations[:PARAM_SEARCH_CONFIG['max_combinations']]\n",
"\n",
"print(f\" 实际测试组合数: {len(selected_combinations)}\")\n",
"\n",
"# 参数搜索函数\n",
"def evaluate_params(params_dict, X, y, cv_folds=3):\n",
"    \"\"\"评估参数组合的性能\"\"\"\n",
"    try:\n",
"        skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)\n",
"        scores = []\n",
"        \n",
"        for train_idx, val_idx in skf.split(X, y):\n",
"            X_train_fold, X_val_fold = X[train_idx], X[val_idx]\n",
"            y_train_fold, y_val_fold = y[train_idx], y[val_idx]\n",
"            \n",
"            # 创建LightGBM数据集\n",
"            train_data = lgb.Dataset(X_train_fold, label=y_train_fold)\n",
"            val_data = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data)\n",
"            \n",
"            # 训练模型\n",
"            model = lgb.train(\n",
"                params_dict,\n",
"                train_data,\n",
"                valid_sets=[val_data],\n",
"                num_boost_round=50,  # 少量轮数快速评估\n",
"                callbacks=[lgb.early_stopping(10), lgb.log_evaluation(0)]\n",
"            )\n",
"            \n",
"            # 预测和评估\n",
"            y_pred = model.predict(X_val_fold)\n",
"            if len(np.unique(y)) > 2:  # 多分类\n",
"                y_pred_class = np.argmax(y_pred, axis=1)\n",
"            else:  # 二分类\n",
"                y_pred_class = (y_pred > 0.5).astype(int)\n",
"            \n",
"            score = accuracy_score(y_val_fold, y_pred_class)\n",
"            scores.append(score)\n",
"        \n",
"        return np.mean(scores), np.std(scores)\n",
"    except Exception as e:\n",
"        print(f\" ⚠️ 参数组合评估失败: {e}\")\n",
"        return 0.0, 1.0\n",
"\n",
"# 开始参数搜索\n",
"print(f\"\\n🚀 开始参数搜索...\")\n",
"best_score = 0\n",
"best_params = None\n",
"best_std = 1.0\n",
"results = []\n",
"\n",
"for i, combination in enumerate(selected_combinations):\n",
"    params_dict = dict(zip(param_names, combination))\n",
"    \n",
"    # 添加固定参数\n",
"    if len(np.unique(sample_y)) > 2:\n",
"        params_dict.update({\n",
"            'objective': 'multiclass',\n",
"            'num_class': len(np.unique(sample_y)),\n",
"            'metric': 'multi_logloss',\n",
"        })\n",
"    else:\n",
"        params_dict.update({\n",
"            'objective': 'binary',\n",
"            'metric': 'binary_logloss',\n",
"        })\n",
"    \n",
"    params_dict.update({\n",
"        'boosting_type': 'gbdt',\n",
"        'verbosity': -1,\n",
"        'seed': 42\n",
"    })\n",
"    \n",
"    print(f\"\\n🔧 测试组合 {i+1}/{len(selected_combinations)}:\")\n",
"    key_params = {k: v for k, v in params_dict.items() \n",
"                  if k not in ['objective', 'metric', 'boosting_type', 'verbosity', 'seed', 'num_class']}\n",
"    print(f\" 参数: {key_params}\")\n",
"    \n",
"    # 评估参数\n",
"    mean_score, std_score = evaluate_params(params_dict, sample_X, sample_y, PARAM_SEARCH_CONFIG['cv_folds'])\n",
"    \n",
"    results.append({\n",
"        'params': params_dict.copy(),\n",
"        'mean_score': mean_score,\n",
"        'std_score': std_score\n",
"    })\n",
"    \n",
"    print(f\" 性能: {mean_score:.4f} ± {std_score:.4f}\")\n",
"    \n",
"    # 更新最佳参数\n",
"    if mean_score > best_score:\n",
"        best_score = mean_score\n",
"        best_params = params_dict.copy()\n",
"        best_std = std_score\n",
"        print(f\" ✨ 新的最佳参数!\")\n",
"\n",
"print(f\"\\n🏆 参数搜索完成!\")\n",
"print(f\"=\" * 80)\n",
"print(f\"🎯 最佳参数组合:\")\n",
"for key, value in best_params.items():\n",
"    if key not in ['objective', 'metric', 'boosting_type', 'verbosity', 'seed', 'num_class']:\n",
"        print(f\" {key}: {value}\")\n",
"\n",
"print(f\"\\n📈 最佳性能: {best_score:.4f} ± {best_std:.4f}\")\n",
"\n",
"# 保存结果\n",
"BEST_PARAMS = best_params\n",
"PARAM_SEARCH_RESULTS = results\n",
"\n",
"print(f\"\\n💾 参数搜索结果已保存:\")\n",
"print(f\" 最佳参数变量: BEST_PARAMS\")\n",
"print(f\" 所有结果变量: PARAM_SEARCH_RESULTS\")\n",
"print(f\" 共测试了 {len(results)} 种参数组合\")\n",
"\n",
"# 显示前5个最佳结果\n",
"print(f\"\\n🔝 Top 5 参数组合:\")\n",
"sorted_results = sorted(results, key=lambda x: x['mean_score'], reverse=True)\n",
"for i, result in enumerate(sorted_results[:5]):\n",
"    key_params = {k: v for k, v in result['params'].items() \n",
"                  if k not in ['objective', 'metric', 'boosting_type', 'verbosity', 'seed', 'num_class']}\n",
"    print(f\" {i+1}. 分数: {result['mean_score']:.4f} ± {result['std_score']:.4f}\")\n",
"    print(f\" 参数: {key_params}\")\n",
"\n",
"print(f\"\\n✅ 参数搜索完成!可以使用 BEST_PARAMS 进行后续训练\")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"🔍 参数搜索(使用正确路径)...\n",
"================================================================================\n",
"📊 参数搜索配置:\n",
" 交叉验证折数: 3\n",
" 最大参数组合: 8\n",
" 参数空间大小: 16 种组合\n",
"\n",
"📦 准备少量数据集...\n",
" 使用文件数: 2 / 41\n",
" ⚠️ 加载失败: t15.2023.08.13_val_concatenated.npz - 'features is not a file in the archive'\n",
" ⚠️ 加载失败: t15.2023.08.18_val_concatenated.npz - 'features is not a file in the archive'\n"
]
},
{
"ename": "ValueError",
"evalue": "无法加载任何数据文件进行参数搜索",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[21], line 64\u001b[0m\n\u001b[0;32m 61\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m ⚠️ 加载失败: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m - \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 63\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(sample_X_list) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m---> 64\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m无法加载任何数据文件进行参数搜索\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 66\u001b[0m \u001b[38;5;66;03m# 合并数据\u001b[39;00m\n\u001b[0;32m 67\u001b[0m sample_X \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mvstack(sample_X_list)\n",
"\u001b[1;31mValueError\u001b[0m: 无法加载任何数据文件进行参数搜索"
]
}
],
"source": [
"print(\"🔍 参数搜索(使用正确路径)...\")\n",
"print(\"=\" * 80)\n",
"\n",
"import itertools\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.metrics import accuracy_score\n",
"import lightgbm as lgb\n",
"import numpy as np\n",
"import os\n",
"\n",
"# 参数搜索配置\n",
"PARAM_SEARCH_CONFIG = {\n",
"    'cv_folds': 3,          # 3折交叉验证\n",
"    'max_combinations': 8   # 减少到8种参数组合以加快速度\n",
"}\n",
"\n",
"# 定义参数搜索空间(简化版)\n",
"param_grid = {\n",
"    'learning_rate': [0.05, 0.1],\n",
"    'num_leaves': [31, 63],\n",
"    'feature_fraction': [0.8, 1.0],\n",
"    'max_depth': [6, 8],\n",
"}\n",
"\n",
"print(f\"📊 参数搜索配置:\")\n",
"print(f\" 交叉验证折数: {PARAM_SEARCH_CONFIG['cv_folds']}\")\n",
"print(f\" 最大参数组合: {PARAM_SEARCH_CONFIG['max_combinations']}\")\n",
"print(f\" 参数空间大小: {np.prod([len(v) for v in param_grid.values()])} 种组合\")\n",
"\n",
"# 获取少量数据用于参数搜索\n",
"print(f\"\\n📦 准备少量数据集...\")\n",
"\n",
"# 使用绝对路径\n",
"data_dir = r'f:\\BRAIN-TO-TEXT\\nejm-brain-to-text\\data\\concatenated_data'\n",
"val_files = [f for f in os.listdir(data_dir) if f.endswith('_val_concatenated.npz')]\n",
"\n",
"# 只使用前2个文件进行快速参数搜索\n",
"sample_files = val_files[:2]\n",
"print(f\" 使用文件数: {len(sample_files)} / {len(val_files)}\")\n",
"\n",
"# 加载样本数据\n",
"sample_X_list = []\n",
"sample_y_list = []\n",
"\n",
"for file in sample_files:\n",
"    file_path = os.path.join(data_dir, file)\n",
"    try:\n",
"        data = np.load(file_path)\n",
"        features = data['features']\n",
"        labels = data['labels']\n",
"        \n",
"        # 进一步采样以减少数据量\n",
"        n_samples = min(500, len(features))  # 每个文件最多500样本\n",
"        indices = np.random.choice(len(features), n_samples, replace=False)\n",
"        \n",
"        sample_X_list.append(features[indices])\n",
"        sample_y_list.append(labels[indices])\n",
"        \n",
"        print(f\" 加载文件: {file} - {n_samples} 样本\")\n",
"    except Exception as e:\n",
"        print(f\" ⚠️ 加载失败: {file} - {e}\")\n",
"\n",
"if len(sample_X_list) == 0:\n",
"    raise ValueError(\"无法加载任何数据文件进行参数搜索\")\n",
"\n",
"# 合并数据\n",
"sample_X = np.vstack(sample_X_list)\n",
"sample_y = np.hstack(sample_y_list)\n",
"\n",
"# 应用PCA变换(如果已经拟合)\n",
"if GLOBAL_PCA['is_fitted']:\n",
"    sample_X = apply_pca_transform(sample_X)\n",
"\n",
"print(f\" 总样本数量: {len(sample_X)}\")\n",
"print(f\" 特征维度: {sample_X.shape[1]}\")\n",
"\n",
"# 简化为二分类问题:标签0 vs 其他\n",
"sample_y_binary = (sample_y == 0).astype(int)\n",
"print(f\" 二分类标签分布: {np.bincount(sample_y_binary)}\")\n",
"\n",
"# 生成参数组合\n",
"print(f\"\\n🎯 生成参数组合...\")\n",
"param_names = list(param_grid.keys())\n",
"param_values = list(param_grid.values())\n",
"\n",
"# 随机采样参数组合\n",
"np.random.seed(42)\n",
"all_combinations = list(itertools.product(*param_values))\n",
"np.random.shuffle(all_combinations)\n",
"selected_combinations = all_combinations[:PARAM_SEARCH_CONFIG['max_combinations']]\n",
"\n",
"print(f\" 实际测试组合数: {len(selected_combinations)}\")\n",
"\n",
"# 参数搜索函数\n",
"def evaluate_params(params_dict, X, y, cv_folds=3):\n",
"    \"\"\"评估参数组合的性能\"\"\"\n",
"    try:\n",
"        skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)\n",
"        scores = []\n",
"        \n",
"        for train_idx, val_idx in skf.split(X, y):\n",
"            X_train_fold, X_val_fold = X[train_idx], X[val_idx]\n",
"            y_train_fold, y_val_fold = y[train_idx], y[val_idx]\n",
"            \n",
"            # 创建LightGBM数据集\n",
"            train_data = lgb.Dataset(X_train_fold, label=y_train_fold)\n",
"            val_data = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data)\n",
"            \n",
"            # 训练模型\n",
"            model = lgb.train(\n",
"                params_dict,\n",
"                train_data,\n",
"                valid_sets=[val_data],\n",
"                num_boost_round=30,  # 少量轮数快速评估\n",
"                callbacks=[lgb.early_stopping(5), lgb.log_evaluation(0)]\n",
"            )\n",
"            \n",
"            # 预测和评估\n",
"            y_pred = model.predict(X_val_fold)\n",
"            y_pred_class = (y_pred > 0.5).astype(int)\n",
"            \n",
"            score = accuracy_score(y_val_fold, y_pred_class)\n",
"            scores.append(score)\n",
"        \n",
"        return np.mean(scores), np.std(scores)\n",
"    except Exception as e:\n",
"        print(f\" ⚠️ 参数组合评估失败: {e}\")\n",
"        return 0.0, 1.0\n",
"\n",
"# 开始参数搜索\n",
"print(f\"\\n🚀 开始参数搜索...\")\n",
"best_score = 0\n",
"best_params = None\n",
"best_std = 1.0\n",
"results = []\n",
"\n",
"for i, combination in enumerate(selected_combinations):\n",
"    params_dict = dict(zip(param_names, combination))\n",
"    \n",
"    # 添加固定参数\n",
"    params_dict.update({\n",
"        'objective': 'binary',\n",
"        'metric': 'binary_logloss',\n",
"        'boosting_type': 'gbdt',\n",
"        'verbosity': -1,\n",
"        'seed': 42\n",
"    })\n",
"    \n",
"    print(f\"\\n🔧 测试组合 {i+1}/{len(selected_combinations)}:\")\n",
"    key_params = {k: v for k, v in params_dict.items() \n",
"                  if k not in ['objective', 'metric', 'boosting_type', 'verbosity', 'seed']}\n",
"    print(f\" 参数: {key_params}\")\n",
"    \n",
"    # 评估参数\n",
"    mean_score, std_score = evaluate_params(params_dict, sample_X, sample_y_binary, PARAM_SEARCH_CONFIG['cv_folds'])\n",
"    \n",
"    results.append({\n",
"        'params': params_dict.copy(),\n",
"        'mean_score': mean_score,\n",
"        'std_score': std_score\n",
"    })\n",
"    \n",
"    print(f\" 性能: {mean_score:.4f} ± {std_score:.4f}\")\n",
"    \n",
"    # 更新最佳参数\n",
"    if mean_score > best_score:\n",
"        best_score = mean_score\n",
"        best_params = params_dict.copy()\n",
"        best_std = std_score\n",
"        print(f\" ✨ 新的最佳参数!\")\n",
"\n",
"print(f\"\\n🏆 参数搜索完成!\")\n",
"print(f\"=\" * 80)\n",
"print(f\"🎯 最佳参数组合:\")\n",
"for key, value in best_params.items():\n",
"    if key not in ['objective', 'metric', 'boosting_type', 'verbosity', 'seed']:\n",
"        print(f\" {key}: {value}\")\n",
"\n",
"print(f\"\\n📈 最佳性能: {best_score:.4f} ± {best_std:.4f}\")\n",
"\n",
"# 保存结果\n",
"BEST_PARAMS_SEARCH = best_params\n",
"PARAM_SEARCH_RESULTS = results\n",
"\n",
"print(f\"\\n💾 参数搜索结果已保存:\")\n",
"print(f\" 最佳参数变量: BEST_PARAMS_SEARCH\")\n",
"print(f\" 所有结果变量: PARAM_SEARCH_RESULTS\")\n",
"print(f\" 共测试了 {len(results)} 种参数组合\")\n",
"\n",
"# 显示所有结果\n",
"print(f\"\\n🔝 所有参数组合结果:\")\n",
"sorted_results = sorted(results, key=lambda x: x['mean_score'], reverse=True)\n",
"for i, result in enumerate(sorted_results):\n",
"    key_params = {k: v for k, v in result['params'].items() \n",
"                  if k not in ['objective', 'metric', 'boosting_type', 'verbosity', 'seed']}\n",
"    print(f\" {i+1}. 分数: {result['mean_score']:.4f} ± {result['std_score']:.4f}\")\n",
"    print(f\" 参数: {key_params}\")\n",
"\n",
"print(f\"\\n✅ 参数搜索完成!推荐使用最佳参数进行后续训练\")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"检查文件: t15.2023.08.13_val_concatenated.npz\n",
"文件中的键: ['neural_logits_concatenated', 'confidence_scores', 'pred_seq', 'block_num', 'trial_num', 'session', 'sentence_label', 'seq_class_ids', 'seq_len']\n"
]
},
{
"ename": "ValueError",
"evalue": "Object arrays cannot be loaded when allow_pickle=False",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[22], line 18\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[38;5;66;03m# 显示数据形状\u001b[39;00m\n\u001b[0;32m 17\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m data\u001b[38;5;241m.\u001b[39mkeys():\n\u001b[1;32m---> 18\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdata[key]\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mif\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28mhasattr\u001b[39m(\u001b[43mdata\u001b[49m\u001b[43m[\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m]\u001b[49m,\u001b[38;5;250m \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mshape\u001b[39m\u001b[38;5;124m'\u001b[39m)\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01melse\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28mtype\u001b[39m(data[key])\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 20\u001b[0m data\u001b[38;5;241m.\u001b[39mclose()\n",
"File \u001b[1;32md:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\numpy\\lib\\_npyio_impl.py:258\u001b[0m, in \u001b[0;36mNpzFile.__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 256\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m magic \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mformat\u001b[39m\u001b[38;5;241m.\u001b[39mMAGIC_PREFIX:\n\u001b[0;32m 257\u001b[0m \u001b[38;5;28mbytes\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mzip\u001b[38;5;241m.\u001b[39mopen(key)\n\u001b[1;32m--> 258\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mformat\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_array\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mbytes\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 259\u001b[0m \u001b[43m \u001b[49m\u001b[43mallow_pickle\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mallow_pickle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 260\u001b[0m \u001b[43m \u001b[49m\u001b[43mpickle_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpickle_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 261\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_header_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_header_size\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 262\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 263\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mzip\u001b[38;5;241m.\u001b[39mread(key)\n",
"File \u001b[1;32md:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\numpy\\lib\\format.py:822\u001b[0m, in \u001b[0;36mread_array\u001b[1;34m(fp, allow_pickle, pickle_kwargs, max_header_size)\u001b[0m\n\u001b[0;32m 819\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m dtype\u001b[38;5;241m.\u001b[39mhasobject:\n\u001b[0;32m 820\u001b[0m \u001b[38;5;66;03m# The array contained Python objects. We need to unpickle the data.\u001b[39;00m\n\u001b[0;32m 821\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m allow_pickle:\n\u001b[1;32m--> 822\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mObject arrays cannot be loaded when \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 823\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mallow_pickle=False\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 824\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m pickle_kwargs \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 825\u001b[0m pickle_kwargs \u001b[38;5;241m=\u001b[39m {}\n",
"\u001b[1;31mValueError\u001b[0m: Object arrays cannot be loaded when allow_pickle=False"
]
}
],
"source": [
"# 检查数据文件结构\n",
"import numpy as np\n",
"import os\n",
"\n",
"data_dir = r'f:\\BRAIN-TO-TEXT\\nejm-brain-to-text\\data\\concatenated_data'\n",
"val_files = [f for f in os.listdir(data_dir) if f.endswith('_val_concatenated.npz')]\n",
"\n",
"# 检查第一个文件的键\n",
"test_file = val_files[0]\n",
"file_path = os.path.join(data_dir, test_file)\n",
"\n",
"print(f\"检查文件: {test_file}\")\n",
"data = np.load(file_path)\n",
"print(f\"文件中的键: {list(data.keys())}\")\n",
"\n",
"# 显示数据形状\n",
"for key in data.keys():\n",
"    print(f\" {key}: {data[key].shape if hasattr(data[key], 'shape') else type(data[key])}\")\n",
"\n",
"data.close()"
]
},
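{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `ValueError` above is NumPy's pickle guard: `neural_logits_concatenated` stores variable-length per-trial arrays as an object array, and `np.load` refuses to unpickle object arrays unless `allow_pickle=True` is passed explicitly (a security precaution). The fixed version below passes that flag."
]
},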
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"检查文件: t15.2023.08.13_val_concatenated.npz\n",
"文件中的键: ['neural_logits_concatenated', 'confidence_scores', 'pred_seq', 'block_num', 'trial_num', 'session', 'sentence_label', 'seq_class_ids', 'seq_len']\n",
" neural_logits_concatenated: (35,)\n",
" seq_class_ids: (35, 500)\n",
" seq_class_ids 前10个值: [[37 34 40 ... 0 0 0]\n",
" [16 5 40 ... 0 0 0]\n",
" [23 1 31 ... 0 0 0]\n",
" ...\n",
" [10 17 29 ... 0 0 0]\n",
" [16 2 38 ... 0 0 0]\n",
" [36 33 9 ... 0 0 0]]\n"
]
}
],
"source": [
"# 检查数据文件结构(修复版)\n",
"import numpy as np\n",
"import os\n",
"\n",
"data_dir = r'f:\\BRAIN-TO-TEXT\\nejm-brain-to-text\\data\\concatenated_data'\n",
"val_files = [f for f in os.listdir(data_dir) if f.endswith('_val_concatenated.npz')]\n",
"\n",
"# 检查第一个文件的键\n",
"test_file = val_files[0]\n",
"file_path = os.path.join(data_dir, test_file)\n",
"\n",
"print(f\"检查文件: {test_file}\")\n",
"data = np.load(file_path, allow_pickle=True)\n",
"print(f\"文件中的键: {list(data.keys())}\")\n",
"\n",
"# 检查neural_logits_concatenated和seq_class_ids的形状\n",
"if 'neural_logits_concatenated' in data:\n",
"    neural_logits = data['neural_logits_concatenated']\n",
"    print(f\" neural_logits_concatenated: {neural_logits.shape}\")\n",
"\n",
"if 'seq_class_ids' in data:\n",
"    seq_class_ids = data['seq_class_ids']\n",
"    print(f\" seq_class_ids: {seq_class_ids.shape}\")\n",
"    print(f\" seq_class_ids 前10个值: {seq_class_ids[:10]}\")\n",
"\n",
"data.close()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"🔍 LightGBM parameter search (final version)...\n",
"================================================================================\n",
"📊 Parameter search configuration:\n",
"   Cross-validation folds: 3\n",
"   Max parameter combinations: 6\n",
"   Max samples: 1000\n",
"   Parameter space size: 8 combinations\n",
"\n",
"📦 Preparing a small dataset...\n",
"   Using file: t15.2023.08.13_val_concatenated.npz\n",
"   Total samples: 962\n",
"   Feature dimension: 7209\n",
"   Binary label distribution: [732 230]\n",
"\n",
"🎯 Generating parameter combinations...\n",
"   Combinations actually tested: 6\n",
"\n",
"🚀 Starting parameter search...\n",
"\n",
"🔧 Testing combination 1/6:\n",
"   Parameters: {'learning_rate': 0.05, 'num_leaves': 31, 'max_depth': 6}\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's binary_logloss: 0.553502\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's binary_logloss: 0.553875\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's binary_logloss: 0.549469\n",
"   Performance: 0.7609 ± 0.0011\n",
"   ✨ New best parameters!\n",
"\n",
"🔧 Testing combination 2/6:\n",
"   Parameters: {'learning_rate': 0.05, 'num_leaves': 31, 'max_depth': 8}\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's binary_logloss: 0.554291\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's binary_logloss: 0.55475\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's binary_logloss: 0.55238\n",
"   Performance: 0.7609 ± 0.0011\n",
"\n",
"🔧 Testing combination 3/6:\n",
"   Parameters: {'learning_rate': 0.05, 'num_leaves': 63, 'max_depth': 6}\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's binary_logloss: 0.553502\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's binary_logloss: 0.553875\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's binary_logloss: 0.549469\n",
"   Performance: 0.7609 ± 0.0011\n",
"\n",
"🔧 Testing combination 4/6:\n",
"   Parameters: {'learning_rate': 0.05, 'num_leaves': 63, 'max_depth': 8}\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's binary_logloss: 0.554291\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's binary_logloss: 0.55475\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's binary_logloss: 0.55238\n",
"   Performance: 0.7609 ± 0.0011\n",
"\n",
"🔧 Testing combination 5/6:\n",
"   Parameters: {'learning_rate': 0.1, 'num_leaves': 31, 'max_depth': 6}\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's binary_logloss: 0.556997\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's binary_logloss: 0.557785\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's binary_logloss: 0.551978\n",
"   Performance: 0.7609 ± 0.0011\n",
"\n",
"🔧 Testing combination 6/6:\n",
"   Parameters: {'learning_rate': 0.1, 'num_leaves': 31, 'max_depth': 8}\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's binary_logloss: 0.55862\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's binary_logloss: 0.559016\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's binary_logloss: 0.557107\n",
"   Performance: 0.7609 ± 0.0011\n",
"\n",
"🏆 Parameter search complete!\n",
"================================================================================\n",
"🎯 Best parameter combination:\n",
"   learning_rate: 0.05\n",
"   num_leaves: 31\n",
"   max_depth: 6\n",
"\n",
"📈 Best performance: 0.7609 ± 0.0011\n",
"\n",
"💾 Parameter search results saved:\n",
"   Best-params variable: BEST_PARAMS_FINAL\n",
"   All-results variable: PARAM_SEARCH_RESULTS_FINAL\n",
"   6 parameter combinations tested in total\n",
"\n",
"🔝 All parameter combination results:\n",
"   1. Score: 0.7609 ± 0.0011\n",
"      Parameters: {'learning_rate': 0.05, 'num_leaves': 31, 'max_depth': 6}\n",
"   2. Score: 0.7609 ± 0.0011\n",
"      Parameters: {'learning_rate': 0.05, 'num_leaves': 31, 'max_depth': 8}\n",
"   3. Score: 0.7609 ± 0.0011\n",
"      Parameters: {'learning_rate': 0.05, 'num_leaves': 63, 'max_depth': 6}\n",
"   4. Score: 0.7609 ± 0.0011\n",
"      Parameters: {'learning_rate': 0.05, 'num_leaves': 63, 'max_depth': 8}\n",
"   5. Score: 0.7609 ± 0.0011\n",
"      Parameters: {'learning_rate': 0.1, 'num_leaves': 31, 'max_depth': 6}\n",
"   6. Score: 0.7609 ± 0.0011\n",
"      Parameters: {'learning_rate': 0.1, 'num_leaves': 31, 'max_depth': 8}\n",
"\n",
"✅ Parameter search complete! These tuned parameters can be used in subsequent training\n"
]
}
],
"source": [
"print(\"🔍 LightGBM parameter search (final version)...\")\n",
"print(\"=\" * 80)\n",
"\n",
"import itertools\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.metrics import accuracy_score\n",
"import lightgbm as lgb\n",
"import numpy as np\n",
"import os\n",
"\n",
"# Parameter search configuration\n",
"PARAM_SEARCH_CONFIG = {\n",
"    'cv_folds': 3,          # 3-fold cross-validation\n",
"    'max_combinations': 6,  # trimmed down to 6 parameter combinations\n",
"    'max_samples': 1000     # use at most 1000 samples\n",
"}\n",
"\n",
"# Define the parameter search space (trimmed version)\n",
"param_grid = {\n",
"    'learning_rate': [0.05, 0.1],\n",
"    'num_leaves': [31, 63],\n",
"    'max_depth': [6, 8],\n",
"}\n",
"\n",
"print(f\"📊 Parameter search configuration:\")\n",
"print(f\"   Cross-validation folds: {PARAM_SEARCH_CONFIG['cv_folds']}\")\n",
"print(f\"   Max parameter combinations: {PARAM_SEARCH_CONFIG['max_combinations']}\")\n",
"print(f\"   Max samples: {PARAM_SEARCH_CONFIG['max_samples']}\")\n",
"print(f\"   Parameter space size: {np.prod([len(v) for v in param_grid.values()])} combinations\")\n",
"\n",
"# Fetch a small amount of data for the parameter search\n",
"print(f\"\\n📦 Preparing a small dataset...\")\n",
"\n",
"# Use an absolute path\n",
"data_dir = r'f:\\BRAIN-TO-TEXT\\nejm-brain-to-text\\data\\concatenated_data'\n",
"val_files = [f for f in os.listdir(data_dir) if f.endswith('_val_concatenated.npz')]\n",
"\n",
"# Use only the first file for a quick parameter search\n",
"test_file = val_files[0]\n",
"file_path = os.path.join(data_dir, test_file)\n",
"\n",
"print(f\"   Using file: {test_file}\")\n",
"\n",
"# Load the data\n",
"data = np.load(file_path, allow_pickle=True)\n",
"neural_logits = data['neural_logits_concatenated']\n",
"seq_class_ids = data['seq_class_ids']\n",
"\n",
"# Build per-timestep features and labels\n",
"all_features = []\n",
"all_labels = []\n",
"\n",
"for i, logits in enumerate(neural_logits):\n",
"    if logits is not None and hasattr(logits, 'shape') and len(logits.shape) > 0:\n",
"        # Get the corresponding label sequence\n",
"        labels = seq_class_ids[i]\n",
"\n",
"        # Take only a leading slice of each trial\n",
"        max_len = min(len(logits), len(labels), 50)  # at most 50 time steps\n",
"\n",
"        for j in range(max_len):\n",
"            if labels[j] != 0:  # skip padding labels\n",
"                all_features.append(logits[j].flatten())\n",
"                all_labels.append(labels[j])\n",
"\n",
"data.close()\n",
"\n",
"if len(all_features) == 0:\n",
"    raise ValueError(\"No valid feature data found\")\n",
"\n",
"# Convert to numpy arrays\n",
"sample_X = np.array(all_features)\n",
"sample_y = np.array(all_labels)\n",
"\n",
"# Cap the number of samples\n",
"if len(sample_X) > PARAM_SEARCH_CONFIG['max_samples']:\n",
"    indices = np.random.choice(len(sample_X), PARAM_SEARCH_CONFIG['max_samples'], replace=False)\n",
"    sample_X = sample_X[indices]\n",
"    sample_y = sample_y[indices]\n",
"\n",
"# Apply the PCA transform (if it has been fitted)\n",
"if GLOBAL_PCA['is_fitted']:\n",
"    sample_X = apply_pca_transform(sample_X)\n",
"\n",
"print(f\"   Total samples: {len(sample_X)}\")\n",
"print(f\"   Feature dimension: {sample_X.shape[1]}\")\n",
"\n",
"# Simplify to a binary problem: label 40 vs. the rest\n",
"sample_y_binary = (sample_y == 40).astype(int)\n",
"print(f\"   Binary label distribution: {np.bincount(sample_y_binary)}\")\n",
"\n",
"# Generate parameter combinations\n",
"print(f\"\\n🎯 Generating parameter combinations...\")\n",
"param_names = list(param_grid.keys())\n",
"param_values = list(param_grid.values())\n",
"\n",
"# Enumerate all parameter combinations\n",
"all_combinations = list(itertools.product(*param_values))\n",
"selected_combinations = all_combinations[:PARAM_SEARCH_CONFIG['max_combinations']]\n",
"\n",
"print(f\"   Combinations actually tested: {len(selected_combinations)}\")\n",
"\n",
"# Helper: evaluate one parameter combination\n",
"def evaluate_params(params_dict, X, y, cv_folds=3):\n",
"    \"\"\"Evaluate one parameter combination with stratified k-fold CV.\"\"\"\n",
"    try:\n",
"        skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)\n",
"        scores = []\n",
"\n",
"        for train_idx, val_idx in skf.split(X, y):\n",
"            X_train_fold, X_val_fold = X[train_idx], X[val_idx]\n",
"            y_train_fold, y_val_fold = y[train_idx], y[val_idx]\n",
"\n",
"            # Build LightGBM datasets\n",
"            train_data = lgb.Dataset(X_train_fold, label=y_train_fold)\n",
"            val_data = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data)\n",
"\n",
"            # Train the model\n",
"            model = lgb.train(\n",
"                params_dict,\n",
"                train_data,\n",
"                valid_sets=[val_data],\n",
"                num_boost_round=20,  # few rounds for a quick evaluation\n",
"                callbacks=[lgb.early_stopping(5), lgb.log_evaluation(0)]\n",
"            )\n",
"\n",
"            # Predict and evaluate\n",
"            y_pred = model.predict(X_val_fold)\n",
"            y_pred_class = (y_pred > 0.5).astype(int)\n",
"\n",
"            score = accuracy_score(y_val_fold, y_pred_class)\n",
"            scores.append(score)\n",
"\n",
"        return np.mean(scores), np.std(scores)\n",
"    except Exception as e:\n",
"        print(f\"   ⚠️ Parameter combination evaluation failed: {e}\")\n",
"        return 0.0, 1.0\n",
"\n",
"# Run the parameter search\n",
"print(f\"\\n🚀 Starting parameter search...\")\n",
"best_score = 0\n",
"best_params = None\n",
"best_std = 1.0\n",
"results = []\n",
"\n",
"for i, combination in enumerate(selected_combinations):\n",
"    params_dict = dict(zip(param_names, combination))\n",
"\n",
"    # Add fixed parameters\n",
"    params_dict.update({\n",
"        'objective': 'binary',\n",
"        'metric': 'binary_logloss',\n",
"        'boosting_type': 'gbdt',\n",
"        'verbosity': -1,\n",
"        'seed': 42,\n",
"        'feature_fraction': 0.8,\n",
"        'bagging_fraction': 0.8,\n",
"        'min_data_in_leaf': 20\n",
"    })\n",
"\n",
"    print(f\"\\n🔧 Testing combination {i+1}/{len(selected_combinations)}:\")\n",
"    key_params = {k: v for k, v in params_dict.items()\n",
"                  if k in param_names}\n",
"    print(f\"   Parameters: {key_params}\")\n",
"\n",
"    # Evaluate this combination\n",
"    mean_score, std_score = evaluate_params(params_dict, sample_X, sample_y_binary, PARAM_SEARCH_CONFIG['cv_folds'])\n",
"\n",
"    results.append({\n",
"        'params': params_dict.copy(),\n",
"        'mean_score': mean_score,\n",
"        'std_score': std_score\n",
"    })\n",
"\n",
"    print(f\"   Performance: {mean_score:.4f} ± {std_score:.4f}\")\n",
"\n",
"    # Track the best parameters\n",
"    if mean_score > best_score:\n",
"        best_score = mean_score\n",
"        best_params = params_dict.copy()\n",
"        best_std = std_score\n",
"        print(f\"   ✨ New best parameters!\")\n",
"\n",
"print(f\"\\n🏆 Parameter search complete!\")\n",
"print(f\"=\" * 80)\n",
"print(f\"🎯 Best parameter combination:\")\n",
"for key, value in best_params.items():\n",
"    if key in param_names:\n",
"        print(f\"   {key}: {value}\")\n",
"\n",
"print(f\"\\n📈 Best performance: {best_score:.4f} ± {best_std:.4f}\")\n",
"\n",
"# Save the results\n",
"BEST_PARAMS_FINAL = best_params\n",
"PARAM_SEARCH_RESULTS_FINAL = results\n",
"\n",
"print(f\"\\n💾 Parameter search results saved:\")\n",
"print(f\"   Best-params variable: BEST_PARAMS_FINAL\")\n",
"print(f\"   All-results variable: PARAM_SEARCH_RESULTS_FINAL\")\n",
"print(f\"   {len(results)} parameter combinations tested in total\")\n",
"\n",
"# Show all results, best first\n",
"print(f\"\\n🔝 All parameter combination results:\")\n",
"sorted_results = sorted(results, key=lambda x: x['mean_score'], reverse=True)\n",
"for i, result in enumerate(sorted_results):\n",
"    key_params = {k: v for k, v in result['params'].items()\n",
"                  if k in param_names}\n",
"    print(f\"   {i+1}. Score: {result['mean_score']:.4f} ± {result['std_score']:.4f}\")\n",
"    print(f\"      Parameters: {key_params}\")\n",
"\n",
"print(f\"\\n✅ Parameter search complete! These tuned parameters can be used in subsequent training\")"
]
},
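{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of how the saved `BEST_PARAMS_FINAL` could feed a longer final training run; `X_train` and `y_train` are placeholders for the full training split, not variables defined above:\n",
"\n",
"```python\n",
"import lightgbm as lgb\n",
"\n",
"final_train = lgb.Dataset(X_train, label=y_train)  # placeholder arrays\n",
"final_model = lgb.train(\n",
"    BEST_PARAMS_FINAL,    # best combination found by the search above\n",
"    final_train,\n",
"    num_boost_round=200,  # more rounds than the quick 20-round evaluation\n",
")\n",
"```\n",
"\n",
"Note that all six combinations above tied at 0.7609, which matches the majority-class rate of the binary labels (732/962 ≈ 0.761): the quick search is likely just predicting the majority class, which is why the next cell widens the search space and scores by F1 rather than accuracy."
]
},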
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"🔍 Improved parameter search - larger search space and complexity...\n",
"================================================================================\n",
"📊 Improved parameter search configuration:\n",
"   Cross-validation folds: 5\n",
"   Max parameter combinations: 20\n",
"   Max samples: 2000\n",
"   Training rounds: 100\n",
"   Parameter space size: 72000 combinations\n",
"\n",
"📦 Preparing a larger dataset...\n",
"   Files used: 3 / 41\n",
"   Processing file: t15.2023.08.13_val_concatenated.npz\n",
"     Cumulative samples: 962\n",
"   Processing file: t15.2023.08.18_val_concatenated.npz\n",
"     Cumulative samples: 2155\n",
"   Processing file: t15.2023.08.20_val_concatenated.npz\n",
"     Cumulative samples: 3414\n",
"   Final sample count: 2000\n",
"   Feature dimension: 7209\n",
"   Valid labels: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(6), np.int64(7), np.int64(9), np.int64(10), np.int64(11), np.int64(13)]\n",
"   Samples after filtering: 492\n",
"   Class distribution: [ 34 49 147 21 33 25 60 50 48 25]\n",
"\n",
"🎯 Generating random parameter combinations...\n",
"   Generated 20 random parameter combinations\n",
"\n",
"🚀 Starting the improved parameter search...\n",
"\n",
"🔧 Testing combination 1/20:\n",
"   Parameters: {'learning_rate': np.float64(0.15), 'num_leaves': np.int64(255), 'max_depth': np.int64(9), 'feature_fraction': np.float64(1.0), 'bagging_fraction': np.float64(0.6), 'min_data_in_leaf': np.int64(5), 'lambda_l1': np.float64(1.0), 'lambda_l2': np.float64(0.1)}\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.11651\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.14504\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.14196\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.2164\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.16782\n",
"   Accuracy: 0.2825 ± 0.0299\n",
"   F1 score: 0.0830 ± 0.0224\n",
"   ✨ New best parameters! F1 score: 0.0830\n",
"\n",
"🔧 Testing combination 2/20:\n",
"   Parameters: {'learning_rate': np.float64(0.1), 'num_leaves': np.int64(63), 'max_depth': np.int64(9), 'feature_fraction': np.float64(1.0), 'bagging_fraction': np.float64(0.6), 'min_data_in_leaf': np.int64(50), 'lambda_l1': np.float64(1.0), 'lambda_l2': np.float64(0.1)}\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.08376\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.09276\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[3]\tvalid_0's multi_logloss: 2.10016\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.12363\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[5]\tvalid_0's multi_logloss: 2.07434\n",
"   Accuracy: 0.2988 ± 0.0035\n",
"   F1 score: 0.0460 ± 0.0004\n",
"\n",
"🔧 Testing combination 3/20:\n",
"   Parameters: {'learning_rate': np.float64(0.2), 'num_leaves': np.int64(31), 'max_depth': np.int64(12), 'feature_fraction': np.float64(0.8), 'bagging_fraction': np.float64(0.8), 'min_data_in_leaf': np.int64(10), 'lambda_l1': np.float64(0.0), 'lambda_l2': np.float64(0.0)}\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.12531\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.2417\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[5]\tvalid_0's multi_logloss: 2.13152\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.20646\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.07364\n",
"   Accuracy: 0.2623 ± 0.0388\n",
"   F1 score: 0.1037 ± 0.0407\n",
"   ✨ New best parameters! F1 score: 0.1037\n",
"\n",
"🔧 Testing combination 4/20:\n",
"   Parameters: {'learning_rate': np.float64(0.15), 'num_leaves': np.int64(31), 'max_depth': np.int64(-1), 'feature_fraction': np.float64(1.0), 'bagging_fraction': np.float64(0.6), 'min_data_in_leaf': np.int64(5), 'lambda_l1': np.float64(1.0), 'lambda_l2': np.float64(1.0)}\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.099\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.09604\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.09527\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.1556\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[3]\tvalid_0's multi_logloss: 2.15148\n",
"   Accuracy: 0.2947 ± 0.0131\n",
"   F1 score: 0.0580 ± 0.0173\n",
"\n",
"🔧 Testing combination 5/20:\n",
"   Parameters: {'learning_rate': np.float64(0.05), 'num_leaves': np.int64(127), 'max_depth': np.int64(12), 'feature_fraction': np.float64(1.0), 'bagging_fraction': np.float64(0.9), 'min_data_in_leaf': np.int64(10), 'lambda_l1': np.float64(0.1), 'lambda_l2': np.float64(1.0)}\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[5]\tvalid_0's multi_logloss: 2.09017\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.09111\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[4]\tvalid_0's multi_logloss: 2.09994\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.12214\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.10879\n",
"   Accuracy: 0.2967 ± 0.0034\n",
"   F1 score: 0.0458 ± 0.0004\n",
"\n",
"🔧 Testing combination 6/20:\n",
"   Parameters: {'learning_rate': np.float64(0.1), 'num_leaves': np.int64(127), 'max_depth': np.int64(12), 'feature_fraction': np.float64(1.0), 'bagging_fraction': np.float64(0.6), 'min_data_in_leaf': np.int64(20), 'lambda_l1': np.float64(0.0), 'lambda_l2': np.float64(1.0)}\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.081\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.09223\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[3]\tvalid_0's multi_logloss: 2.08461\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.10818\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.11166\n",
"   Accuracy: 0.2988 ± 0.0035\n",
"   F1 score: 0.0461 ± 0.0004\n",
"\n",
"🔧 Testing combination 7/20:\n",
"   Parameters: {'learning_rate': np.float64(0.2), 'num_leaves': np.int64(15), 'max_depth': np.int64(6), 'feature_fraction': np.float64(1.0), 'bagging_fraction': np.float64(0.6), 'min_data_in_leaf': np.int64(50), 'lambda_l1': np.float64(0.1), 'lambda_l2': np.float64(0.1)}\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.08483\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.08921\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.08247\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.13921\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.08953\n",
"   Accuracy: 0.2967 ± 0.0123\n",
"   F1 score: 0.0530 ± 0.0152\n",
"\n",
"🔧 Testing combination 8/20:\n",
"   Parameters: {'learning_rate': np.float64(0.05), 'num_leaves': np.int64(15), 'max_depth': np.int64(6), 'feature_fraction': np.float64(0.6), 'bagging_fraction': np.float64(0.8), 'min_data_in_leaf': np.int64(50), 'lambda_l1': np.float64(1.0), 'lambda_l2': np.float64(1.0)}\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[4]\tvalid_0's multi_logloss: 2.08538\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[4]\tvalid_0's multi_logloss: 2.08843\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[14]\tvalid_0's multi_logloss: 2.09424\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[5]\tvalid_0's multi_logloss: 2.11347\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[16]\tvalid_0's multi_logloss: 2.09417\n",
"   Accuracy: 0.2988 ± 0.0035\n",
"   F1 score: 0.0460 ± 0.0004\n",
"\n",
"🔧 Testing combination 9/20:\n",
"   Parameters: {'learning_rate': np.float64(0.15), 'num_leaves': np.int64(255), 'max_depth': np.int64(9), 'feature_fraction': np.float64(0.8), 'bagging_fraction': np.float64(0.6), 'min_data_in_leaf': np.int64(50), 'lambda_l1': np.float64(0.1), 'lambda_l2': np.float64(0.1)}\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.0937\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[3]\tvalid_0's multi_logloss: 2.09824\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.08626\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[3]\tvalid_0's multi_logloss: 2.12937\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.07891\n",
"   Accuracy: 0.3028 ± 0.0065\n",
"   F1 score: 0.0588 ± 0.0162\n",
"\n",
"🔧 Testing combination 10/20:\n",
"   Parameters: {'learning_rate': np.float64(0.05), 'num_leaves': np.int64(127), 'max_depth': np.int64(-1), 'feature_fraction': np.float64(0.9), 'bagging_fraction': np.float64(0.8), 'min_data_in_leaf': np.int64(10), 'lambda_l1': np.float64(0.1), 'lambda_l2': np.float64(0.1)}\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[7]\tvalid_0's multi_logloss: 2.08931\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[6]\tvalid_0's multi_logloss: 2.0929\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[11]\tvalid_0's multi_logloss: 2.08181\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.10389\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.09144\n",
"   Accuracy: 0.2968 ± 0.0049\n",
"   F1 score: 0.0510 ± 0.0106\n",
"\n",
"🔧 Testing combination 11/20:\n",
"   Parameters: {'learning_rate': np.float64(0.15), 'num_leaves': np.int64(127), 'max_depth': np.int64(3), 'feature_fraction': np.float64(0.8), 'bagging_fraction': np.float64(1.0), 'min_data_in_leaf': np.int64(5), 'lambda_l1': np.float64(0.0), 'lambda_l2': np.float64(0.1)}\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.12579\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.16556\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.12374\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.17234\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.15226\n",
"   Accuracy: 0.2825 ± 0.0092\n",
"   F1 score: 0.0699 ± 0.0227\n",
"\n",
"🔧 Testing combination 12/20:\n",
"   Parameters: {'learning_rate': np.float64(0.2), 'num_leaves': np.int64(31), 'max_depth': np.int64(3), 'feature_fraction': np.float64(1.0), 'bagging_fraction': np.float64(1.0), 'min_data_in_leaf': np.int64(50), 'lambda_l1': np.float64(0.0), 'lambda_l2': np.float64(0.0)}\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[3]\tvalid_0's multi_logloss: 2.07078\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.09673\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.08357\n"
]
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[2]\tvalid_0's multi_logloss: 2.14388\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[2]\tvalid_0's multi_logloss: 2.08769\n",
|
||
" 准确率: 0.2886 ± 0.0141\n",
|
||
" F1分数: 0.0496 ± 0.0075\n",
|
||
"\n",
|
||
"🔧 测试组合 13/20:\n",
|
||
" 参数: {'learning_rate': np.float64(0.2), 'num_leaves': np.int64(255), 'max_depth': np.int64(3), 'feature_fraction': np.float64(0.6), 'bagging_fraction': np.float64(0.9), 'min_data_in_leaf': np.int64(5), 'lambda_l1': np.float64(0.0), 'lambda_l2': np.float64(1.0)}\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[2]\tvalid_0's multi_logloss: 2.13601\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[2]\tvalid_0's multi_logloss: 2.08359\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[1]\tvalid_0's multi_logloss: 2.06893\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[1]\tvalid_0's multi_logloss: 2.15841\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[1]\tvalid_0's multi_logloss: 2.1325\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
" 准确率: 0.2967 ± 0.0096\n",
|
||
" F1分数: 0.0585 ± 0.0161\n",
|
||
"\n",
|
||
"🔧 测试组合 14/20:\n",
|
||
" 参数: {'learning_rate': np.float64(0.1), 'num_leaves': np.int64(63), 'max_depth': np.int64(3), 'feature_fraction': np.float64(1.0), 'bagging_fraction': np.float64(0.9), 'min_data_in_leaf': np.int64(20), 'lambda_l1': np.float64(0.0), 'lambda_l2': np.float64(1.0)}\n",
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[3]\tvalid_0's multi_logloss: 2.08199\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[1]\tvalid_0's multi_logloss: 2.09063\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[3]\tvalid_0's multi_logloss: 2.08862\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[2]\tvalid_0's multi_logloss: 2.103\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[1]\tvalid_0's multi_logloss: 2.11438\n",
|
||
" 准确率: 0.2988 ± 0.0035\n",
|
||
" F1分数: 0.0460 ± 0.0004\n",
|
||
"\n",
|
||
"🔧 测试组合 15/20:\n",
|
||
" 参数: {'learning_rate': np.float64(0.2), 'num_leaves': np.int64(31), 'max_depth': np.int64(6), 'feature_fraction': np.float64(0.6), 'bagging_fraction': np.float64(1.0), 'min_data_in_leaf': np.int64(20), 'lambda_l1': np.float64(0.0), 'lambda_l2': np.float64(0.1)}\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[1]\tvalid_0's multi_logloss: 2.10926\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[1]\tvalid_0's multi_logloss: 2.08871\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[1]\tvalid_0's multi_logloss: 2.08299\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[1]\tvalid_0's multi_logloss: 2.14121\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[1]\tvalid_0's multi_logloss: 2.10462\n",
|
||
" 准确率: 0.2906 ± 0.0242\n",
|
||
" F1分数: 0.0563 ± 0.0152\n",
|
||
"\n",
|
||
"🔧 测试组合 16/20:\n",
|
||
" 参数: {'learning_rate': np.float64(0.01), 'num_leaves': np.int64(255), 'max_depth': np.int64(9), 'feature_fraction': np.float64(1.0), 'bagging_fraction': np.float64(1.0), 'min_data_in_leaf': np.int64(10), 'lambda_l1': np.float64(1.0), 'lambda_l2': np.float64(1.0)}\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[13]\tvalid_0's multi_logloss: 2.09256\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[15]\tvalid_0's multi_logloss: 2.09021\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[18]\tvalid_0's multi_logloss: 2.09268\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[1]\tvalid_0's multi_logloss: 2.11506\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[5]\tvalid_0's multi_logloss: 2.11271\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
" 准确率: 0.2988 ± 0.0035\n",
|
||
" F1分数: 0.0460 ± 0.0004\n",
|
||
"\n",
|
||
"🔧 测试组合 17/20:\n",
|
||
" 参数: {'learning_rate': np.float64(0.01), 'num_leaves': np.int64(63), 'max_depth': np.int64(-1), 'feature_fraction': np.float64(0.9), 'bagging_fraction': np.float64(0.8), 'min_data_in_leaf': np.int64(20), 'lambda_l1': np.float64(0.0), 'lambda_l2': np.float64(0.0)}\n",
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[26]\tvalid_0's multi_logloss: 2.08426\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[23]\tvalid_0's multi_logloss: 2.08892\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[23]\tvalid_0's multi_logloss: 2.08765\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[5]\tvalid_0's multi_logloss: 2.11129\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[14]\tvalid_0's multi_logloss: 2.09748\n",
|
||
" 准确率: 0.2988 ± 0.0035\n",
|
||
" F1分数: 0.0460 ± 0.0004\n",
|
||
"\n",
|
||
"🔧 测试组合 18/20:\n",
|
||
" 参数: {'learning_rate': np.float64(0.05), 'num_leaves': np.int64(63), 'max_depth': np.int64(3), 'feature_fraction': np.float64(0.9), 'bagging_fraction': np.float64(0.9), 'min_data_in_leaf': np.int64(10), 'lambda_l1': np.float64(0.1), 'lambda_l2': np.float64(0.0)}\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[6]\tvalid_0's multi_logloss: 2.08507\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[5]\tvalid_0's multi_logloss: 2.10854\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[3]\tvalid_0's multi_logloss: 2.07903\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[1]\tvalid_0's multi_logloss: 2.11723\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[1]\tvalid_0's multi_logloss: 2.08765\n",
|
||
" 准确率: 0.3028 ± 0.0106\n",
|
||
" F1分数: 0.0549 ± 0.0180\n",
|
||
"\n",
|
||
"🔧 测试组合 19/20:\n",
|
||
" 参数: {'learning_rate': np.float64(0.1), 'num_leaves': np.int64(15), 'max_depth': np.int64(12), 'feature_fraction': np.float64(0.6), 'bagging_fraction': np.float64(1.0), 'min_data_in_leaf': np.int64(50), 'lambda_l1': np.float64(0.1), 'lambda_l2': np.float64(0.0)}\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[5]\tvalid_0's multi_logloss: 2.08528\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[3]\tvalid_0's multi_logloss: 2.09118\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[1]\tvalid_0's multi_logloss: 2.10493\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[3]\tvalid_0's multi_logloss: 2.11337\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[4]\tvalid_0's multi_logloss: 2.0784\n",
|
||
" 准确率: 0.2967 ± 0.0034\n",
|
||
" F1分数: 0.0458 ± 0.0003\n",
|
||
"\n",
|
||
"🔧 测试组合 20/20:\n",
|
||
" 参数: {'learning_rate': np.float64(0.2), 'num_leaves': np.int64(63), 'max_depth': np.int64(-1), 'feature_fraction': np.float64(1.0), 'bagging_fraction': np.float64(0.6), 'min_data_in_leaf': np.int64(50), 'lambda_l1': np.float64(1.0), 'lambda_l2': np.float64(1.0)}\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[1]\tvalid_0's multi_logloss: 2.06234\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[1]\tvalid_0's multi_logloss: 2.10061\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[4]\tvalid_0's multi_logloss: 2.10826\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[1]\tvalid_0's multi_logloss: 2.13882\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training until validation scores don't improve for 10 rounds\n",
|
||
"Early stopping, best iteration is:\n",
|
||
"[3]\tvalid_0's multi_logloss: 2.07953\n",
|
||
" 准确率: 0.3008 ± 0.0041\n",
|
||
" F1分数: 0.0580 ± 0.0169\n",
|
||
"\n",
|
||
"🏆 改进的参数搜索完成!\n",
|
||
"================================================================================\n",
|
||
"🎯 最佳参数组合:\n",
|
||
" learning_rate: 0.2\n",
|
||
" num_leaves: 31\n",
|
||
" max_depth: 12\n",
|
||
" feature_fraction: 0.8\n",
|
||
" bagging_fraction: 0.8\n",
|
||
" min_data_in_leaf: 10\n",
|
||
" lambda_l1: 0.0\n",
|
||
" lambda_l2: 0.0\n",
|
||
"\n",
|
||
"📈 最佳性能:\n",
|
||
" accuracy: 0.2623 ± 0.0388\n",
|
||
" f1_macro: 0.1037 ± 0.0407\n",
|
||
" precision_macro: 0.1395 ± 0.0853\n",
|
||
" recall_macro: 0.1224 ± 0.0320\n",
|
||
"\n",
|
||
"💾 改进的参数搜索结果已保存:\n",
|
||
" 最佳参数变量: BEST_PARAMS_IMPROVED\n",
|
||
" 所有结果变量: PARAM_SEARCH_RESULTS_IMPROVED\n",
|
||
"\n",
|
||
"🔝 Top 5 参数组合 (按F1分数排序):\n",
|
||
" 1. F1: 0.1037, 准确率: 0.2623\n",
|
||
" 参数: {'learning_rate': np.float64(0.2), 'num_leaves': np.int64(31), 'max_depth': np.int64(12), 'feature_fraction': np.float64(0.8), 'bagging_fraction': np.float64(0.8), 'min_data_in_leaf': np.int64(10), 'lambda_l1': np.float64(0.0), 'lambda_l2': np.float64(0.0)}\n",
|
||
" 2. F1: 0.0830, 准确率: 0.2825\n",
|
||
" 参数: {'learning_rate': np.float64(0.15), 'num_leaves': np.int64(255), 'max_depth': np.int64(9), 'feature_fraction': np.float64(1.0), 'bagging_fraction': np.float64(0.6), 'min_data_in_leaf': np.int64(5), 'lambda_l1': np.float64(1.0), 'lambda_l2': np.float64(0.1)}\n",
|
||
" 3. F1: 0.0699, 准确率: 0.2825\n",
|
||
" 参数: {'learning_rate': np.float64(0.15), 'num_leaves': np.int64(127), 'max_depth': np.int64(3), 'feature_fraction': np.float64(0.8), 'bagging_fraction': np.float64(1.0), 'min_data_in_leaf': np.int64(5), 'lambda_l1': np.float64(0.0), 'lambda_l2': np.float64(0.1)}\n",
|
||
" 4. F1: 0.0588, 准确率: 0.3028\n",
|
||
" 参数: {'learning_rate': np.float64(0.15), 'num_leaves': np.int64(255), 'max_depth': np.int64(9), 'feature_fraction': np.float64(0.8), 'bagging_fraction': np.float64(0.6), 'min_data_in_leaf': np.int64(50), 'lambda_l1': np.float64(0.1), 'lambda_l2': np.float64(0.1)}\n",
|
||
" 5. F1: 0.0585, 准确率: 0.2967\n",
|
||
" 参数: {'learning_rate': np.float64(0.2), 'num_leaves': np.int64(255), 'max_depth': np.int64(3), 'feature_fraction': np.float64(0.6), 'bagging_fraction': np.float64(0.9), 'min_data_in_leaf': np.int64(5), 'lambda_l1': np.float64(0.0), 'lambda_l2': np.float64(1.0)}\n",
|
||
"\n",
|
||
"✅ 改进的参数搜索完成!现在应该能看到不同参数组合的性能差异\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"print(\"🔍 改进版参数搜索 - 扩大搜索空间和复杂度...\")\n",
|
||
"print(\"=\" * 80)\n",
|
||
"\n",
|
||
"import itertools\n",
|
||
"from sklearn.model_selection import StratifiedKFold\n",
|
||
"from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score\n",
|
||
"import lightgbm as lgb\n",
|
||
"import numpy as np\n",
|
||
"import os\n",
|
||
"\n",
|
||
"# 改进的参数搜索配置\n",
|
||
"IMPROVED_PARAM_CONFIG = {\n",
|
||
" 'cv_folds': 5, # 增加到5折交叉验证\n",
|
||
" 'max_combinations': 20, # 增加参数组合数\n",
|
||
" 'max_samples': 2000, # 增加样本数\n",
|
||
" 'num_boost_round': 100, # 增加训练轮数\n",
|
||
" 'early_stopping_rounds': 10\n",
|
||
"}\n",
|
||
"\n",
|
||
"# 扩大参数搜索空间\n",
|
||
"param_grid = {\n",
|
||
" 'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2], # 更大范围的学习率\n",
|
||
" 'num_leaves': [15, 31, 63, 127, 255], # 更大范围的叶子数\n",
|
||
" 'max_depth': [3, 6, 9, 12, -1], # 包括无限深度\n",
|
||
" 'feature_fraction': [0.6, 0.8, 0.9, 1.0], # 特征采样比例\n",
|
||
" 'bagging_fraction': [0.6, 0.8, 0.9, 1.0], # 数据采样比例\n",
|
||
" 'min_data_in_leaf': [5, 10, 20, 50], # 叶子节点最小样本数\n",
|
||
" 'lambda_l1': [0, 0.1, 1.0], # L1正则化\n",
|
||
" 'lambda_l2': [0, 0.1, 1.0], # L2正则化\n",
|
||
"}\n",
|
||
"\n",
|
||
"print(f\"📊 改进的参数搜索配置:\")\n",
|
||
"print(f\" 交叉验证折数: {IMPROVED_PARAM_CONFIG['cv_folds']}\")\n",
|
||
"print(f\" 最大参数组合: {IMPROVED_PARAM_CONFIG['max_combinations']}\")\n",
|
||
"print(f\" 最大样本数: {IMPROVED_PARAM_CONFIG['max_samples']}\")\n",
|
||
"print(f\" 训练轮数: {IMPROVED_PARAM_CONFIG['num_boost_round']}\")\n",
|
||
"print(f\" 参数空间大小: {np.prod([len(v) for v in param_grid.values()])} 种组合\")\n",
|
||
"\n",
|
||
"# 获取更多数据\n",
|
||
"print(f\"\\n📦 准备更大的数据集...\")\n",
|
||
"\n",
|
||
"data_dir = r'f:\\BRAIN-TO-TEXT\\nejm-brain-to-text\\data\\concatenated_data'\n",
|
||
"val_files = [f for f in os.listdir(data_dir) if f.endswith('_val_concatenated.npz')]\n",
|
||
"\n",
|
||
"# 使用前3个文件\n",
|
||
"sample_files = val_files[:3]\n",
|
||
"print(f\" 使用文件数: {len(sample_files)} / {len(val_files)}\")\n",
|
||
"\n",
|
||
"# 加载更多数据\n",
|
||
"all_features = []\n",
|
||
"all_labels = []\n",
|
||
"\n",
|
||
"for file in sample_files:\n",
|
||
" file_path = os.path.join(data_dir, file)\n",
|
||
" try:\n",
|
||
" data = np.load(file_path, allow_pickle=True)\n",
|
||
" neural_logits = data['neural_logits_concatenated']\n",
|
||
" seq_class_ids = data['seq_class_ids']\n",
|
||
" \n",
|
||
" print(f\" 处理文件: {file}\")\n",
|
||
" \n",
|
||
" for i, logits in enumerate(neural_logits):\n",
|
||
" if logits is not None and hasattr(logits, 'shape') and len(logits.shape) > 0:\n",
|
||
" labels = seq_class_ids[i]\n",
|
||
" \n",
|
||
" # 取更多的时间步\n",
|
||
" max_len = min(len(logits), len(labels), 100) # 增加到100个时间步\n",
|
||
" \n",
|
||
" for j in range(max_len):\n",
|
||
" if labels[j] != 0: # 跳过padding标签\n",
|
||
" all_features.append(logits[j].flatten())\n",
|
||
" all_labels.append(labels[j])\n",
|
||
" \n",
|
||
" data.close()\n",
|
||
" print(f\" 累计样本数: {len(all_features)}\")\n",
|
||
" \n",
|
||
" except Exception as e:\n",
|
||
" print(f\" ⚠️ 加载失败: {file} - {e}\")\n",
|
||
"\n",
|
||
"if len(all_features) == 0:\n",
|
||
" raise ValueError(\"没有找到有效的特征数据\")\n",
|
||
"\n",
|
||
"# 转换为numpy数组\n",
|
||
"sample_X = np.array(all_features)\n",
|
||
"sample_y = np.array(all_labels)\n",
|
||
"\n",
|
||
"# 随机采样\n",
|
||
"if len(sample_X) > IMPROVED_PARAM_CONFIG['max_samples']:\n",
|
||
" indices = np.random.choice(len(sample_X), IMPROVED_PARAM_CONFIG['max_samples'], replace=False)\n",
|
||
" sample_X = sample_X[indices]\n",
|
||
" sample_y = sample_y[indices]\n",
|
||
"\n",
|
||
"# 应用PCA变换\n",
|
||
"if GLOBAL_PCA['is_fitted']:\n",
|
||
" sample_X = apply_pca_transform(sample_X)\n",
|
||
"\n",
|
||
"print(f\" 最终样本数量: {len(sample_X)}\")\n",
|
||
"print(f\" 特征维度: {sample_X.shape[1]}\")\n",
|
||
"\n",
|
||
"# 创建多分类任务(保留前10个类别)\n",
|
||
"valid_labels = []\n",
|
||
"for label in np.unique(sample_y):\n",
|
||
" if np.sum(sample_y == label) >= 20: # 每个类别至少20个样本\n",
|
||
" valid_labels.append(label)\n",
|
||
"\n",
|
||
"valid_labels = sorted(valid_labels)[:10] # 取前10个有效标签\n",
|
||
"print(f\" 有效标签: {valid_labels}\")\n",
|
||
"\n",
|
||
"# 过滤数据,只保留有效标签\n",
|
||
"mask = np.isin(sample_y, valid_labels)\n",
|
||
"sample_X = sample_X[mask]\n",
|
||
"sample_y = sample_y[mask]\n",
|
||
"\n",
|
||
"# 重新映射标签到0-9\n",
|
||
"label_mapping = {old_label: new_label for new_label, old_label in enumerate(valid_labels)}\n",
|
||
"sample_y_mapped = np.array([label_mapping[label] for label in sample_y])\n",
|
||
"\n",
|
||
"print(f\" 过滤后样本数量: {len(sample_X)}\")\n",
|
||
"print(f\" 类别分布: {np.bincount(sample_y_mapped)}\")\n",
|
||
"\n",
|
||
"# 随机参数搜索\n",
|
||
"print(f\"\\n🎯 生成随机参数组合...\")\n",
|
||
"np.random.seed(42)\n",
|
||
"\n",
|
||
"def random_params():\n",
|
||
" \"\"\"生成随机参数组合\"\"\"\n",
|
||
" return {\n",
|
||
" 'learning_rate': np.random.choice(param_grid['learning_rate']),\n",
|
||
" 'num_leaves': np.random.choice(param_grid['num_leaves']),\n",
|
||
" 'max_depth': np.random.choice(param_grid['max_depth']),\n",
|
||
" 'feature_fraction': np.random.choice(param_grid['feature_fraction']),\n",
|
||
" 'bagging_fraction': np.random.choice(param_grid['bagging_fraction']),\n",
|
||
" 'min_data_in_leaf': np.random.choice(param_grid['min_data_in_leaf']),\n",
|
||
" 'lambda_l1': np.random.choice(param_grid['lambda_l1']),\n",
|
||
" 'lambda_l2': np.random.choice(param_grid['lambda_l2'])\n",
|
||
" }\n",
|
||
"\n",
|
||
"selected_combinations = [random_params() for _ in range(IMPROVED_PARAM_CONFIG['max_combinations'])]\n",
|
||
"print(f\" 生成了 {len(selected_combinations)} 个随机参数组合\")\n",
|
||
"\n",
|
||
"# 改进的参数评估函数\n",
|
||
"def evaluate_params_improved(params_dict, X, y, cv_folds=5):\n",
|
||
" \"\"\"改进的参数评估函数,返回多个指标\"\"\"\n",
|
||
" try:\n",
|
||
" skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)\n",
|
||
" scores = {\n",
|
||
" 'accuracy': [],\n",
|
||
" 'f1_macro': [],\n",
|
||
" 'precision_macro': [],\n",
|
||
" 'recall_macro': []\n",
|
||
" }\n",
|
||
" \n",
|
||
" for train_idx, val_idx in skf.split(X, y):\n",
|
||
" X_train_fold, X_val_fold = X[train_idx], X[val_idx]\n",
|
||
" y_train_fold, y_val_fold = y[train_idx], y[val_idx]\n",
|
||
" \n",
|
||
" # 创建LightGBM数据集\n",
|
||
" train_data = lgb.Dataset(X_train_fold, label=y_train_fold)\n",
|
||
" val_data = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data)\n",
|
||
" \n",
|
||
" # 训练模型\n",
|
||
" model = lgb.train(\n",
|
||
" params_dict,\n",
|
||
" train_data,\n",
|
||
" valid_sets=[val_data],\n",
|
||
" num_boost_round=IMPROVED_PARAM_CONFIG['num_boost_round'],\n",
|
||
" callbacks=[\n",
|
||
" lgb.early_stopping(IMPROVED_PARAM_CONFIG['early_stopping_rounds']), \n",
|
||
" lgb.log_evaluation(0)\n",
|
||
" ]\n",
|
||
" )\n",
|
||
" \n",
|
||
" # 预测和评估\n",
|
||
" y_pred = model.predict(X_val_fold)\n",
|
||
" if len(np.unique(y)) > 2: # 多分类\n",
|
||
" y_pred_class = np.argmax(y_pred, axis=1)\n",
|
||
" else: # 二分类\n",
|
||
" y_pred_class = (y_pred > 0.5).astype(int)\n",
|
||
" \n",
|
||
" # 计算多个指标\n",
|
||
" scores['accuracy'].append(accuracy_score(y_val_fold, y_pred_class))\n",
|
||
" scores['f1_macro'].append(f1_score(y_val_fold, y_pred_class, average='macro'))\n",
|
||
" scores['precision_macro'].append(precision_score(y_val_fold, y_pred_class, average='macro'))\n",
|
||
" scores['recall_macro'].append(recall_score(y_val_fold, y_pred_class, average='macro'))\n",
|
||
" \n",
|
||
" # 计算平均值和标准差\n",
|
||
" result = {}\n",
|
||
" for metric, values in scores.items():\n",
|
||
" result[metric] = {\n",
|
||
" 'mean': np.mean(values),\n",
|
||
" 'std': np.std(values)\n",
|
||
" }\n",
|
||
" \n",
|
||
" return result\n",
|
||
" \n",
|
||
" except Exception as e:\n",
|
||
" print(f\" ⚠️ 参数组合评估失败: {e}\")\n",
|
||
" return {\n",
|
||
" 'accuracy': {'mean': 0.0, 'std': 1.0},\n",
|
||
" 'f1_macro': {'mean': 0.0, 'std': 1.0},\n",
|
||
" 'precision_macro': {'mean': 0.0, 'std': 1.0},\n",
|
||
" 'recall_macro': {'mean': 0.0, 'std': 1.0}\n",
|
||
" }\n",
|
||
"\n",
|
||
"# 开始改进的参数搜索\n",
|
||
"print(f\"\\n🚀 开始改进的参数搜索...\")\n",
|
||
"best_score = 0\n",
|
||
"best_params = None\n",
|
||
"results = []\n",
|
||
"\n",
|
||
"for i, params in enumerate(selected_combinations):\n",
|
||
" # 添加固定参数\n",
|
||
" params_dict = params.copy()\n",
|
||
" params_dict.update({\n",
|
||
" 'objective': 'multiclass',\n",
|
||
" 'num_class': len(np.unique(sample_y_mapped)),\n",
|
||
" 'metric': 'multi_logloss',\n",
|
||
" 'boosting_type': 'gbdt',\n",
|
||
" 'verbosity': -1,\n",
|
||
" 'seed': 42\n",
|
||
" })\n",
|
||
" \n",
|
||
" print(f\"\\n🔧 测试组合 {i+1}/{len(selected_combinations)}:\")\n",
|
||
" key_params = {k: v for k, v in params.items()}\n",
|
||
" print(f\" 参数: {key_params}\")\n",
|
||
" \n",
|
||
" # 评估参数\n",
|
||
" metrics = evaluate_params_improved(params_dict, sample_X, sample_y_mapped, IMPROVED_PARAM_CONFIG['cv_folds'])\n",
|
||
" \n",
|
||
" result = {\n",
|
||
" 'params': params.copy(),\n",
|
||
" 'metrics': metrics\n",
|
||
" }\n",
|
||
" results.append(result)\n",
|
||
" \n",
|
||
" accuracy = metrics['accuracy']['mean']\n",
|
||
" f1 = metrics['f1_macro']['mean']\n",
|
||
" \n",
|
||
" print(f\" 准确率: {accuracy:.4f} ± {metrics['accuracy']['std']:.4f}\")\n",
|
||
" print(f\" F1分数: {f1:.4f} ± {metrics['f1_macro']['std']:.4f}\")\n",
|
||
" \n",
|
||
" # 使用F1分数作为主要评估指标\n",
|
||
" if f1 > best_score:\n",
|
||
" best_score = f1\n",
|
||
" best_params = params.copy()\n",
|
||
" print(f\" ✨ 新的最佳参数!F1分数: {f1:.4f}\")\n",
|
||
"\n",
|
||
"print(f\"\\n🏆 改进的参数搜索完成!\")\n",
|
||
"print(f\"=\" * 80)\n",
|
||
"print(f\"🎯 最佳参数组合:\")\n",
|
||
"for key, value in best_params.items():\n",
|
||
" print(f\" {key}: {value}\")\n",
|
||
"\n",
|
||
"print(f\"\\n📈 最佳性能:\")\n",
|
||
"best_result = max(results, key=lambda x: x['metrics']['f1_macro']['mean'])\n",
|
||
"for metric, values in best_result['metrics'].items():\n",
|
||
" print(f\" {metric}: {values['mean']:.4f} ± {values['std']:.4f}\")\n",
|
||
"\n",
|
||
"# 保存结果\n",
|
||
"BEST_PARAMS_IMPROVED = best_params\n",
|
||
"PARAM_SEARCH_RESULTS_IMPROVED = results\n",
|
||
"\n",
|
||
"print(f\"\\n💾 改进的参数搜索结果已保存:\")\n",
|
||
"print(f\" 最佳参数变量: BEST_PARAMS_IMPROVED\")\n",
|
||
"print(f\" 所有结果变量: PARAM_SEARCH_RESULTS_IMPROVED\")\n",
|
||
"\n",
|
||
"# 显示前5个最佳结果\n",
|
||
"print(f\"\\n🔝 Top 5 参数组合 (按F1分数排序):\")\n",
|
||
"sorted_results = sorted(results, key=lambda x: x['metrics']['f1_macro']['mean'], reverse=True)\n",
|
||
"for i, result in enumerate(sorted_results[:5]):\n",
|
||
" f1 = result['metrics']['f1_macro']['mean']\n",
|
||
" accuracy = result['metrics']['accuracy']['mean']\n",
|
||
" print(f\" {i+1}. F1: {f1:.4f}, 准确率: {accuracy:.4f}\")\n",
|
||
" print(f\" 参数: {result['params']}\")\n",
|
||
"\n",
|
||
"print(f\"\\n✅ 改进的参数搜索完成!现在应该能看到不同参数组合的性能差异\")"
|
||
]
|
||
},
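{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below is a minimal sketch of how the winning combination could be turned into one final model. It assumes `BEST_PARAMS_IMPROVED`, `sample_X`, and `sample_y_mapped` from the search cell above are still in memory; the 80/20 split and the names `final_model`, `X_tr`, `X_te` are illustrative, not part of the original pipeline."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: fit one final model with the best parameters found by the search.\n",
"# Assumes BEST_PARAMS_IMPROVED, sample_X, sample_y_mapped exist (see the cell above).\n",
"import numpy as np\n",
"import lightgbm as lgb\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import classification_report\n",
"\n",
"final_params = {**BEST_PARAMS_IMPROVED,\n",
"                'objective': 'multiclass',\n",
"                'num_class': len(np.unique(sample_y_mapped)),\n",
"                'metric': 'multi_logloss',\n",
"                'boosting_type': 'gbdt',\n",
"                'verbosity': -1,\n",
"                'seed': 42}\n",
"\n",
"# Illustrative 80/20 hold-out split\n",
"X_tr, X_te, y_tr, y_te = train_test_split(\n",
"    sample_X, sample_y_mapped, test_size=0.2, random_state=42, stratify=sample_y_mapped)\n",
"\n",
"train_set = lgb.Dataset(X_tr, label=y_tr)\n",
"valid_set = lgb.Dataset(X_te, label=y_te, reference=train_set)\n",
"\n",
"final_model = lgb.train(\n",
"    final_params,\n",
"    train_set,\n",
"    valid_sets=[valid_set],\n",
"    num_boost_round=200,\n",
"    callbacks=[lgb.early_stopping(20), lgb.log_evaluation(0)]\n",
")\n",
"\n",
"# Per-class report on the hold-out fold\n",
"y_pred = np.argmax(final_model.predict(X_te), axis=1)\n",
"print(classification_report(y_te, y_pred, zero_division=0))"
]
},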
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Best combination from the search above (see BEST_PARAMS_IMPROVED).\n",
"# NOTE: 'num_class' and 'metric' still have to be supplied at training time,\n",
"# exactly as the search cell does before calling lgb.train.\n",
"OPTIMIZED_PARAMS = {\n",
"    'learning_rate': 0.2,      # relatively high learning rate\n",
"    'num_leaves': 31,          # moderate number of leaves\n",
"    'max_depth': 12,           # fairly deep trees\n",
"    'feature_fraction': 0.8,   # feature subsampling\n",
"    'bagging_fraction': 0.8,   # row subsampling (only active when bagging_freq > 0)\n",
"    'min_data_in_leaf': 10,    # small minimum leaf size\n",
"    'lambda_l1': 0.0,          # no L1 regularization\n",
"    'lambda_l2': 0.0,          # no L2 regularization\n",
"    'objective': 'multiclass',\n",
"    'boosting_type': 'gbdt',\n",
"    'verbosity': -1,\n",
"    'seed': 42\n",
"}"
]
}
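,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal usage sketch for `OPTIMIZED_PARAMS`: the dict above deliberately omits `num_class`, so it has to be filled in from the labels at training time. `X` and `y` are placeholders for whatever feature/label arrays are in scope (e.g. `sample_X`, `sample_y_mapped` from the search cell); the helper name `train_with_optimized_params` is illustrative."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: train a multiclass booster from the tuned parameter set.\n",
"import numpy as np\n",
"import lightgbm as lgb\n",
"\n",
"def train_with_optimized_params(X, y, num_boost_round=100):\n",
"    \"\"\"Train a multiclass LightGBM model with OPTIMIZED_PARAMS (illustrative helper).\"\"\"\n",
"    params = OPTIMIZED_PARAMS.copy()\n",
"    params['num_class'] = len(np.unique(y))  # required by objective='multiclass'\n",
"    params['metric'] = 'multi_logloss'\n",
"    return lgb.train(params, lgb.Dataset(X, label=y), num_boost_round=num_boost_round)\n",
"\n",
"# Example (assuming the search-cell arrays are still defined):\n",
"# model = train_with_optimized_params(sample_X, sample_y_mapped)\n",
"# probs = model.predict(sample_X[:5])  # shape: (5, num_class)"
]
}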
],
"metadata": {
"kaggle": {
"accelerator": "tpu1vmV38",
"dataSources": [
{
"databundleVersionId": 13056355,
"sourceId": 106809,
"sourceType": "competition"
}
],
"dockerImageVersionId": 31091,
"isGpuEnabled": false,
"isInternetEnabled": true,
"language": "python",
"sourceType": "notebook"
},
"kernelspec": {
"display_name": "b2txt25",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.18"
}
},
"nbformat": 4,
"nbformat_minor": 4
}