{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 环境配置与Utils"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# # %%bash\n",
"# rm -rf /kaggle/working/nejm-brain-to-text/\n",
"# git clone https://github.com/ZH-CEN/nejm-brain-to-text.git\n",
"# cp /kaggle/input/brain-to-text-baseline-model/t15_copyTask.pkl /kaggle/working/nejm-brain-to-text/data/t15_copyTask.pkl\n",
"\n",
"# ln -s /kaggle/input/brain-to-text-25/t15_pretrained_rnn_baseline/t15_pretrained_rnn_baseline /kaggle/working/nejm-brain-to-text/data\n",
"# ln -s /kaggle/input/brain-to-text-25/t15_copyTask_neuralData/hdf5_data_final /kaggle/working/nejm-brain-to-text/data\n",
"# ln -s /kaggle/input/rnn-pretagged-data /kaggle/working/nejm-brain-to-text/data/concatenated_data\n",
"\n",
"# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126\n",
"\n",
"# pip install \\\n",
"# jupyter==1.1.1 \\\n",
"# \"numpy>=1.26.0,<2.1.0\" \\\n",
"# pandas==2.3.0 \\\n",
"# matplotlib==3.10.1 \\\n",
"# scipy==1.15.2 \\\n",
"# scikit-learn==1.6.1 \\\n",
"# lightgbm==4.3.0 \\\n",
"# tqdm==4.67.1 \\\n",
"# g2p_en==2.1.0 \\\n",
"# h5py==3.13.0 \\\n",
"# omegaconf==2.3.0 \\\n",
"# editdistance==0.8.1 \\\n",
"# huggingface-hub==0.33.1 \\\n",
"# transformers==4.53.0 \\\n",
"# tokenizers==0.21.2 \\\n",
"# accelerate==1.8.1 \\\n",
"# bitsandbytes==0.46.0 \\\n",
"# seaborn==0.13.2\n",
"# cd /kaggle/working/nejm-brain-to-text/\n",
"# pip install -e ."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"==================================================\n",
"🔧 LightGBM GPU环境检查\n",
"==================================================\n",
"❌ 未检测到NVIDIA GPU或驱动\n",
"\n",
"❌ 未安装CUDA工具包\n"
]
}
],
"source": [
"# 🚀 LightGBM GPU支持检查与配置\n",
"\n",
"print(\"=\"*50)\n",
"print(\"🔧 LightGBM GPU环境检查\")\n",
"print(\"=\"*50)\n",
"\n",
"# 检查CUDA和GPU驱动\n",
"import subprocess\n",
"import sys\n",
"\n",
"def run_command(command):\n",
" \"\"\"运行命令并返回结果\"\"\"\n",
" try:\n",
" result = subprocess.run(command, shell=True, capture_output=True, text=True, timeout=10)\n",
" return result.stdout.strip(), result.returncode == 0\n",
" except Exception as e:\n",
" return str(e), False\n",
"\n",
"# 检查NVIDIA GPU\n",
"nvidia_output, nvidia_success = run_command(\"nvidia-smi --query-gpu=name,memory.total,driver_version --format=csv,noheader,nounits\")\n",
"if nvidia_success:\n",
" print(\"✅ NVIDIA GPU检测:\")\n",
" for line in nvidia_output.split('\\n'):\n",
" if line.strip():\n",
" print(f\" {line}\")\n",
"else:\n",
" print(\"❌ 未检测到NVIDIA GPU或驱动\")\n",
"\n",
"# 检查CUDA版本\n",
"cuda_output, cuda_success = run_command(\"nvcc --version\")\n",
"if cuda_success:\n",
" print(\"\\n✅ CUDA工具包:\")\n",
" # 提取CUDA版本\n",
" for line in cuda_output.split('\\n'):\n",
" if 'release' in line:\n",
" print(f\" {line.strip()}\")\n",
"else:\n",
" print(\"\\n❌ 未安装CUDA工具包\")\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# %cd /kaggle/working/nejm-brain-to-text\n",
"import numpy as np\n",
"import os\n",
"import pickle\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib\n",
"from g2p_en import G2p\n",
"import pandas as pd\n",
"import numpy as np\n",
"from nejm_b2txt_utils.general_utils import *\n",
"matplotlib.rcParams['pdf.fonttype'] = 42\n",
"matplotlib.rcParams['ps.fonttype'] = 42\n",
"matplotlib.rcParams['font.family'] = 'sans-serif'\n",
"matplotlib.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans', 'Arial Unicode MS', 'sans-serif']\n",
"matplotlib.rcParams['axes.unicode_minus'] = False\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\IPython\\core\\magics\\osm.py:417: UserWarning: This is now an optional IPython functionality, setting dhist requires you to install the `pickleshare` library.\n",
" self.shell.db['dhist'] = compress_dhist(dhist)[-100:]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"f:\\BRAIN-TO-TEXT\\nejm-brain-to-text\\model_training\n"
]
}
],
"source": [
"%cd ../model_training/\n",
"from data_augmentations import gauss_smooth"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"LOGIT_TO_PHONEME = [\n",
" 'BLANK',\n",
" 'AA', 'AE', 'AH', 'AO', 'AW',\n",
" 'AY', 'B', 'CH', 'D', 'DH',\n",
" 'EH', 'ER', 'EY', 'F', 'G',\n",
" 'HH', 'IH', 'IY', 'JH', 'K',\n",
" 'L', 'M', 'N', 'NG', 'OW',\n",
" 'OY', 'P', 'R', 'S', 'SH',\n",
" 'T', 'TH', 'UH', 'UW', 'V',\n",
" 'W', 'Y', 'Z', 'ZH',\n",
" ' | ',\n",
"]\n",
"# 全局配置\n",
"BALANCE_CONFIG = {\n",
" 'enable_balance': True, # 是否启用数据平衡\n",
" 'undersample_labels': [0, 40], # 需要下采样的标签 (blank等高频标签)\n",
" 'oversample_threshold': 0.5, # 过采样阈值 (相对于均值的比例)\n",
" 'random_state': 42 # 随机种子\n",
"}\n",
"# 全局PCA配置\n",
"PCA_CONFIG = {\n",
" 'enable_pca': True, # 是否启用PCA\n",
" 'n_components': None, # None=自动选择, 或指定具体数值\n",
" 'variance_threshold': 0.95, # 保留95%的方差\n",
" 'sample_size': 15000, # 用于拟合PCA的样本数\n",
"}\n",
"\n",
"# 全局PCA对象 (确保只拟合一次)\n",
"GLOBAL_PCA = {\n",
" 'scaler': None,\n",
" 'pca': None,\n",
" 'is_fitted': False,\n",
" 'n_components': None\n",
"}\n",
"# 设置数据目录和参数【PCA初始化】\n",
"data_dir = '../data/concatenated_data'\n",
"MAX_SAMPLES_PER_FILE = -1 # 每个文件最大样本数,可调整"
]
},
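{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check (added as an illustration, not part of the original flow): the last 41 dimensions of each frame are RNN logits indexed by `LOGIT_TO_PHONEME`, so an `np.argmax` over them yields the phoneme label, the same rule the feature extraction below uses. The logit vector here is synthetic."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustration only: synthetic logits, decoded with the same argmax rule\n",
"# that the feature extraction applies to real RNN outputs.\n",
"import numpy as np\n",
"\n",
"rng = np.random.default_rng(0)\n",
"fake_logits = rng.normal(size=41)    # stands in for one frame's 41 RNN logits\n",
"label = int(np.argmax(fake_logits))\n",
"print(label, LOGIT_TO_PHONEME[label])  # index and its phoneme symbol"
]
},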
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 数据读取工作流"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2⃣ 数据加载与PCA降维"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# 🚀 内存友好的数据读取 - 分批加载策略 + PCA降维 【这里还缺一个采样】\n",
"\n",
"import os\n",
"import numpy as np\n",
"import gc\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.preprocessing import StandardScaler\n",
"import joblib\n",
"import matplotlib.pyplot as plt\n",
"\n",
"\n",
"def load_data_batch(data_dir, data_type, max_samples_per_file=5000):\n",
" \"\"\"\n",
" 分批加载指定类型的数据\n",
" \n",
" Args:\n",
" data_dir: 数据目录\n",
" data_type: 'train', 'val', 'test'\n",
" max_samples_per_file: 每个文件最大加载样本数\n",
" \n",
" Returns:\n",
" generator: 数据批次生成器\n",
" \"\"\"\n",
" files = [f for f in os.listdir(data_dir) if f.endswith('.npz') and data_type in f]\n",
" \n",
" for file_idx, f in enumerate(files):\n",
" print(f\" 正在加载文件 {file_idx+1}/{len(files)}: {f}\")\n",
" \n",
" data = np.load(os.path.join(data_dir, f), allow_pickle=True)\n",
" trials = data['neural_logits_concatenated']\n",
" \n",
" # 限制每个文件的样本数\n",
" if len(trials) > max_samples_per_file and max_samples_per_file != -1:\n",
" trials = trials[:max_samples_per_file]\n",
" print(f\" 限制样本数至: {max_samples_per_file}\")\n",
" \n",
" yield trials, f\n",
" \n",
" # 清理内存\n",
" del data, trials\n",
" gc.collect()\n",
"\n",
"def extract_features_labels_batch(trials_batch):\n",
" \"\"\"\n",
" 从试验批次中提取特征和标签\n",
" \"\"\"\n",
" features = []\n",
" labels = []\n",
" \n",
" for trial in trials_batch:\n",
" if trial.shape[0] > 0:\n",
" for t in range(trial.shape[0]):\n",
" neural_features = trial[t, :7168] # 前7168维神经特征\n",
" rnn_logits = trial[t, 7168:] # 后41维RNN输出\n",
" phoneme_label = np.argmax(rnn_logits)\n",
" features.append(neural_features)\n",
" labels.append(phoneme_label)\n",
" \n",
" return np.array(features), np.array(labels)\n",
"\n",
"def fit_global_pca(data_dir, config):\n",
" \"\"\"\n",
" 在训练数据上拟合全局PCA (只执行一次)\n",
" \"\"\"\n",
" if GLOBAL_PCA['is_fitted'] or not config['enable_pca']:\n",
" print(\"🔧 PCA已拟合或未启用跳过拟合步骤\")\n",
" return\n",
" \n",
" print(f\"\\n🔧 拟合全局PCA降维器...\")\n",
" print(f\" 配置: {config}\")\n",
" \n",
" # 收集训练样本\n",
" sample_features = []\n",
" collected_samples = 0\n",
" \n",
" for trials_batch, filename in load_data_batch(data_dir, 'train', 5000):\n",
" features, labels = extract_features_labels_batch(trials_batch)\n",
" sample_features.append(features)\n",
" collected_samples += features.shape[0]\n",
" \n",
" if collected_samples >= config['sample_size']:\n",
" break\n",
" \n",
" if sample_features:\n",
" # 合并样本数据\n",
" X_sample = np.vstack(sample_features)[:config['sample_size']]\n",
" print(f\" 实际样本数: {X_sample.shape[0]}\")\n",
" print(f\" 原始特征数: {X_sample.shape[1]}\")\n",
" \n",
" # 标准化\n",
" GLOBAL_PCA['scaler'] = StandardScaler()\n",
" X_sample_scaled = GLOBAL_PCA['scaler'].fit_transform(X_sample)\n",
" \n",
" # 确定PCA成分数\n",
" if config['n_components'] is None:\n",
" print(f\" 🔍 自动选择PCA成分数...\")\n",
" pca_full = PCA()\n",
" pca_full.fit(X_sample_scaled)\n",
" \n",
" cumsum_var = np.cumsum(pca_full.explained_variance_ratio_)\n",
" optimal_components = np.argmax(cumsum_var >= config['variance_threshold']) + 1\n",
" GLOBAL_PCA['n_components'] = min(optimal_components, X_sample.shape[1])\n",
" \n",
" print(f\" 保留{config['variance_threshold']*100}%方差需要: {optimal_components} 个成分\")\n",
" print(f\" 选择成分数: {GLOBAL_PCA['n_components']}\")\n",
" else:\n",
" GLOBAL_PCA['n_components'] = config['n_components']\n",
" print(f\" 使用指定成分数: {GLOBAL_PCA['n_components']}\")\n",
" \n",
" # 拟合最终PCA\n",
" GLOBAL_PCA['pca'] = PCA(n_components=GLOBAL_PCA['n_components'], random_state=42)\n",
" GLOBAL_PCA['pca'].fit(X_sample_scaled)\n",
" GLOBAL_PCA['is_fitted'] = True\n",
" \n",
" # 保存模型\n",
" pca_path = \"global_pca_model.joblib\"\n",
" joblib.dump({\n",
" 'scaler': GLOBAL_PCA['scaler'], \n",
" 'pca': GLOBAL_PCA['pca'],\n",
" 'n_components': GLOBAL_PCA['n_components']\n",
" }, pca_path)\n",
" \n",
" print(f\" ✅ 全局PCA拟合完成!\")\n",
" print(f\" 降维: {X_sample.shape[1]} → {GLOBAL_PCA['n_components']}\")\n",
" print(f\" 降维比例: {GLOBAL_PCA['n_components']/X_sample.shape[1]:.2%}\")\n",
" print(f\" 保留方差: {GLOBAL_PCA['pca'].explained_variance_ratio_.sum():.4f}\")\n",
" print(f\" 模型已保存: {pca_path}\")\n",
" \n",
" # 清理样本数据\n",
" del sample_features, X_sample, X_sample_scaled\n",
" gc.collect()\n",
" else:\n",
" print(\"❌ 无法收集样本数据用于PCA拟合\")\n",
"\n",
"def apply_pca_transform(features):\n",
" \"\"\"\n",
" 应用全局PCA变换\n",
" \"\"\"\n",
" if not PCA_CONFIG['enable_pca'] or not GLOBAL_PCA['is_fitted']:\n",
" return features\n",
" \n",
" # 标准化 + PCA变换\n",
" features_scaled = GLOBAL_PCA['scaler'].transform(features)\n",
" features_pca = GLOBAL_PCA['pca'].transform(features_scaled)\n",
" return features_pca\n",
"\n"
]
},
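{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch (added for illustration): `fit_global_pca` saves the scaler/PCA pair to `global_pca_model.joblib`, so a fresh session can restore `GLOBAL_PCA` from disk instead of refitting. This assumes the file exists from a previous run."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: restore the global PCA from disk (assumes a previous run saved it)\n",
"import os\n",
"import joblib\n",
"\n",
"if os.path.exists(\"global_pca_model.joblib\"):\n",
"    saved = joblib.load(\"global_pca_model.joblib\")\n",
"    GLOBAL_PCA['scaler'] = saved['scaler']\n",
"    GLOBAL_PCA['pca'] = saved['pca']\n",
"    GLOBAL_PCA['n_components'] = saved['n_components']\n",
"    GLOBAL_PCA['is_fitted'] = True\n",
"    print(f\"Restored PCA: 7168 → {GLOBAL_PCA['n_components']} components\")\n",
"else:\n",
"    print(\"No saved PCA model; run fit_global_pca(data_dir, PCA_CONFIG) first\")"
]
},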
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 📊 数据平衡策略 - 标签分布分析与采样优化"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# 【采样核心实现】\n",
"def balance_dataset(X, y, config=BALANCE_CONFIG):\n",
" \"\"\"\n",
" 对数据集进行平衡处理:下采样 + 过采样\n",
" \n",
" Args:\n",
" X: 特征数据\n",
" y: 标签数据\n",
" config: 平衡配置\n",
" \n",
" Returns:\n",
" X_balanced, y_balanced: 平衡后的数据\n",
" \"\"\"\n",
" if not config['enable_balance']:\n",
" print(\"🔕 数据平衡已禁用,返回原始数据\")\n",
" return X, y\n",
" \n",
" print(f\"\\n⚖ 开始数据平衡处理...\")\n",
" print(f\" 原始数据: {X.shape[0]:,} 样本\")\n",
" \n",
" # 分析当前分布 (只考虑1-39号标签的均值)\n",
" label_counts = Counter(y)\n",
" counts_exclude_0_40 = [label_counts.get(i, 0) for i in range(1, 40)] # 1-39号标签\n",
" mean_count = np.mean(counts_exclude_0_40) # 只计算1-39号标签的均值\n",
" \n",
" print(f\" 均值样本数 (标签1-39): {mean_count:.0f}\")\n",
" print(f\" 下采样标签: {config['undersample_labels']}\")\n",
" print(f\" 过采样阈值: {config['oversample_threshold']} * 均值\")\n",
" \n",
" # 准备平衡后的数据\n",
" X_balanced = []\n",
" y_balanced = []\n",
" \n",
" random.seed(config['random_state'])\n",
" np.random.seed(config['random_state'])\n",
" \n",
" for label in range(41):\n",
" # 获取当前标签的所有样本\n",
" label_mask = (y == label)\n",
" X_label = X[label_mask]\n",
" y_label = y[label_mask]\n",
" current_count = len(y_label)\n",
" \n",
" if current_count == 0:\n",
" continue\n",
" \n",
" # 决定采样策略\n",
" if label in config['undersample_labels']:\n",
" # 下采样到均值水平\n",
" target_count = int(mean_count)\n",
" if current_count > target_count:\n",
" # 下采样\n",
" indices = np.random.choice(current_count, target_count, replace=False)\n",
" X_resampled = X_label[indices]\n",
" y_resampled = y_label[indices]\n",
" print(f\" 📉 标签 {label}: {current_count} → {target_count} (下采样)\")\n",
" else:\n",
" X_resampled = X_label\n",
" y_resampled = y_label\n",
" print(f\" ➡️ 标签 {label}: {current_count} (无需下采样)\")\n",
" \n",
" elif current_count < mean_count * config['oversample_threshold']:\n",
" # 过采样到阈值水平\n",
" target_count = int(mean_count * config['oversample_threshold'])\n",
" if current_count < target_count:\n",
" # 过采样\n",
" X_resampled, y_resampled = resample(\n",
" X_label, y_label, \n",
" n_samples=target_count, \n",
" random_state=config['random_state']\n",
" )\n",
" print(f\" 📈 标签 {label}: {current_count} → {target_count} (过采样)\")\n",
" else:\n",
" X_resampled = X_label\n",
" y_resampled = y_label\n",
" print(f\" ➡️ 标签 {label}: {current_count} (无需过采样)\")\n",
" else:\n",
" # 保持不变\n",
" X_resampled = X_label\n",
" y_resampled = y_label\n",
" print(f\" ✅ 标签 {label}: {current_count} (已平衡)\")\n",
" \n",
" X_balanced.append(X_resampled)\n",
" y_balanced.append(y_resampled)\n",
" \n",
" # 合并所有平衡后的数据\n",
" X_balanced = np.vstack(X_balanced)\n",
" y_balanced = np.hstack(y_balanced)\n",
" \n",
" # 随机打乱\n",
" shuffle_indices = np.random.permutation(len(y_balanced))\n",
" X_balanced = X_balanced[shuffle_indices]\n",
" y_balanced = y_balanced[shuffle_indices]\n",
" \n",
" print(f\" ✅ 平衡完成: {X_balanced.shape[0]:,} 样本\")\n",
" print(f\" 数据变化: {X.shape[0]:,} → {X_balanced.shape[0]:,} ({X_balanced.shape[0]/X.shape[0]:.2f}x)\")\n",
" \n",
" return X_balanced, y_balanced\n"
]
},
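{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small synthetic check (illustration only): run `balance_dataset` on fake data where labels 0 and 40 are oversized and label 5 is deliberately rare, and confirm that 0/40 are undersampled to the label 1-39 mean while label 5 is oversampled to half that mean."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Synthetic demo of balance_dataset (fake 2-D features; illustration only)\n",
"import numpy as np\n",
"\n",
"rng = np.random.default_rng(42)\n",
"counts = {label: 100 for label in range(1, 40)}  # labels 1-39: 100 samples each\n",
"counts[5] = 10     # one deliberately rare label\n",
"counts[0] = 1000   # oversized blank label\n",
"counts[40] = 800   # oversized high-frequency label\n",
"y_demo = np.concatenate([np.full(c, l) for l, c in counts.items()])\n",
"X_demo = rng.normal(size=(len(y_demo), 2))\n",
"\n",
"X_bal, y_bal = balance_dataset(X_demo, y_demo)\n",
"print(\"label 0:\", (y_bal == 0).sum(), \"| label 5:\", (y_bal == 5).sum(), \"| label 40:\", (y_bal == 40).sum())"
]
},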
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 🔄 集成数据平衡的内存友好数据加载器"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 🧪 数据平衡效果测试"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 🚀 改进版智能数据处理管道"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"🚀 创建智能数据处理管道...\n",
"✅ 管道创建完成准备执行步骤1...\n"
]
}
],
"source": [
"# 🚀 改进版智能数据处理管道【没有解决分批训练的问题】\n",
"# 流程:分析分布 → 确定采样比率 → 拟合PCA只下采样 → 数据处理(下采样+上采样+PCA\n",
"\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from collections import Counter\n",
"from sklearn.utils import resample\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.preprocessing import StandardScaler\n",
"import joblib\n",
"import random\n",
"import gc\n",
"\n",
"class SmartDataPipeline:\n",
" \"\"\"\n",
" 智能数据处理管道\n",
" 步骤1: 分析数据分布,确定采样策略\n",
" 步骤2: 仅下采样拟合PCA参数\n",
" 步骤3: 数据处理时应用完整采样+PCA降维\n",
" \"\"\"\n",
" \n",
" def __init__(self, data_dir, random_state=42):\n",
" self.data_dir = data_dir\n",
" self.random_state = random_state\n",
" \n",
" # 步骤1: 分布分析结果\n",
" self.distribution_analysis = None\n",
" self.sampling_strategy = None\n",
" \n",
" # 步骤2: PCA参数基于下采样数据拟合\n",
" self.pca_scaler = None\n",
" self.pca_model = None\n",
" self.pca_components = None\n",
" self.pca_fitted = False\n",
" \n",
" # 配置参数\n",
" self.undersample_labels = [0, 40] # 需要下采样的标签\n",
" self.oversample_threshold = 0.5 # 过采样阈值(相对于均值)\n",
" self.pca_variance_threshold = 0.95 # PCA保留方差比例\n",
" self.pca_sample_size = 15000 # PCA拟合样本数\n",
" \n",
" def step1_analyze_distribution(self, max_samples=100000):\n",
" \"\"\"\n",
" 步骤1: 分析数据分布,确定采样策略\n",
" \"\"\"\n",
" print(\"🔍 步骤1: 分析数据分布...\")\n",
" \n",
" # 分析验证集分布(代表整体分布特征)\n",
" all_labels = []\n",
" for trials_batch, filename in load_data_batch(self.data_dir, 'val', 5000):\n",
" _, labels = extract_features_labels_batch(trials_batch)\n",
" all_labels.extend(labels.tolist())\n",
" if len(all_labels) >= max_samples:\n",
" break\n",
" \n",
" # 统计分析\n",
" label_counts = Counter(all_labels)\n",
" \n",
" # 计算1-39标签的均值排除0和40\n",
" counts_1_39 = [label_counts.get(i, 0) for i in range(1, 40)]\n",
" target_mean = np.mean(counts_1_39)\n",
" \n",
" # 生成采样策略\n",
" sampling_strategy = {}\n",
" for label in range(41):\n",
" current_count = label_counts.get(label, 0)\n",
" \n",
" if label in self.undersample_labels:\n",
" # 下采样到均值水平\n",
" target_count = int(target_mean)\n",
" action = 'undersample' if current_count > target_count else 'keep'\n",
" elif current_count < target_mean * self.oversample_threshold:\n",
" # 过采样到阈值水平\n",
" target_count = int(target_mean * self.oversample_threshold)\n",
" action = 'oversample' if current_count < target_count else 'keep'\n",
" else:\n",
" # 保持不变\n",
" target_count = current_count\n",
" action = 'keep'\n",
" \n",
" sampling_strategy[label] = {\n",
" 'current_count': current_count,\n",
" 'target_count': target_count,\n",
" 'action': action\n",
" }\n",
" \n",
" self.distribution_analysis = {\n",
" 'label_counts': label_counts,\n",
" 'target_mean': target_mean,\n",
" 'total_samples': len(all_labels)\n",
" }\n",
" self.sampling_strategy = sampling_strategy\n",
" \n",
" print(f\" ✅ 分析完成: {len(all_labels):,} 样本\")\n",
" print(f\" 📊 标签1-39均值: {target_mean:.0f}\")\n",
" print(f\" 📉 下采样标签: {self.undersample_labels} → {target_mean:.0f}\")\n",
" print(f\" 📈 过采样阈值: {self.oversample_threshold} × 均值 = {target_mean * self.oversample_threshold:.0f}\")\n",
" \n",
" return self.distribution_analysis, self.sampling_strategy\n",
"\n",
"# 创建智能数据处理管道\n",
"print(\"🚀 创建智能数据处理管道...\")\n",
"pipeline = SmartDataPipeline(data_dir, random_state=42)\n",
"print(\"✅ 管道创建完成准备执行步骤1...\")"
]
},
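{
"cell_type": "markdown",
"metadata": {},
"source": [
"After step 1, `pipeline.sampling_strategy` maps each label 0-40 to a dict with `current_count`, `target_count`, and `action` ('undersample' / 'oversample' / 'keep'). The cell below (added as a convenience) pretty-prints the non-'keep' entries once step 1 has run."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Convenience inspection: list only the labels whose counts step 1 wants to change\n",
"if pipeline.sampling_strategy is not None:\n",
"    for label, s in pipeline.sampling_strategy.items():\n",
"        if s['action'] != 'keep':\n",
"            print(f\"label {label:2d}: {s['action']:11s} {s['current_count']:6d} → {s['target_count']:6d}\")\n",
"else:\n",
"    print(\"Run pipeline.step1_analyze_distribution() first\")"
]
},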
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"✅ 步骤2方法已添加到管道\n"
]
}
],
"source": [
"# 继续添加智能管道的其他方法【管道完善】\n",
"\n",
"def step2_fit_pca_with_undersampling(self):\n",
" \"\"\"\n",
" 步骤2: 仅对下采样数据拟合PCA参数不进行过采样避免PCA被过采样影响\n",
" \"\"\"\n",
" if self.sampling_strategy is None:\n",
" raise ValueError(\"请先执行步骤1: step1_analyze_distribution()\")\n",
" \n",
" print(\"\\n🔧 步骤2: 拟合PCA参数仅下采样不过采样...\")\n",
" \n",
" # 🔍 优先检查是否存在已保存的PCA模型\n",
" pca_path = \"smart_pipeline_pca.joblib\"\n",
" if os.path.exists(pca_path):\n",
" print(f\" 📁 发现已存在的PCA模型文件: {pca_path}\")\n",
" try:\n",
" # 加载已保存的PCA模型\n",
" pca_data = joblib.load(pca_path)\n",
" \n",
" self.pca_scaler = pca_data['scaler']\n",
" self.pca_model = pca_data['pca']\n",
" self.pca_components = pca_data['components']\n",
" self.pca_fitted = True\n",
" \n",
" print(f\" ✅ PCA模型加载成功!\")\n",
" print(f\" 降维: 7168 → {self.pca_components}\")\n",
" print(f\" 降维比例: {self.pca_components/7168:.2%}\")\n",
" print(f\" 保留方差: {self.pca_model.explained_variance_ratio_.sum():.4f}\")\n",
" print(f\" 💡 跳过PCA拟合步骤使用已保存的模型\")\n",
" return\n",
" \n",
" except Exception as e:\n",
" print(f\" ⚠️ PCA模型加载失败: {e}\")\n",
" print(f\" 🔄 将重新拟合PCA模型...\")\n",
" else:\n",
" print(f\" 📄 未找到已保存的PCA模型文件: {pca_path}\")\n",
" print(f\" 🔄 将从头拟合PCA模型...\")\n",
" \n",
" # 收集用于PCA拟合的样本只下采样不过采样\n",
" pca_features = []\n",
" collected_samples = 0\n",
" \n",
" for trials_batch, filename in load_data_batch(self.data_dir, 'train', 3000):\n",
" features, labels = extract_features_labels_batch(trials_batch)\n",
" \n",
" # 对当前批次应用仅下采样策略\n",
" downsampled_features, downsampled_labels = self._apply_undersampling_only(features, labels)\n",
" \n",
" if downsampled_features.shape[0] > 0:\n",
" pca_features.append(downsampled_features)\n",
" collected_samples += downsampled_features.shape[0]\n",
" \n",
" if collected_samples >= self.pca_sample_size:\n",
" break\n",
" \n",
" if pca_features:\n",
" # 合并样本\n",
" X_pca_sample = np.vstack(pca_features)[:self.pca_sample_size]\n",
" print(f\" 📦 PCA拟合样本: {X_pca_sample.shape[0]:,} 个下采样样本\")\n",
" print(f\" 🔢 原始特征维度: {X_pca_sample.shape[1]}\")\n",
" \n",
" # 标准化\n",
" self.pca_scaler = StandardScaler()\n",
" X_scaled = self.pca_scaler.fit_transform(X_pca_sample)\n",
" \n",
" # 确定PCA成分数\n",
" pca_full = PCA(random_state=self.random_state)\n",
" pca_full.fit(X_scaled)\n",
" cumsum_var = np.cumsum(pca_full.explained_variance_ratio_)\n",
" optimal_components = np.argmax(cumsum_var >= self.pca_variance_threshold) + 1\n",
" self.pca_components = min(optimal_components, X_pca_sample.shape[1])\n",
" \n",
" # 拟合最终PCA\n",
" self.pca_model = PCA(n_components=self.pca_components, random_state=self.random_state)\n",
" self.pca_model.fit(X_scaled)\n",
" self.pca_fitted = True\n",
" \n",
" # 保存PCA模型\n",
" pca_path = \"smart_pipeline_pca.joblib\"\n",
" joblib.dump({\n",
" 'scaler': self.pca_scaler,\n",
" 'pca': self.pca_model,\n",
" 'components': self.pca_components\n",
" }, pca_path)\n",
" \n",
" print(f\" ✅ PCA拟合完成!\")\n",
" print(f\" 降维: {X_pca_sample.shape[1]} → {self.pca_components}\")\n",
" print(f\" 降维比例: {self.pca_components/X_pca_sample.shape[1]:.2%}\")\n",
" print(f\" 保留方差: {self.pca_model.explained_variance_ratio_.sum():.4f}\")\n",
" print(f\" 模型保存: {pca_path}\")\n",
" \n",
" # 清理内存\n",
" del pca_features, X_pca_sample, X_scaled\n",
" gc.collect()\n",
" else:\n",
" raise ValueError(\"无法收集PCA拟合样本\")\n",
"\n",
"def _apply_undersampling_only(self, X, y):\n",
" \"\"\"\n",
" 仅应用下采样策略用于PCA拟合\n",
" \"\"\"\n",
" X_result = []\n",
" y_result = []\n",
" \n",
" np.random.seed(self.random_state)\n",
" \n",
" for label in range(41):\n",
" label_mask = (y == label)\n",
" X_label = X[label_mask]\n",
" y_label = y[label_mask]\n",
" current_count = len(y_label)\n",
" \n",
" if current_count == 0:\n",
" continue\n",
" \n",
" strategy = self.sampling_strategy[label]\n",
" \n",
" if strategy['action'] == 'undersample' and current_count > strategy['target_count']:\n",
" # 下采样\n",
" indices = np.random.choice(current_count, strategy['target_count'], replace=False)\n",
" X_resampled = X_label[indices]\n",
" y_resampled = y_label[indices]\n",
" else:\n",
" # 保持原样\n",
" X_resampled = X_label\n",
" y_resampled = y_label\n",
" \n",
" X_result.append(X_resampled)\n",
" y_result.append(y_resampled)\n",
" \n",
" if X_result:\n",
" return np.vstack(X_result), np.hstack(y_result)\n",
" else:\n",
" return np.array([]).reshape(0, X.shape[1]), np.array([])\n",
"\n",
"# 动态添加方法到类\n",
"SmartDataPipeline.step2_fit_pca_with_undersampling = step2_fit_pca_with_undersampling\n",
"SmartDataPipeline._apply_undersampling_only = _apply_undersampling_only\n",
"\n",
"print(\"✅ 步骤2方法已添加到管道\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"✅ 所有方法已添加到智能管道\n",
"\n",
"📋 智能数据处理管道状态:\n",
" 🔍 步骤1 - 分布分析: ❌ 未完成\n",
" 🔧 步骤2 - PCA拟合: ❌ 未完成\n",
"\n",
"🎯 使用流程:\n",
" 1. pipeline.step1_analyze_distribution()\n",
" 2. pipeline.step2_fit_pca_with_undersampling()\n",
" 3. pipeline.step3_process_data('train') # 训练集\n",
" pipeline.step3_process_data('val') # 验证集\n"
]
}
],
"source": [
"# 添加智能管道的剩余方法\n",
"\n",
"def _apply_full_sampling(self, X, y):\n",
" \"\"\"\n",
" 应用完整的采样策略(下采样+过采样)\n",
" \"\"\"\n",
" X_result = []\n",
" y_result = []\n",
" \n",
" np.random.seed(self.random_state)\n",
" \n",
" for label in range(41):\n",
" label_mask = (y == label)\n",
" X_label = X[label_mask]\n",
" y_label = y[label_mask]\n",
" current_count = len(y_label)\n",
" \n",
" if current_count == 0:\n",
" continue\n",
" \n",
" strategy = self.sampling_strategy[label]\n",
" target_count = strategy['target_count']\n",
" \n",
" if strategy['action'] == 'undersample' and current_count > target_count:\n",
" # 下采样\n",
" indices = np.random.choice(current_count, target_count, replace=False)\n",
" X_resampled = X_label[indices]\n",
" y_resampled = y_label[indices]\n",
" elif strategy['action'] == 'oversample' and current_count < target_count:\n",
" # 过采样\n",
" X_resampled, y_resampled = resample(\n",
" X_label, y_label, \n",
" n_samples=target_count, \n",
" random_state=self.random_state\n",
" )\n",
" else:\n",
" # 保持原样\n",
" X_resampled = X_label\n",
" y_resampled = y_label\n",
" \n",
" X_result.append(X_resampled)\n",
" y_result.append(y_resampled)\n",
" \n",
" if X_result:\n",
" return np.vstack(X_result), np.hstack(y_result)\n",
" else:\n",
" return np.array([]).reshape(0, X.shape[1]), np.array([])\n",
"\n",
"def _apply_pca_transform(self, X):\n",
" \"\"\"\n",
" 应用PCA变换\n",
" \"\"\"\n",
" if not self.pca_fitted:\n",
" return X\n",
" \n",
" X_scaled = self.pca_scaler.transform(X)\n",
" X_pca = self.pca_model.transform(X_scaled)\n",
" return X_pca\n",
"\n",
"def step3_process_data(self, data_type, apply_sampling=None):\n",
" \"\"\"\n",
" 步骤3: 处理数据(采样+PCA降维\n",
" \n",
" Args:\n",
" data_type: 'train', 'val', 'test'\n",
" apply_sampling: 是否应用采样策略None=训练集应用,验证/测试集不应用\n",
" \"\"\"\n",
" if not self.pca_fitted:\n",
" raise ValueError(\"请先执行步骤2: step2_fit_pca_with_undersampling()\")\n",
" \n",
" if apply_sampling is None:\n",
" apply_sampling = (data_type == 'train')\n",
" \n",
" print(f\"\\n🔄 步骤3: 处理{data_type}数据...\")\n",
" print(f\" 采样策略: {'启用' if apply_sampling else '禁用'}\")\n",
" \n",
" all_features = []\n",
" all_labels = []\n",
" \n",
" for trials_batch, filename in load_data_batch(self.data_dir, data_type, 3000):\n",
" features, labels = extract_features_labels_batch(trials_batch)\n",
" \n",
" # 应用采样策略\n",
" if apply_sampling:\n",
" features_sampled, labels_sampled = self._apply_full_sampling(features, labels)\n",
" else:\n",
" features_sampled, labels_sampled = features, labels\n",
" \n",
" # 应用PCA降维\n",
" if features_sampled.shape[0] > 0:\n",
" features_pca = self._apply_pca_transform(features_sampled)\n",
" all_features.append(features_pca)\n",
" all_labels.append(labels_sampled)\n",
" \n",
" if all_features:\n",
" X = np.vstack(all_features)\n",
" y = np.hstack(all_labels)\n",
" \n",
" # 随机打乱\n",
" shuffle_indices = np.random.permutation(len(y))\n",
" X = X[shuffle_indices]\n",
" y = y[shuffle_indices]\n",
" \n",
" print(f\" ✅ 处理完成: {X.shape[0]:,} 样本, {X.shape[1]} 特征\")\n",
" \n",
" # 清理内存\n",
" del all_features, all_labels\n",
" gc.collect()\n",
" \n",
" return X, y\n",
" else:\n",
" return None, None\n",
"\n",
"def print_summary(self):\n",
" \"\"\"\n",
" 打印管道状态总结\n",
" \"\"\"\n",
" print(\"\\n📋 智能数据处理管道状态:\")\n",
" print(f\" 🔍 步骤1 - 分布分析: {'✅ 完成' if self.distribution_analysis else '❌ 未完成'}\")\n",
" print(f\" 🔧 步骤2 - PCA拟合: {'✅ 完成' if self.pca_fitted else '❌ 未完成'}\")\n",
" \n",
" if self.distribution_analysis:\n",
" target_mean = self.distribution_analysis['target_mean']\n",
" print(f\" 📊 标签1-39均值: {target_mean:.0f}\")\n",
" \n",
" if self.pca_fitted:\n",
" print(f\" 🔬 PCA降维: 7168 → {self.pca_components} ({self.pca_components/7168:.1%})\")\n",
" print(f\" 📈 保留方差: {self.pca_model.explained_variance_ratio_.sum():.4f}\")\n",
" \n",
" print(f\"\\n🎯 使用流程:\")\n",
" print(f\" 1. pipeline.step1_analyze_distribution()\")\n",
" print(f\" 2. pipeline.step2_fit_pca_with_undersampling()\")\n",
" print(f\" 3. pipeline.step3_process_data('train') # 训练集\")\n",
" print(f\" pipeline.step3_process_data('val') # 验证集\")\n",
"\n",
"# 动态添加剩余方法到类\n",
"SmartDataPipeline._apply_full_sampling = _apply_full_sampling\n",
"SmartDataPipeline._apply_pca_transform = _apply_pca_transform\n",
"SmartDataPipeline.step3_process_data = step3_process_data\n",
"SmartDataPipeline.print_summary = print_summary\n",
"\n",
"print(\"✅ 所有方法已添加到智能管道\")\n",
"pipeline.print_summary()"
]
},
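{
"cell_type": "markdown",
"metadata": {},
"source": [
"The same entry point also covers the test split (the docstrings accept 'train'/'val'/'test'). A minimal sketch, assuming `*test*.npz` files exist in `data_dir`; sampling stays off so the original distribution is preserved, exactly as for validation. Left commented out since steps 1-2 have not run yet at this point."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: run the pipeline on the test split (no resampling, PCA only).\n",
"# Assumes test .npz files are present and steps 1-2 have been executed.\n",
"# X_test, y_test = pipeline.step3_process_data('test', apply_sampling=False)\n",
"# print(X_test.shape, y_test.shape)"
]
},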
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 🔥 执行智能数据处理管道"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"🚀 开始执行智能数据处理管道...\n",
"============================================================\n",
"\n",
"======================🔍 STEP 1: 分析数据分布======================\n",
"🔍 步骤1: 分析数据分布...\n",
" 正在加载文件 1/41: t15.2023.08.13_val_concatenated.npz\n",
" 正在加载文件 2/41: t15.2023.08.18_val_concatenated.npz\n",
" 正在加载文件 3/41: t15.2023.08.20_val_concatenated.npz\n",
" 正在加载文件 4/41: t15.2023.08.25_val_concatenated.npz\n",
" 正在加载文件 5/41: t15.2023.08.27_val_concatenated.npz\n",
" 正在加载文件 6/41: t15.2023.09.01_val_concatenated.npz\n",
" 正在加载文件 7/41: t15.2023.09.03_val_concatenated.npz\n",
" 正在加载文件 8/41: t15.2023.09.24_val_concatenated.npz\n",
" 正在加载文件 9/41: t15.2023.09.29_val_concatenated.npz\n",
" 正在加载文件 10/41: t15.2023.10.01_val_concatenated.npz\n",
" 正在加载文件 11/41: t15.2023.10.06_val_concatenated.npz\n",
" 正在加载文件 12/41: t15.2023.10.08_val_concatenated.npz\n",
" 正在加载文件 13/41: t15.2023.10.13_val_concatenated.npz\n",
" 正在加载文件 14/41: t15.2023.10.15_val_concatenated.npz\n",
" ✅ 分析完成: 108,742 样本\n",
" 📊 标签1-39均值: 455\n",
" 📉 下采样标签: [0, 40] → 455\n",
" 📈 过采样阈值: 0.5 × 均值 = 227\n",
"\n",
"📊 采样策略总结:\n",
" 📉 下采样标签: 2 个\n",
" 📈 过采样标签: 11 个\n",
" ✅ 保持不变: 28 个\n",
"\n",
"✅ 步骤1完成!\n"
]
}
],
"source": [
"# 🔥 执行智能数据处理管道【确定采样策略】\n",
"\n",
"print(\"🚀 开始执行智能数据处理管道...\")\n",
"print(\"=\" * 60)\n",
"\n",
"# 步骤1: 分析数据分布\n",
"print(\"\\n\" + \"🔍 STEP 1: 分析数据分布\".center(60, \"=\"))\n",
"distribution, strategy = pipeline.step1_analyze_distribution()\n",
"\n",
"# 显示采样策略总结\n",
"print(f\"\\n📊 采样策略总结:\")\n",
"undersample_count = sum(1 for s in strategy.values() if s['action'] == 'undersample')\n",
"oversample_count = sum(1 for s in strategy.values() if s['action'] == 'oversample')\n",
"keep_count = sum(1 for s in strategy.values() if s['action'] == 'keep')\n",
"\n",
"print(f\" 📉 下采样标签: {undersample_count} 个\")\n",
"print(f\" 📈 过采样标签: {oversample_count} 个\") \n",
"print(f\" ✅ 保持不变: {keep_count} 个\")\n",
"\n",
"print(\"\\n✅ 步骤1完成!\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=====================🔧 STEP 2: 拟合PCA参数======================\n",
"\n",
"🔧 步骤2: 拟合PCA参数仅下采样不过采样...\n",
" 📁 发现已存在的PCA模型文件: smart_pipeline_pca.joblib\n",
" ✅ PCA模型加载成功!\n",
" 降维: 7168 → 1219\n",
" 降维比例: 17.01%\n",
" 保留方差: 0.9491\n",
" 💡 跳过PCA拟合步骤使用已保存的模型\n",
"\n",
"✅ 步骤2完成!\n",
"\n",
"📋 智能数据处理管道状态:\n",
" 🔍 步骤1 - 分布分析: ✅ 完成\n",
" 🔧 步骤2 - PCA拟合: ✅ 完成\n",
" 📊 标签1-39均值: 455\n",
" 🔬 PCA降维: 7168 → 1219 (17.0%)\n",
" 📈 保留方差: 0.9491\n",
"\n",
"🎯 使用流程:\n",
" 1. pipeline.step1_analyze_distribution()\n",
" 2. pipeline.step2_fit_pca_with_undersampling()\n",
" 3. pipeline.step3_process_data('train') # 训练集\n",
" pipeline.step3_process_data('val') # 验证集\n"
]
}
],
"source": [
"# 步骤2: 拟合PCA参数【确定PCA策略】\n",
"print(\"\\n\" + \"🔧 STEP 2: 拟合PCA参数\".center(60, \"=\"))\n",
"pipeline.step2_fit_pca_with_undersampling()\n",
"\n",
"print(\"\\n✅ 步骤2完成!\")\n",
"pipeline.print_summary()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 🚀 使用智能管道进行分批训练"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"🚀 创建智能分批训练器...\n",
"🎯 智能分批训练器创建完成\n",
" 🔧 LightGBM参数已配置CPU模式\n",
" 💡 学习率调度: 带重启的余弦退火 (从 0.08 到 0.001)\n",
" 🔄 重启参数: T_0=50, T_mult=2\n",
"✅ 训练器创建完成,准备开始训练!\n"
]
}
],
"source": [
"# 🚀 使用智能管道进行分批训练\n",
"\n",
"import lightgbm as lgb\n",
"import time\n",
"from collections import Counter\n",
"import matplotlib.pyplot as plt\n",
"\n",
"class SmartBatchTrainer:\n",
" \"\"\"\n",
" 智能分批训练器,集成智能数据管道\n",
" \"\"\"\n",
" \n",
" def __init__(self, pipeline, params=None, min_learning_rate=1e-4, t_0=50, t_mult=2):\n",
" self.pipeline = pipeline\n",
" self.model = None\n",
" self.training_history = [] # 改为字典,因为只有一次训练\n",
" self.batch_count = 0\n",
" self.min_learning_rate = min_learning_rate\n",
" self.lr_history = [] # 用于可视化\n",
" \n",
" # 带重启的余弦退火参数\n",
" self.t_0 = t_0 # 第一个重启周期的长度\n",
" self.t_mult = t_mult # 重启周期的乘数\n",
" \n",
" # 默认LightGBM参数GPU优化\n",
" self.params = params or {\n",
" 'objective': 'multiclass',\n",
" 'num_class': 41,\n",
" 'metric': 'multi_logloss',\n",
" 'boosting_type': 'gbdt',\n",
" 'device_type': 'cpu',\n",
" # 'gpu_platform_id': 0,\n",
" # 'gpu_device_id': 0,\n",
" 'max_bin': 255,\n",
" 'num_leaves': 127,\n",
" 'learning_rate': 0.08, #默认0.08\n",
" 'feature_fraction': 0.8,\n",
" 'bagging_fraction': 0.8,\n",
" 'bagging_freq': 5,\n",
" 'min_data_in_leaf': 20,\n",
" 'lambda_l1': 0.1,\n",
" 'lambda_l2': 0.1,\n",
" 'verbose': -1,\n",
" 'num_threads': -1\n",
" }\n",
" \n",
" self.initial_learning_rate = self.params.get('learning_rate', 0.08)\n",
" \n",
" print(f\"🎯 智能分批训练器创建完成\")\n",
" print(f\" 🔧 LightGBM参数已配置{self.params['device_type'].upper()}模式\")\n",
" print(f\" 💡 学习率调度: 带重启的余弦退火 (从 {self.initial_learning_rate} 到 {self.min_learning_rate})\")\n",
" print(f\" 🔄 重启参数: T_0={self.t_0}, T_mult={self.t_mult}\")\n",
" \n",
" def prepare_validation_data(self):\n",
" \"\"\"\n",
" 准备验证数据仅PCA保持原始分布\n",
" \"\"\"\n",
" print(\"🔄 准备验证数据...\")\n",
" self.X_val, self.y_val = self.pipeline.step3_process_data('val', apply_sampling=False)\n",
" if self.X_val is None:\n",
" raise ValueError(\"无法加载验证数据\")\n",
" val_counts = Counter(self.y_val)\n",
" print(f\" ✅ 验证数据准备完成: {self.X_val.shape[0]:,} 样本\")\n",
" print(f\" 📊 验证集分布 (标签0: {val_counts.get(0, 0):,}, 标签40: {val_counts.get(40, 0):,})\")\n",
"\n",
" return lgb.Dataset(self.X_val, label=self.y_val, free_raw_data=False)\n",
"\n",
" def get_training_batch_generator(self):\n",
" \"\"\"\n",
" 获取训练批次生成器(平衡采样+PCA\n",
" \"\"\"\n",
" print(\"🔄 准备训练批次生成器...\")\n",
" \n",
" # 使用管道的批次生成器\n",
" for trials_batch, filename in load_data_batch(self.pipeline.data_dir, 'train', 2000):\n",
" features, labels = extract_features_labels_batch(trials_batch)\n",
" \n",
" # 应用完整采样策略\n",
" features_sampled, labels_sampled = self.pipeline._apply_full_sampling(features, labels)\n",
" \n",
" # 应用PCA降维\n",
" if features_sampled.shape[0] > 0:\n",
" features_pca = self.pipeline._apply_pca_transform(features_sampled)\n",
" \n",
" # 分析当前批次分布\n",
" batch_counts = Counter(labels_sampled)\n",
" \n",
" print(f\" 📦 批次: {filename}\")\n",
" print(f\" 样本数: {features_pca.shape[0]:,}\")\n",
" print(f\" 平衡后分布: 标签0={batch_counts.get(0,0)}, 标签40={batch_counts.get(40,0)}\")\n",
" \n",
" yield lgb.Dataset(features_pca, label=labels_sampled), filename\n",
" \n",
" def prepare_full_data(self):\n",
" \"\"\"\n",
" 一次性准备所有训练和验证数据\n",
" \"\"\"\n",
" print(\"🔄 准备全量训练和验证数据...\")\n",
" \n",
" # 1. 准备验证数据 (保持原始分布)\n",
" X_val, y_val = self.pipeline.step3_process_data('val', apply_sampling=False)\n",
" if X_val is None:\n",
" raise ValueError(\"无法加载验证数据\")\n",
" val_counts = Counter(y_val)\n",
" print(f\" ✅ 验证数据准备完成: {X_val.shape[0]:,} 样本\")\n",
" print(f\" 📊 验证集分布 (标签0: {val_counts.get(0, 0):,}, 标签40: {val_counts.get(40, 0):,})\")\n",
" val_data = lgb.Dataset(X_val, label=y_val, free_raw_data=False)\n",
" \n",
" # 2. 准备训练数据 (应用完整采样和PCA策略)\n",
" X_train, y_train = self.pipeline.step3_process_data('train', apply_sampling=True)\n",
" if X_train is None:\n",
" raise ValueError(\"无法加载训练数据\")\n",
" train_counts = Counter(y_train)\n",
" print(f\" ✅ 训练数据准备完成: {X_train.shape[0]:,} 样本, {X_train.shape[1]} 特征\")\n",
" print(f\" 📊 训练集(采样后)分布 (标签0: {train_counts.get(0, 0):,}, 标签40: {train_counts.get(40, 0):,})\")\n",
" train_data = lgb.Dataset(X_train, label=y_train)\n",
" \n",
" return train_data, val_data, X_val, y_val\n",
" \n",
" def prepare_training_data(self):\n",
" \"\"\"\n",
" 准备训练数据仅PCA保持原始分布\n",
" \"\"\"\n",
" print(\"🔄 准备训练数据...\")\n",
" # 2. 准备训练数据 (应用完整采样和PCA策略)\n",
" X_train, y_train = self.pipeline.step3_process_data('train', apply_sampling=True)\n",
" if X_train is None:\n",
" raise ValueError(\"无法加载训练数据\")\n",
" train_counts = Counter(y_train)\n",
" print(f\" ✅ 训练数据准备完成: {X_train.shape[0]:,} 样本, {X_train.shape[1]} 特征\")\n",
" print(f\" 📊 训练集(采样后)分布 (标签0: {train_counts.get(0, 0):,}, 标签40: {train_counts.get(40, 0):,})\")\n",
" \n",
" return lgb.Dataset(X_train, label=y_train, free_raw_data=False)\n",
" \n",
" # 带重启的余弦退火调度器函数\n",
" def _cosine_annealing_with_warm_restarts(self, current_round):\n",
" \"\"\"\n",
" 带重启的余弦退火调度器 (SGDR)\n",
" \n",
" Args:\n",
" current_round: 当前训练轮数\n",
" \n",
" Returns:\n",
" 学习率\n",
" \"\"\"\n",
" eta_max = self.initial_learning_rate\n",
" eta_min = self.min_learning_rate\n",
" \n",
" # 计算当前在哪个重启周期中\n",
" t_cur = current_round\n",
" t_i = self.t_0\n",
" \n",
" # 找到当前的重启周期\n",
" cycle = 0\n",
" while t_cur >= t_i:\n",
" t_cur -= t_i\n",
" cycle += 1\n",
" t_i *= self.t_mult\n",
" \n",
" # 在当前周期内的位置\n",
" progress = t_cur / t_i\n",
" \n",
" # 计算学习率\n",
" lr = eta_min + 0.5 * (eta_max - eta_min) * (1 + np.cos(np.pi * progress))\n",
" \n",
" return lr\n",
" \n",
" def train_incremental(self, num_boost_round=100, early_stopping_rounds=10):\n",
" \"\"\"\n",
" 增量分批训练\n",
" \"\"\"\n",
" print(f\"\\n🚀 开始智能分批训练...\")\n",
" print(f\" 📝 训练轮数 (每批次): {num_boost_round}\")\n",
" print(f\" ⏹️ 早停轮数: {early_stopping_rounds}\")\n",
" print(\"=\" * 60)\n",
" \n",
" # 准备验证数据\n",
" val_data = self.prepare_validation_data()\n",
" \n",
" print(f\"\\n🔄 开始分批增量训练...\")\n",
" total_start_time = time.time()\n",
" \n",
" # ⭐️ 新增: 为学习率调度器定义T_max\n",
" # 我们将每个批次的训练视为一个完整的退火周期\n",
" t_max_per_batch = num_boost_round\n",
" \n",
" for train_data, filename in self.get_training_batch_generator():\n",
" self.batch_count += 1\n",
" batch_start_time = time.time()\n",
" self.last_batch_lr_history = [] # 重置每个批次的LR历史\n",
" \n",
" print(f\"\\n📈 批次 {self.batch_count}: {filename}\")\n",
" \n",
" # ⭐️ 新增: 创建学习率调度回调 和 记录回调\n",
" lr_scheduler_callback = lgb.reset_parameter(\n",
" learning_rate=lambda current_round: self._cosine_annealing_with_warm_restarts(current_round)\n",
" )\n",
"\n",
" # 这个简单的回调用于记录每个周期的学习率,以便后续可视化\n",
" def record_lr_callback(env):\n",
" self.last_batch_lr_history.append(env.model.params['learning_rate'])\n",
"\n",
" # 组合所有回调\n",
" training_callbacks = [\n",
" lgb.early_stopping(stopping_rounds=early_stopping_rounds, verbose=True),\n",
" lgb.log_evaluation(period=10), # 每10轮打印一次\n",
" lr_scheduler_callback,\n",
" record_lr_callback\n",
" ]\n",
"\n",
" # 训练当前批次\n",
" current_model_args = {\n",
" 'params': self.params,\n",
" 'train_set': train_data,\n",
" 'num_boost_round': num_boost_round,\n",
" 'valid_sets': [val_data],\n",
" 'valid_names': ['validation'],\n",
" 'callbacks': training_callbacks\n",
" }\n",
" \n",
" if self.model is None:\n",
" print(\" 🎯 初始模型训练...\")\n",
" self.model = lgb.train(**current_model_args)\n",
" else:\n",
" print(\" ⚡ 增量训练...\")\n",
" current_model_args['init_model'] = self.model\n",
" self.model = lgb.train(**current_model_args)\n",
"\n",
" # 记录训练历史\n",
" batch_time = time.time() - batch_start_time\n",
" \n",
" # 评估当前模型\n",
" val_pred = self.model.predict(self.X_val)\n",
" val_accuracy = (val_pred.argmax(axis=1) == self.y_val).mean()\n",
" \n",
" batch_info = {\n",
" 'batch': self.batch_count,\n",
" 'filename': filename,\n",
" 'time': batch_time,\n",
" 'val_accuracy': val_accuracy,\n",
" 'num_trees': self.model.num_trees(),\n",
" 'lr_history': self.last_batch_lr_history.copy() # 保存当前批次的LR历史\n",
" }\n",
" \n",
" self.training_history.append(batch_info)\n",
" \n",
" print(f\" ✅ 批次完成: {batch_time:.1f}秒\")\n",
" print(f\" 📊 验证准确率: {val_accuracy:.4f}\")\n",
" print(f\" 🌳 模型树数: {self.model.num_trees()}\")\n",
" \n",
" model_path = f\"smart_batch_model_batch_{self.batch_count}.txt\"\n",
" self.model.save_model(model_path)\n",
" print(f\" 💾 模型已保存: {model_path}\")\n",
" \n",
" total_time = time.time() - total_start_time\n",
" print(f\"\\n🎉 智能分批训练完成!\")\n",
" print(f\" ⏱️ 总训练时间: {total_time:.1f}秒\")\n",
" print(f\" 📊 处理批次数: {self.batch_count}\")\n",
" print(f\" 🌳 最终模型树数: {self.model.num_trees()}\")\n",
" \n",
" return self.model\n",
" \n",
" def train(self, num_boost_round=1000, early_stopping_rounds=50):\n",
" \"\"\"\n",
" 执行一次性全量训练\n",
" \"\"\"\n",
" print(f\"\\n🚀 开始全量数据训练...\")\n",
" print(f\" 📝 训练轮数: {num_boost_round}\")\n",
" print(f\" ⏹️ 早停轮数: {early_stopping_rounds}\")\n",
" print(\"=\" * 60)\n",
" \n",
" # 准备数据\n",
" train_data, val_data, X_val, y_val = self.prepare_full_data()\n",
" \n",
" start_time = time.time()\n",
" \n",
" # 定义学习率调度和记录回调\n",
" lr_scheduler_callback = lgb.reset_parameter(\n",
" learning_rate=lambda current_round: self._cosine_annealing_with_warm_restarts(current_round)\n",
" )\n",
" def record_lr_callback(env):\n",
" self.lr_history.append(env.model.params['learning_rate'])\n",
" \n",
" training_callbacks = [\n",
" lgb.early_stopping(stopping_rounds=early_stopping_rounds, verbose=True),\n",
" lgb.log_evaluation(period=1), # 每100轮打印日志\n",
" lr_scheduler_callback,\n",
" record_lr_callback\n",
" ]\n",
" \n",
" # 训练模型\n",
" print(\"\\n📈 开始模型训练...\")\n",
" self.model = lgb.train(\n",
" params=self.params,\n",
" train_set=train_data,\n",
" num_boost_round=num_boost_round,\n",
" valid_sets=[val_data],\n",
" valid_names=['validation'],\n",
" callbacks=training_callbacks\n",
" )\n",
" \n",
" training_time = time.time() - start_time\n",
" \n",
" # 评估模型\n",
" val_pred = self.model.predict(X_val)\n",
" val_accuracy = (val_pred.argmax(axis=1) == y_val).mean()\n",
" \n",
" # 记录训练历史\n",
" self.training_history = {\n",
" 'time': training_time,\n",
" 'val_accuracy': val_accuracy,\n",
" 'num_trees': self.model.num_trees(),\n",
" 'lr_history': self.lr_history,\n",
" 'best_iteration': self.model.best_iteration\n",
" }\n",
" \n",
" print(f\"\\n🎉 全量数据训练完成!\")\n",
" print(f\" ⏱️ 总训练时间: {training_time:.1f}秒\")\n",
" print(f\" 🌳 最终模型树数: {self.model.num_trees()} (最佳轮次: {self.model.best_iteration})\")\n",
" print(f\" 🎯 最终验证准确率: {val_accuracy:.4f}\")\n",
" \n",
" # 保存模型\n",
" model_path = \"full_train_model.txt\"\n",
" self.model.save_model(model_path)\n",
" print(f\" 💾 模型已保存: {model_path}\")\n",
" \n",
" return self.model\n",
" \n",
" def plot_training_progress(self):\n",
" \"\"\"\n",
" 绘制训练进度\n",
" \"\"\"\n",
" if not self.training_history:\n",
" print(\"❌ 没有训练历史记录\")\n",
" return\n",
" \n",
" # ⭐️ 修改: 增加学习率的可视化图表\n",
" fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2, figsize=(15, 15))\n",
" \n",
" batches = [h['batch'] for h in self.training_history]\n",
" accuracies = [h['val_accuracy'] for h in self.training_history]\n",
" times = [h['time'] for h in self.training_history]\n",
" trees = [h['num_trees'] for h in self.training_history]\n",
" \n",
" # 1. 验证准确率\n",
" ax1.plot(batches, accuracies, 'b-o', linewidth=2, markersize=6)\n",
" ax1.set_xlabel('Training Batch')\n",
" ax1.set_ylabel('Validation Accuracy')\n",
" ax1.set_title('Validation Accuracy Progress')\n",
" ax1.grid(True, alpha=0.3)\n",
" ax1.set_ylim(0, 1)\n",
" \n",
" # 2. 批次训练时间\n",
" ax2.bar(batches, times, color='green', alpha=0.7)\n",
" ax2.set_xlabel('Training Batch')\n",
" ax2.set_ylabel('Training Time (seconds)')\n",
" ax2.set_title('Training Time per Batch')\n",
" ax2.grid(True, alpha=0.3)\n",
" \n",
" # 3. 模型树数增长\n",
" ax3.plot(batches, trees, 'r-s', linewidth=2, markersize=6)\n",
" ax3.set_xlabel('Training Batch')\n",
" ax3.set_ylabel('Number of Trees')\n",
" ax3.set_title('Model Complexity Growth')\n",
" ax3.grid(True, alpha=0.3)\n",
" \n",
" # 4. 累计准确率提升\n",
" ax4.plot(batches, [acc - accuracies[0] for acc in accuracies], 'purple', linewidth=2, marker='D')\n",
" ax4.set_xlabel('Training Batch')\n",
" ax4.set_ylabel('Accuracy Improvement')\n",
" ax4.set_title('Cumulative Accuracy Improvement')\n",
" ax4.grid(True, alpha=0.3)\n",
" ax4.axhline(y=0, color='black', linestyle='--', alpha=0.5)\n",
"\n",
" # ⭐️ 新增: 5. 最后一个批次的学习率曲线\n",
" last_lr_history = self.training_history[-1]['lr_history']\n",
" ax5.plot(range(len(last_lr_history)), last_lr_history, color='orange', marker='.')\n",
" ax5.set_xlabel('Boosting Round in Last Batch')\n",
" ax5.set_ylabel('Learning Rate')\n",
" ax5.set_title(f'Cosine Annealing LR in Last Batch (Batch {batches[-1]})')\n",
" ax5.grid(True, alpha=0.3)\n",
" \n",
" # 隐藏第六个子图\n",
" ax6.axis('off')\n",
"\n",
" plt.tight_layout()\n",
" plt.show()\n",
" \n",
" # 打印统计信息\n",
" print(f\"\\n📈 训练进度统计:\")\n",
" print(f\" 🎯 初始准确率: {accuracies[0]:.4f}\")\n",
" print(f\" 🎯 最终准确率: {accuracies[-1]:.4f}\")\n",
" print(f\" 📈 准确率提升: {accuracies[-1] - accuracies[0]:.4f}\")\n",
" print(f\" ⏱️ 平均批次时间: {np.mean(times):.1f}秒\")\n",
" print(f\" 🌳 最终模型树数: {trees[-1]}\")\n",
"\n",
"\n",
"print(\"🚀 创建智能分批训练器...\")\n",
"# 实例化时可以传入最小学习率\n",
"trainer = SmartBatchTrainer(pipeline, min_learning_rate=0.001) \n",
"print(\"✅ 训练器创建完成,准备开始训练!\")"
]
},
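{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the scheduler implements the SGDR rule $\\eta_t = \\eta_{\\min} + \\tfrac{1}{2}(\\eta_{\\max} - \\eta_{\\min})\\,\\bigl(1 + \\cos(\\pi \\, T_{cur}/T_i)\\bigr)$, with cycle lengths $T_i = T_0 \\cdot T_{mult}^{\\,i}$. The cell below (added as a sanity check) plots the schedule over 350 rounds without training anything; with T_0=50 and T_mult=2 the restarts land at rounds 50 and 150."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check: plot the warm-restart cosine schedule without any training\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n",
"rounds = np.arange(350)\n",
"lrs = [trainer._cosine_annealing_with_warm_restarts(r) for r in rounds]\n",
"\n",
"plt.figure(figsize=(8, 3))\n",
"plt.plot(rounds, lrs)\n",
"plt.xlabel('Boosting round')\n",
"plt.ylabel('Learning rate')\n",
"plt.title('SGDR schedule: T_0=50, T_mult=2 (restarts at rounds 50 and 150)')\n",
"plt.grid(alpha=0.3)\n",
"plt.show()"
]
},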
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"📁 找到模型文件: smart_batch_model_batch_1.txt\n",
"✅ 模型加载成功!\n",
" 🌳 模型树数: 27593\n",
" 🔧 模型类型: LightGBM Booster\n",
" 📂 来源文件: smart_batch_model_batch_1.txt\n",
" 💡 提示: 运行验证评估需要先准备验证数据\n",
"\n",
"🎯 训练器状态更新:\n",
" ✅ 模型: 已加载 (27593 棵树)\n",
" 📊 管道: 已配置\n",
" 🔬 PCA: 已拟合\n",
"\n",
"💡 现在可以运行:\n",
" - 模型性能评估: evaluate_model_performance(trainer.model, pipeline, 'val')\n",
" - 预测新数据\n",
" - 继续训练 (如果需要)\n",
"✅ 模型加载成功!\n",
" 🌳 模型树数: 27593\n",
" 🔧 模型类型: LightGBM Booster\n",
" 📂 来源文件: smart_batch_model_batch_1.txt\n",
" 💡 提示: 运行验证评估需要先准备验证数据\n",
"\n",
"🎯 训练器状态更新:\n",
" ✅ 模型: 已加载 (27593 棵树)\n",
" 📊 管道: 已配置\n",
" 🔬 PCA: 已拟合\n",
"\n",
"💡 现在可以运行:\n",
" - 模型性能评估: evaluate_model_performance(trainer.model, pipeline, 'val')\n",
" - 预测新数据\n",
" - 继续训练 (如果需要)\n"
]
}
],
"source": [
"# 🔄 从文件加载已训练的模型\n",
"import lightgbm as lgb\n",
"import os\n",
"\n",
"model_path = \"smart_batch_model_batch_1.txt\"\n",
"\n",
"if os.path.exists(model_path):\n",
" print(f\"📁 找到模型文件: {model_path}\")\n",
" try:\n",
" # 加载LightGBM模型\n",
" trainer.model = lgb.Booster(model_file=model_path)\n",
" \n",
" print(f\"✅ 模型加载成功!\")\n",
" print(f\" 🌳 模型树数: {trainer.model.num_trees()}\")\n",
" print(f\" 🔧 模型类型: LightGBM Booster\")\n",
" print(f\" 📂 来源文件: {model_path}\")\n",
" \n",
" # 验证模型是否可用\n",
" if hasattr(trainer, 'X_val') and trainer.X_val is not None:\n",
" # 如果验证数据已准备,进行快速测试\n",
" test_pred = trainer.model.predict(trainer.X_val[:100]) # 测试前100个样本\n",
" print(f\" 🧪 模型测试: 预测形状 {test_pred.shape} (100样本 × 41类别)\")\n",
" else:\n",
" print(f\" 💡 提示: 运行验证评估需要先准备验证数据\")\n",
" \n",
" except Exception as e:\n",
" print(f\"❌ 模型加载失败: {e}\")\n",
" print(f\" 请检查文件是否完整或格式是否正确\")\n",
" trainer.model = None\n",
" \n",
"else:\n",
" print(f\"❌ 未找到模型文件: {model_path}\")\n",
" print(f\" 请确认文件路径是否正确\")\n",
" trainer.model = None\n",
"\n",
"# 显示当前训练器状态\n",
"if trainer.model is not None:\n",
" print(f\"\\n🎯 训练器状态更新:\")\n",
" print(f\" ✅ 模型: 已加载 ({trainer.model.num_trees()} 棵树)\")\n",
" print(f\" 📊 管道: {'已配置' if pipeline.pca_fitted else '需配置'}\")\n",
" print(f\" 🔬 PCA: {'已拟合' if pipeline.pca_fitted else '需拟合'}\")\n",
" \n",
" if pipeline.pca_fitted:\n",
" print(f\"\\n💡 现在可以运行:\")\n",
" print(f\" - 模型性能评估: evaluate_model_performance(trainer.model, pipeline, 'val')\")\n",
" print(f\" - 预测新数据\")\n",
" print(f\" - 继续训练 (如果需要)\")\n",
" else:\n",
" print(f\"\\n⚠ 注意: 需要先完成PCA配置才能使用模型进行预测\")\n",
"else:\n",
" print(f\"\\n❌ 模型加载失败trainer.model = None\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# # 全量训练\n",
"\n",
"# print(\"🔥 开始智能分批训练!\")\n",
"# print(\"=\" * 80)\n",
"\n",
"# # 训练参数\n",
"# TRAINING_PARAMS = {\n",
"# 'num_boost_round': 500, # 每批次的提升轮数\n",
"# 'early_stopping_rounds': 15 # 早停轮数\n",
"# }\n",
"\n",
"# print(f\"📝 训练配置:\")\n",
"# print(f\" 训练轮数: {TRAINING_PARAMS['num_boost_round']}\")\n",
"# print(f\" 早停轮数: {TRAINING_PARAMS['early_stopping_rounds']}\")\n",
"# print(f\" 数据平衡: 启用下采样标签0,40 + 过采样少数类)\")\n",
"# print(f\" PCA降维: 7168 → {pipeline.pca_components} 特征\")\n",
"\n",
"# print(f\"\\n🚀 启动训练...\")\n",
"\n",
"# # 开始训练\n",
"# model = trainer.train(\n",
"# num_boost_round=TRAINING_PARAMS['num_boost_round'],\n",
"# early_stopping_rounds=TRAINING_PARAMS['early_stopping_rounds']\n",
"# )"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"🔥 开始智能分批训练!\n",
"================================================================================\n",
"📝 训练配置:\n",
" 训练轮数: 300\n",
" 早停轮数: 15\n",
" 数据平衡: 启用下采样标签0,40 + 过采样少数类)\n",
" PCA降维: 7168 → 1219 特征\n",
"\n",
"🚀 启动训练...\n",
"\n",
"🚀 开始智能分批训练...\n",
" 📝 训练轮数 (每批次): 300\n",
" ⏹️ 早停轮数: 15\n",
"============================================================\n",
"🔄 准备验证数据...\n",
"\n",
"🔄 步骤3: 处理val数据...\n",
" 采样策略: 禁用\n",
" 正在加载文件 1/41: t15.2023.08.13_val_concatenated.npz\n",
" 正在加载文件 2/41: t15.2023.08.18_val_concatenated.npz\n",
" 正在加载文件 3/41: t15.2023.08.20_val_concatenated.npz\n",
" 正在加载文件 4/41: t15.2023.08.25_val_concatenated.npz\n",
" 正在加载文件 5/41: t15.2023.08.27_val_concatenated.npz\n",
" 正在加载文件 6/41: t15.2023.09.01_val_concatenated.npz\n",
" 正在加载文件 7/41: t15.2023.09.03_val_concatenated.npz\n",
" 正在加载文件 8/41: t15.2023.09.24_val_concatenated.npz\n",
" 正在加载文件 9/41: t15.2023.09.29_val_concatenated.npz\n",
" 正在加载文件 10/41: t15.2023.10.01_val_concatenated.npz\n",
" 正在加载文件 11/41: t15.2023.10.06_val_concatenated.npz\n",
" 正在加载文件 12/41: t15.2023.10.08_val_concatenated.npz\n",
" 正在加载文件 13/41: t15.2023.10.13_val_concatenated.npz\n",
" 正在加载文件 14/41: t15.2023.10.15_val_concatenated.npz\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[13], line 19\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m🚀 启动训练...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 18\u001b[0m \u001b[38;5;66;03m# 开始训练\u001b[39;00m\n\u001b[1;32m---> 19\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain_incremental\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 20\u001b[0m \u001b[43m \u001b[49m\u001b[43mnum_boost_round\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mTRAINING_PARAMS\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mnum_boost_round\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 21\u001b[0m \u001b[43m \u001b[49m\u001b[43mearly_stopping_rounds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mTRAINING_PARAMS\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mearly_stopping_rounds\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[0;32m 22\u001b[0m \u001b[43m)\u001b[49m\n",
"Cell \u001b[1;32mIn[12], line 178\u001b[0m, in \u001b[0;36mSmartBatchTrainer.train_incremental\u001b[1;34m(self, num_boost_round, early_stopping_rounds)\u001b[0m\n\u001b[0;32m 175\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m=\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m*\u001b[39m \u001b[38;5;241m60\u001b[39m)\n\u001b[0;32m 177\u001b[0m \u001b[38;5;66;03m# 准备验证数据\u001b[39;00m\n\u001b[1;32m--> 178\u001b[0m val_data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprepare_validation_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 180\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m🔄 开始分批增量训练...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 181\u001b[0m total_start_time \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n",
"Cell \u001b[1;32mIn[12], line 59\u001b[0m, in \u001b[0;36mSmartBatchTrainer.prepare_validation_data\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 55\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 56\u001b[0m \u001b[38;5;124;03m准备验证数据仅PCA保持原始分布\u001b[39;00m\n\u001b[0;32m 57\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 58\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m🔄 准备验证数据...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m---> 59\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mX_val, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39my_val \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpipeline\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstep3_process_data\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mval\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mapply_sampling\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[0;32m 60\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mX_val \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 61\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m无法加载验证数据\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
"Cell \u001b[1;32mIn[9], line 80\u001b[0m, in \u001b[0;36mstep3_process_data\u001b[1;34m(self, data_type, apply_sampling)\u001b[0m\n\u001b[0;32m 77\u001b[0m all_features \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m 78\u001b[0m all_labels \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m---> 80\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m trials_batch, filename \u001b[38;5;129;01min\u001b[39;00m load_data_batch(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdata_dir, data_type, \u001b[38;5;241m3000\u001b[39m):\n\u001b[0;32m 81\u001b[0m features, labels \u001b[38;5;241m=\u001b[39m extract_features_labels_batch(trials_batch)\n\u001b[0;32m 83\u001b[0m \u001b[38;5;66;03m# 应用采样策略\u001b[39;00m\n",
"Cell \u001b[1;32mIn[5], line 30\u001b[0m, in \u001b[0;36mload_data_batch\u001b[1;34m(data_dir, data_type, max_samples_per_file)\u001b[0m\n\u001b[0;32m 27\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m 正在加载文件 \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile_idx\u001b[38;5;241m+\u001b[39m\u001b[38;5;241m1\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(files)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mf\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 29\u001b[0m data \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mload(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(data_dir, f), allow_pickle\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m---> 30\u001b[0m trials \u001b[38;5;241m=\u001b[39m \u001b[43mdata\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mneural_logits_concatenated\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[0;32m 32\u001b[0m \u001b[38;5;66;03m# 限制每个文件的样本数\u001b[39;00m\n\u001b[0;32m 33\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(trials) \u001b[38;5;241m>\u001b[39m max_samples_per_file \u001b[38;5;129;01mand\u001b[39;00m max_samples_per_file \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m:\n",
"File \u001b[1;32md:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\numpy\\lib\\_npyio_impl.py:258\u001b[0m, in \u001b[0;36mNpzFile.__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 256\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m magic \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mformat\u001b[39m\u001b[38;5;241m.\u001b[39mMAGIC_PREFIX:\n\u001b[0;32m 257\u001b[0m \u001b[38;5;28mbytes\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mzip\u001b[38;5;241m.\u001b[39mopen(key)\n\u001b[1;32m--> 258\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mformat\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_array\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mbytes\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 259\u001b[0m \u001b[43m \u001b[49m\u001b[43mallow_pickle\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mallow_pickle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 260\u001b[0m \u001b[43m \u001b[49m\u001b[43mpickle_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpickle_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 261\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_header_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_header_size\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 262\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 263\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mzip\u001b[38;5;241m.\u001b[39mread(key)\n",
"File \u001b[1;32md:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\numpy\\lib\\format.py:827\u001b[0m, in \u001b[0;36mread_array\u001b[1;34m(fp, allow_pickle, pickle_kwargs, max_header_size)\u001b[0m\n\u001b[0;32m 825\u001b[0m pickle_kwargs \u001b[38;5;241m=\u001b[39m {}\n\u001b[0;32m 826\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 827\u001b[0m array \u001b[38;5;241m=\u001b[39m pickle\u001b[38;5;241m.\u001b[39mload(fp, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mpickle_kwargs)\n\u001b[0;32m 828\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mUnicodeError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[0;32m 829\u001b[0m \u001b[38;5;66;03m# Friendlier error message\u001b[39;00m\n\u001b[0;32m 830\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mUnicodeError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnpickling a python object failed: \u001b[39m\u001b[38;5;132;01m%r\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 831\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYou may need to pass the encoding= option \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 832\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mto numpy.load\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m (err,)) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01merr\u001b[39;00m\n",
"File \u001b[1;32md:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\zipfile.py:890\u001b[0m, in \u001b[0;36mZipExtFile.peek\u001b[1;34m(self, n)\u001b[0m\n\u001b[0;32m 888\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Returns buffered bytes without advancing the position.\"\"\"\u001b[39;00m\n\u001b[0;32m 889\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m n \u001b[38;5;241m>\u001b[39m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_readbuffer) \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_offset:\n\u001b[1;32m--> 890\u001b[0m chunk \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mn\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 891\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(chunk) \u001b[38;5;241m>\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_offset:\n\u001b[0;32m 892\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_readbuffer \u001b[38;5;241m=\u001b[39m chunk \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_readbuffer[\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_offset:]\n",
"File \u001b[1;32md:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\zipfile.py:930\u001b[0m, in \u001b[0;36mZipExtFile.read\u001b[1;34m(self, n)\u001b[0m\n\u001b[0;32m 928\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_offset \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[0;32m 929\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m n \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_eof:\n\u001b[1;32m--> 930\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_read1\u001b[49m\u001b[43m(\u001b[49m\u001b[43mn\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 931\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m n \u001b[38;5;241m<\u001b[39m \u001b[38;5;28mlen\u001b[39m(data):\n\u001b[0;32m 932\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_readbuffer \u001b[38;5;241m=\u001b[39m data\n",
"File \u001b[1;32md:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\zipfile.py:1006\u001b[0m, in \u001b[0;36mZipExtFile._read1\u001b[1;34m(self, n)\u001b[0m\n\u001b[0;32m 1004\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compress_type \u001b[38;5;241m==\u001b[39m ZIP_DEFLATED:\n\u001b[0;32m 1005\u001b[0m n \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mmax\u001b[39m(n, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mMIN_READ_SIZE)\n\u001b[1;32m-> 1006\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_decompressor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecompress\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mn\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1007\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_eof \u001b[38;5;241m=\u001b[39m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_decompressor\u001b[38;5;241m.\u001b[39meof \u001b[38;5;129;01mor\u001b[39;00m\n\u001b[0;32m 1008\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compress_left \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m\n\u001b[0;32m 1009\u001b[0m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_decompressor\u001b[38;5;241m.\u001b[39munconsumed_tail)\n\u001b[0;32m 1010\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_eof:\n",
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"print(\"🔥 开始智能分批训练!\")\n",
"print(\"=\" * 80)\n",
"\n",
"# 训练参数\n",
"TRAINING_PARAMS = {\n",
" 'num_boost_round': 300, # 每批次的提升轮数\n",
" 'early_stopping_rounds': 15 # 早停轮数\n",
"}\n",
"\n",
"print(f\"📝 训练配置:\")\n",
"print(f\" 训练轮数: {TRAINING_PARAMS['num_boost_round']}\")\n",
"print(f\" 早停轮数: {TRAINING_PARAMS['early_stopping_rounds']}\")\n",
"print(f\" 数据平衡: 启用下采样标签0,40 + 过采样少数类)\")\n",
"print(f\" PCA降维: 7168 → {pipeline.pca_components} 特征\")\n",
"\n",
"print(f\"\\n🚀 启动训练...\")\n",
"\n",
"# 开始训练\n",
"model = trainer.train_incremental(\n",
" num_boost_round=TRAINING_PARAMS['num_boost_round'],\n",
" early_stopping_rounds=TRAINING_PARAMS['early_stopping_rounds']\n",
")"
]
},
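  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The interrupted run above stalled inside `np.load` while decompressing a large pickled array. Below is a minimal sketch of a guarded per-file loader, assuming the `*_concatenated.npz` files store an object array under the key `neural_logits_concatenated` (the key list is confirmed later in this notebook); the file-name pattern and the `max_samples_per_file` cap mirror the `load_data_batch` call in the traceback and are otherwise assumptions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch only: guarded per-file loading (key name and file-name pattern are\n",
    "# assumptions matching the structure inspected later in this notebook).\n",
    "import os\n",
    "import numpy as np\n",
    "\n",
    "def load_npz_trials(data_dir, data_type='train', max_samples_per_file=-1):\n",
    "    \"\"\"Yield per-trial logit arrays from *_concatenated.npz files.\"\"\"\n",
    "    files = sorted(f for f in os.listdir(data_dir)\n",
    "                   if f.endswith(f'_{data_type}_concatenated.npz'))\n",
    "    for file_idx, fname in enumerate(files):\n",
    "        print(f'  loading file {file_idx + 1}/{len(files)}: {fname}')\n",
    "        # allow_pickle=True is required because the stored arrays are object dtype\n",
    "        with np.load(os.path.join(data_dir, fname), allow_pickle=True) as data:\n",
    "            trials = data['neural_logits_concatenated']\n",
    "            if max_samples_per_file != -1 and len(trials) > max_samples_per_file:\n",
    "                trials = trials[:max_samples_per_file]\n",
    "            for trial in trials:\n",
    "                yield trial"
   ]
  },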
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 📊 训练结果分析"
]
},
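  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick look at which inputs the booster actually uses. This is a sketch under the assumption that `trainer.model` is the fitted `lightgbm.Booster` returned by `train_incremental` above; after PCA, feature *i* is simply the *i*-th principal component."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: rank PCA components by total split gain\n",
    "# (assumes trainer.model is a fitted lightgbm.Booster).\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "if trainer.model is not None:\n",
    "    gain = trainer.model.feature_importance(importance_type='gain')\n",
    "    order = np.argsort(gain)[::-1][:30]  # 30 most informative components\n",
    "    pos = np.arange(len(order))[::-1]    # plot top component at the top\n",
    "    plt.figure(figsize=(8, 6))\n",
    "    plt.barh(pos, gain[order])\n",
    "    plt.yticks(pos, [f'pc_{i}' for i in order])\n",
    "    plt.xlabel('total gain')\n",
    "    plt.title('Top-30 feature importances (sketch)')\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },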
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 🧪 模型性能评估"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"🧪 开始模型性能评估...\n",
"🧪 评估模型在val数据集上的性能...\n",
"\n",
"🔄 步骤3: 处理val数据...\n",
" 采样策略: 禁用\n",
" 正在加载文件 1/41: t15.2023.08.13_val_concatenated.npz\n",
" 正在加载文件 2/41: t15.2023.08.18_val_concatenated.npz\n",
" 正在加载文件 3/41: t15.2023.08.20_val_concatenated.npz\n",
" 正在加载文件 4/41: t15.2023.08.25_val_concatenated.npz\n",
" 正在加载文件 5/41: t15.2023.08.27_val_concatenated.npz\n",
" 正在加载文件 6/41: t15.2023.09.01_val_concatenated.npz\n",
" 正在加载文件 7/41: t15.2023.09.03_val_concatenated.npz\n",
" 正在加载文件 8/41: t15.2023.09.24_val_concatenated.npz\n",
" 正在加载文件 9/41: t15.2023.09.29_val_concatenated.npz\n",
" 正在加载文件 10/41: t15.2023.10.01_val_concatenated.npz\n",
" 正在加载文件 11/41: t15.2023.10.06_val_concatenated.npz\n",
" 正在加载文件 12/41: t15.2023.10.08_val_concatenated.npz\n",
" 正在加载文件 13/41: t15.2023.10.13_val_concatenated.npz\n",
" 正在加载文件 14/41: t15.2023.10.15_val_concatenated.npz\n",
" 正在加载文件 15/41: t15.2023.10.20_val_concatenated.npz\n",
" 正在加载文件 16/41: t15.2023.10.22_val_concatenated.npz\n",
" 正在加载文件 17/41: t15.2023.11.03_val_concatenated.npz\n",
" 正在加载文件 18/41: t15.2023.11.04_val_concatenated.npz\n",
" 正在加载文件 19/41: t15.2023.11.17_val_concatenated.npz\n",
" 正在加载文件 20/41: t15.2023.11.19_val_concatenated.npz\n",
" 正在加载文件 21/41: t15.2023.11.26_val_concatenated.npz\n",
" 正在加载文件 22/41: t15.2023.12.03_val_concatenated.npz\n",
" 正在加载文件 23/41: t15.2023.12.08_val_concatenated.npz\n",
" 正在加载文件 24/41: t15.2023.12.10_val_concatenated.npz\n",
" 正在加载文件 25/41: t15.2023.12.17_val_concatenated.npz\n",
" 正在加载文件 26/41: t15.2023.12.29_val_concatenated.npz\n",
" 正在加载文件 27/41: t15.2024.02.25_val_concatenated.npz\n",
" 正在加载文件 28/41: t15.2024.03.08_val_concatenated.npz\n",
" 正在加载文件 29/41: t15.2024.03.15_val_concatenated.npz\n",
" 正在加载文件 30/41: t15.2024.03.17_val_concatenated.npz\n",
" 正在加载文件 31/41: t15.2024.05.10_val_concatenated.npz\n",
" 正在加载文件 32/41: t15.2024.06.14_val_concatenated.npz\n",
" 正在加载文件 33/41: t15.2024.07.19_val_concatenated.npz\n",
" 正在加载文件 34/41: t15.2024.07.21_val_concatenated.npz\n",
" 正在加载文件 35/41: t15.2024.07.28_val_concatenated.npz\n",
" 正在加载文件 36/41: t15.2025.01.10_val_concatenated.npz\n",
" 正在加载文件 37/41: t15.2025.01.12_val_concatenated.npz\n",
" 正在加载文件 38/41: t15.2025.03.14_val_concatenated.npz\n",
" 正在加载文件 39/41: t15.2025.03.16_val_concatenated.npz\n",
" 正在加载文件 40/41: t15.2025.03.30_val_concatenated.npz\n",
" 正在加载文件 41/41: t15.2025.04.13_val_concatenated.npz\n",
" ✅ 处理完成: 321,773 样本, 1219 特征\n",
" 📊 数据集大小: 321,773 样本, 1219 特征\n",
" ⏱️ 预测时间: 377.15秒\n",
" 🎯 整体准确率: 0.6695\n",
"\n",
"📊 标签分布对比:\n",
"标签 | 真实数量 | 预测数量 | 准确率\n",
"----------------------------------------\n",
" 0 | 238,705 | 216,091 | 0.806\n",
" 1 | 707 | 241 | 0.008\n",
" 2 | 787 | 238 | 0.009\n",
" 3 | 4,019 | 1,203 | 0.023\n",
" 4 | 612 | 286 | 0.026\n",
" 5 | 280 | 13 | 0.007\n",
" 6 | 1,102 | 2,519 | 0.191\n",
" 7 | 708 | 391 | 0.020\n",
" 8 | 257 | 10 | 0.000\n",
" 9 | 2,072 | 1,548 | 0.037\n",
" 10 | 1,562 | 1,744 | 0.111\n",
" 11 | 1,012 | 600 | 0.039\n",
" 12 | 991 | 356 | 0.008\n",
" 13 | 621 | 200 | 0.011\n",
" 14 | 818 | 408 | 0.013\n",
" 15 | 451 | 202 | 0.007\n",
" 16 | 794 | 3,307 | 0.174\n",
" 17 | 2,698 | 1,757 | 0.039\n",
" 18 | 1,926 | 2,019 | 0.046\n",
" 19 | 274 | 11 | 0.000\n",
" 20 | 1,676 | 2,640 | 0.063\n",
" 21 | 2,425 | 3,135 | 0.091\n",
" 22 | 1,354 | 1,625 | 0.061\n",
" 23 | 2,440 | 1,076 | 0.017\n",
" 24 | 656 | 143 | 0.009\n",
" 25 | 474 | 99 | 0.004\n",
" 26 | 221 | 4 | 0.000\n",
" 27 | 1,102 | 1,437 | 0.083\n",
" 28 | 2,416 | 4,056 | 0.119\n",
" 29 | 3,002 | 2,222 | 0.046\n",
" 30 | 251 | 10 | 0.004\n",
" 31 | 4,039 | 8,386 | 0.122\n",
" 32 | 413 | 142 | 0.046\n",
" 33 | 185 | 1 | 0.000\n",
" 34 | 1,028 | 1,062 | 0.090\n",
" 35 | 820 | 162 | 0.017\n",
" 36 | 1,310 | 290 | 0.015\n",
" 37 | 609 | 841 | 0.128\n",
" 38 | 1,429 | 1,365 | 0.082\n",
" 39 | 102 | 3 | 0.000\n",
" 40 | 35,425 | 59,930 | 0.567\n",
"\n",
"🔍 关键标签性能分析:\n",
" 标签 0 (下采样目标): 准确率 0.8065, 样本数 238,705\n",
" 标签 40 (下采样目标): 准确率 0.5673, 样本数 35,425\n",
" 少数类平均准确率 (前5个): 0.0000\n",
"\n",
"📈 预测置信度分析:\n",
" 平均置信度: 0.6461\n",
" 置信度中位数: 0.6850\n",
" 高置信度预测 (>0.9): 104,643 / 321,773 (32.52%)\n",
"\n",
"============================================================\n",
"🎉 智能分批训练+数据平衡 评估完成!\n",
"✅ 实现了数据平衡和PCA降维的完整流程\n",
"✅ 使用了内存友好的分批训练策略\n",
"✅ 保持了验证集的原始分布以确保评估客观性\n"
]
}
],
"source": [
"# 🧪 模型性能评估\n",
"\n",
"from sklearn.metrics import classification_report, confusion_matrix\n",
"import numpy as np\n",
"\n",
"def evaluate_model_performance(model, pipeline, data_type='val'):\n",
" \"\"\"\n",
" 评估模型在指定数据集上的性能\n",
" \"\"\"\n",
" print(f\"🧪 评估模型在{data_type}数据集上的性能...\")\n",
" \n",
" # 加载数据\n",
" X, y = pipeline.step3_process_data(data_type, apply_sampling=False)\n",
" \n",
" if X is None or y is None:\n",
" print(f\"❌ 无法加载{data_type}数据\")\n",
" return None\n",
" \n",
" print(f\" 📊 数据集大小: {X.shape[0]:,} 样本, {X.shape[1]} 特征\")\n",
" \n",
" # 预测\n",
" start_time = time.time()\n",
" y_pred_proba = model.predict(X)\n",
" y_pred = y_pred_proba.argmax(axis=1)\n",
" pred_time = time.time() - start_time\n",
" \n",
" # 计算性能指标\n",
" accuracy = (y_pred == y).mean()\n",
" \n",
" print(f\" ⏱️ 预测时间: {pred_time:.2f}秒\")\n",
" print(f\" 🎯 整体准确率: {accuracy:.4f}\")\n",
" \n",
" # 分析各类别性能\n",
" from collections import Counter\n",
" true_counts = Counter(y)\n",
" pred_counts = Counter(y_pred)\n",
" \n",
" print(f\"\\n📊 标签分布对比:\")\n",
" print(\"标签 | 真实数量 | 预测数量 | 准确率\")\n",
" print(\"-\" * 40)\n",
" \n",
" label_accuracies = {}\n",
" for label in range(41):\n",
" if label in true_counts:\n",
" label_mask = (y == label)\n",
" if label_mask.sum() > 0:\n",
" label_acc = (y_pred[label_mask] == label).mean()\n",
" label_accuracies[label] = label_acc\n",
" true_count = true_counts.get(label, 0)\n",
" pred_count = pred_counts.get(label, 0)\n",
" print(f\"{label:4d} | {true_count:8,} | {pred_count:8,} | {label_acc:7.3f}\")\n",
" \n",
" # 重点分析关键标签\n",
" print(f\"\\n🔍 关键标签性能分析:\")\n",
" key_labels = [0, 40] # 下采样的标签\n",
" for label in key_labels:\n",
" if label in label_accuracies:\n",
" acc = label_accuracies[label]\n",
" count = true_counts.get(label, 0)\n",
" print(f\" 标签 {label} (下采样目标): 准确率 {acc:.4f}, 样本数 {count:,}\")\n",
" \n",
" # 少数类性能\n",
" minority_labels = [label for label, count in true_counts.items() \n",
" if count < 200 and label not in [0, 40]]\n",
" if minority_labels:\n",
" minority_accs = [label_accuracies.get(label, 0) for label in minority_labels[:5]]\n",
" avg_minority_acc = np.mean(minority_accs) if minority_accs else 0\n",
" print(f\" 少数类平均准确率 (前5个): {avg_minority_acc:.4f}\")\n",
" \n",
" # 置信度分析\n",
" max_proba = y_pred_proba.max(axis=1)\n",
" print(f\"\\n📈 预测置信度分析:\")\n",
" print(f\" 平均置信度: {max_proba.mean():.4f}\")\n",
" print(f\" 置信度中位数: {np.median(max_proba):.4f}\")\n",
" print(f\" 高置信度预测 (>0.9): {(max_proba > 0.9).sum():,} / {len(max_proba):,} ({(max_proba > 0.9).mean():.2%})\")\n",
" \n",
" return {\n",
" 'accuracy': accuracy,\n",
" 'prediction_time': pred_time,\n",
" 'label_accuracies': label_accuracies,\n",
" 'confidence_stats': {\n",
" 'mean': max_proba.mean(),\n",
" 'median': np.median(max_proba),\n",
" 'high_confidence_ratio': (max_proba > 0.9).mean()\n",
" }\n",
" }\n",
"\n",
"# 评估模型性能\n",
"if trainer.model:\n",
" print(\"🧪 开始模型性能评估...\")\n",
" \n",
" # 验证集评估\n",
" val_results = evaluate_model_performance(trainer.model, pipeline, 'val')\n",
" \n",
" print(f\"\\n\" + \"=\"*60)\n",
" print(\"🎉 智能分批训练+数据平衡 评估完成!\")\n",
" print(f\"✅ 实现了数据平衡和PCA降维的完整流程\")\n",
" print(f\"✅ 使用了内存友好的分批训练策略\")\n",
" print(f\"✅ 保持了验证集的原始分布以确保评估客观性\")\n",
"else:\n",
" print(\"❌ 模型尚未训练完成,请等待训练结束后运行此评估\")"
]
},
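  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The evaluation above returns a results dict; the short sketch below ranks classes by per-label accuracy to make the imbalance explicit. It assumes `val_results` from the previous cell is not `None`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: summarize per-label accuracy from the evaluation above.\n",
    "import numpy as np\n",
    "\n",
    "if val_results is not None:\n",
    "    accs = sorted(val_results['label_accuracies'].items(), key=lambda kv: kv[1])\n",
    "    print('Weakest 5 labels  :', [(l, round(a, 3)) for l, a in accs[:5]])\n",
    "    print('Strongest 5 labels:', [(l, round(a, 3)) for l, a in accs[-5:]])\n",
    "    macro = np.mean([a for _, a in accs])\n",
    "    print(f'Macro-average per-label accuracy: {macro:.4f}')"
   ]
  },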
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"🔍 开始在少量数据集上进行参数搜索...\n",
"================================================================================\n",
"📊 参数搜索配置:\n",
" 数据采样比例: 10.0%\n",
" 交叉验证折数: 3\n",
" 最大参数组合: 20\n",
" 参数空间大小: 729 种组合\n",
"\n",
"📦 准备少量数据集...\n"
]
},
{
"ename": "AttributeError",
"evalue": "'SmartDataPipeline' object has no attribute 'get_sample_data'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[19], line 37\u001b[0m\n\u001b[0;32m 34\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m📦 准备少量数据集...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 36\u001b[0m \u001b[38;5;66;03m# 使用已有的pipeline获取数据\u001b[39;00m\n\u001b[1;32m---> 37\u001b[0m sample_X, sample_y \u001b[38;5;241m=\u001b[39m \u001b[43mpipeline\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_sample_data\u001b[49m(ratio\u001b[38;5;241m=\u001b[39mPARAM_SEARCH_CONFIG[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124msample_ratio\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[0;32m 39\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m 样本数量: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(sample_X)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 40\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m 特征维度: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msample_X\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m1\u001b[39m]\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mif\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28mhasattr\u001b[39m(sample_X,\u001b[38;5;250m \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mshape\u001b[39m\u001b[38;5;124m'\u001b[39m)\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01melse\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mUnknown\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
"\u001b[1;31mAttributeError\u001b[0m: 'SmartDataPipeline' object has no attribute 'get_sample_data'"
]
}
],
"source": [
"print(\"🔍 开始在少量数据集上进行参数搜索...\")\n",
"print(\"=\" * 80)\n",
"\n",
"import itertools\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.metrics import accuracy_score\n",
"import lightgbm as lgb\n",
"import numpy as np\n",
"\n",
"# 参数搜索配置\n",
"PARAM_SEARCH_CONFIG = {\n",
" 'sample_ratio': 0.1, # 使用10%的数据进行参数搜索\n",
" 'cv_folds': 3, # 3折交叉验证\n",
" 'max_combinations': 20 # 最多测试20种参数组合\n",
"}\n",
"\n",
"# 定义参数搜索空间\n",
"param_grid = {\n",
" 'learning_rate': [0.05, 0.1, 0.15],\n",
" 'num_leaves': [31, 63, 127],\n",
" 'feature_fraction': [0.8, 0.9, 1.0],\n",
" 'bagging_fraction': [0.8, 0.9, 1.0],\n",
" 'max_depth': [6, 8, 10],\n",
" 'min_data_in_leaf': [10, 20, 30]\n",
"}\n",
"\n",
"print(f\"📊 参数搜索配置:\")\n",
"print(f\" 数据采样比例: {PARAM_SEARCH_CONFIG['sample_ratio']*100}%\")\n",
"print(f\" 交叉验证折数: {PARAM_SEARCH_CONFIG['cv_folds']}\")\n",
"print(f\" 最大参数组合: {PARAM_SEARCH_CONFIG['max_combinations']}\")\n",
"print(f\" 参数空间大小: {np.prod([len(v) for v in param_grid.values()])} 种组合\")\n",
"\n",
"# 获取少量数据用于参数搜索\n",
"print(f\"\\n📦 准备少量数据集...\")\n",
"\n",
"# 从验证集获取少量数据进行参数搜索\n",
"import os\n",
"data_dir = 'data/concatenated_data'\n",
"val_files = [f for f in os.listdir(data_dir) if f.endswith('_val_concatenated.npz')]\n",
"\n",
"# 只使用前几个文件进行快速参数搜索\n",
"sample_files = val_files[:max(1, int(len(val_files) * PARAM_SEARCH_CONFIG['sample_ratio']))]\n",
"print(f\" 使用文件数: {len(sample_files)} / {len(val_files)}\")\n",
"\n",
"# 加载样本数据\n",
"sample_X_list = []\n",
"sample_y_list = []\n",
"\n",
"for file in sample_files[:3]: # 最多使用3个文件\n",
" file_path = os.path.join(data_dir, file)\n",
" try:\n",
" data = np.load(file_path)\n",
" features = data['features']\n",
" labels = data['labels']\n",
" \n",
" # 进一步采样以减少数据量\n",
" n_samples = min(2000, len(features)) # 每个文件最多2000样本\n",
" indices = np.random.choice(len(features), n_samples, replace=False)\n",
" \n",
" sample_X_list.append(features[indices])\n",
" sample_y_list.append(labels[indices])\n",
" \n",
" print(f\" 加载文件: {file} - {n_samples} 样本\")\n",
" except Exception as e:\n",
" print(f\" ⚠️ 加载失败: {file} - {e}\")\n",
"\n",
"if len(sample_X_list) == 0:\n",
" raise ValueError(\"无法加载任何数据文件进行参数搜索\")\n",
"\n",
"# 合并数据\n",
"sample_X = np.vstack(sample_X_list)\n",
"sample_y = np.hstack(sample_y_list)\n",
"\n",
"# 应用PCA变换\n",
"if hasattr(pipeline, 'pca_components') and GLOBAL_PCA['is_fitted']:\n",
" sample_X = apply_pca_transform(sample_X)\n",
"\n",
"print(f\" 总样本数量: {len(sample_X)}\")\n",
"print(f\" 特征维度: {sample_X.shape[1]}\")\n",
"print(f\" 标签分布: {np.bincount(sample_y)[:5]}... (前5个标签)\")\n",
"\n",
"# 生成参数组合\n",
"print(f\"\\n🎯 生成参数组合...\")\n",
"param_names = list(param_grid.keys())\n",
"param_values = list(param_grid.values())\n",
"\n",
"# 随机采样参数组合\n",
"np.random.seed(42)\n",
"all_combinations = list(itertools.product(*param_values))\n",
"np.random.shuffle(all_combinations)\n",
"selected_combinations = all_combinations[:PARAM_SEARCH_CONFIG['max_combinations']]\n",
"\n",
"print(f\" 实际测试组合数: {len(selected_combinations)}\")\n",
"\n",
"# 参数搜索函数\n",
"def evaluate_params(params_dict, X, y, cv_folds=3):\n",
" \"\"\"评估参数组合的性能\"\"\"\n",
" try:\n",
" skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)\n",
" scores = []\n",
" \n",
" for train_idx, val_idx in skf.split(X, y):\n",
" X_train_fold, X_val_fold = X[train_idx], X[val_idx]\n",
" y_train_fold, y_val_fold = y[train_idx], y[val_idx]\n",
" \n",
" # 创建LightGBM数据集\n",
" train_data = lgb.Dataset(X_train_fold, label=y_train_fold)\n",
" val_data = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data)\n",
" \n",
" # 训练模型\n",
" model = lgb.train(\n",
" params_dict,\n",
" train_data,\n",
" valid_sets=[val_data],\n",
" num_boost_round=50, # 少量轮数快速评估\n",
" callbacks=[lgb.early_stopping(10), lgb.log_evaluation(0)]\n",
" )\n",
" \n",
" # 预测和评估\n",
" y_pred = model.predict(X_val_fold)\n",
" y_pred_binary = (y_pred > 0.5).astype(int)\n",
" score = accuracy_score(y_val_fold, y_pred_binary)\n",
" scores.append(score)\n",
" \n",
" return np.mean(scores), np.std(scores)\n",
" except Exception as e:\n",
" print(f\" ⚠️ 参数组合评估失败: {e}\")\n",
" return 0.0, 1.0\n",
"\n",
"# 开始参数搜索\n",
"print(f\"\\n🚀 开始参数搜索...\")\n",
"best_score = 0\n",
"best_params = None\n",
"best_std = 1.0\n",
"results = []\n",
"\n",
"for i, combination in enumerate(selected_combinations):\n",
" params_dict = dict(zip(param_names, combination))\n",
" \n",
" # 添加固定参数\n",
" params_dict.update({\n",
" 'objective': 'binary',\n",
" 'metric': 'binary_logloss',\n",
" 'boosting_type': 'gbdt',\n",
" 'verbosity': -1,\n",
" 'seed': 42\n",
" })\n",
" \n",
" print(f\"\\n🔧 测试组合 {i+1}/{len(selected_combinations)}:\")\n",
" print(f\" 参数: {params_dict}\")\n",
" \n",
" # 评估参数\n",
" mean_score, std_score = evaluate_params(params_dict, sample_X, sample_y, PARAM_SEARCH_CONFIG['cv_folds'])\n",
" \n",
" results.append({\n",
" 'params': params_dict.copy(),\n",
" 'mean_score': mean_score,\n",
" 'std_score': std_score\n",
" })\n",
" \n",
" print(f\" 性能: {mean_score:.4f} ± {std_score:.4f}\")\n",
" \n",
" # 更新最佳参数\n",
" if mean_score > best_score:\n",
" best_score = mean_score\n",
" best_params = params_dict.copy()\n",
" best_std = std_score\n",
" print(f\" ✨ 新的最佳参数!\")\n",
"\n",
"print(f\"\\n🏆 参数搜索完成!\")\n",
"print(f\"=\" * 80)\n",
"print(f\"🎯 最佳参数组合:\")\n",
"for key, value in best_params.items():\n",
" if key not in ['objective', 'metric', 'boosting_type', 'verbosity', 'seed']:\n",
" print(f\" {key}: {value}\")\n",
"\n",
"print(f\"\\n📈 最佳性能: {best_score:.4f} ± {best_std:.4f}\")\n",
"\n",
"# 保存结果\n",
"print(f\"\\n💾 保存参数搜索结果...\")\n",
"BEST_PARAMS = best_params\n",
"PARAM_SEARCH_RESULTS = results\n",
"\n",
"print(f\" 最佳参数已保存到变量: BEST_PARAMS\")\n",
"print(f\" 所有结果已保存到变量: PARAM_SEARCH_RESULTS\")\n",
"print(f\" 共测试了 {len(results)} 种参数组合\")\n",
"\n",
"# 显示前5个最佳结果\n",
"print(f\"\\n🔝 Top 5 参数组合:\")\n",
"sorted_results = sorted(results, key=lambda x: x['mean_score'], reverse=True)\n",
"for i, result in enumerate(sorted_results[:5]):\n",
" print(f\" {i+1}. 分数: {result['mean_score']:.4f} ± {result['std_score']:.4f}\")\n",
" key_params = {k: v for k, v in result['params'].items() \n",
" if k not in ['objective', 'metric', 'boosting_type', 'verbosity', 'seed']}\n",
" print(f\" 参数: {key_params}\")\n",
"\n",
"print(f\"\\n✅ 参数搜索完成!可以使用 BEST_PARAMS 进行后续训练\")"
]
},
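  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The cell above samples parameter combinations by shuffling the full `itertools.product` grid. An equivalent, slightly tidier option is scikit-learn's `ParameterSampler`; the sketch below draws 10 combinations from the same search space (illustrative only)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: draw random combinations with sklearn instead of shuffling\n",
    "# the full itertools.product grid (same search space as above).\n",
    "from sklearn.model_selection import ParameterSampler\n",
    "\n",
    "param_grid = {\n",
    "    'learning_rate': [0.05, 0.1, 0.15],\n",
    "    'num_leaves': [31, 63, 127],\n",
    "    'feature_fraction': [0.8, 0.9, 1.0],\n",
    "    'bagging_fraction': [0.8, 0.9, 1.0],\n",
    "    'max_depth': [6, 8, 10],\n",
    "    'min_data_in_leaf': [10, 20, 30]\n",
    "}\n",
    "sampled = list(ParameterSampler(param_grid, n_iter=10, random_state=42))\n",
    "for p in sampled[:3]:\n",
    "    print(p)"
   ]
  },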
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"🔍 重新开始参数搜索(修复版)...\n",
"================================================================================\n",
"📊 参数搜索配置:\n",
" 数据采样比例: 10.0%\n",
" 交叉验证折数: 3\n",
" 最大参数组合: 10\n",
" 参数空间大小: 48 种组合\n",
"\n",
"📦 准备少量数据集...\n"
]
},
{
"ename": "FileNotFoundError",
"evalue": "[WinError 3] 系统找不到指定的路径。: 'data/concatenated_data'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[20], line 38\u001b[0m\n\u001b[0;32m 36\u001b[0m \u001b[38;5;66;03m# 从验证集获取少量数据进行参数搜索\u001b[39;00m\n\u001b[0;32m 37\u001b[0m data_dir \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata/concatenated_data\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m---> 38\u001b[0m val_files \u001b[38;5;241m=\u001b[39m [f \u001b[38;5;28;01mfor\u001b[39;00m f \u001b[38;5;129;01min\u001b[39;00m \u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlistdir\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m f\u001b[38;5;241m.\u001b[39mendswith(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m_val_concatenated.npz\u001b[39m\u001b[38;5;124m'\u001b[39m)]\n\u001b[0;32m 40\u001b[0m \u001b[38;5;66;03m# 只使用前2个文件进行快速参数搜索\u001b[39;00m\n\u001b[0;32m 41\u001b[0m sample_files \u001b[38;5;241m=\u001b[39m val_files[:\u001b[38;5;241m2\u001b[39m]\n",
"\u001b[1;31mFileNotFoundError\u001b[0m: [WinError 3] 系统找不到指定的路径。: 'data/concatenated_data'"
]
}
],
"source": [
"print(\"🔍 重新开始参数搜索(修复版)...\")\n",
"print(\"=\" * 80)\n",
"\n",
"import itertools\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.metrics import accuracy_score\n",
"import lightgbm as lgb\n",
"import numpy as np\n",
"import os\n",
"\n",
"# 参数搜索配置\n",
"PARAM_SEARCH_CONFIG = {\n",
" 'sample_ratio': 0.1, # 使用10%的数据进行参数搜索\n",
" 'cv_folds': 3, # 3折交叉验证\n",
" 'max_combinations': 10 # 减少到10种参数组合以加快速度\n",
"}\n",
"\n",
"# 定义参数搜索空间(简化版)\n",
"param_grid = {\n",
" 'learning_rate': [0.05, 0.1, 0.15],\n",
" 'num_leaves': [31, 63],\n",
" 'feature_fraction': [0.8, 1.0],\n",
" 'bagging_fraction': [0.8, 1.0],\n",
" 'max_depth': [6, 8],\n",
"}\n",
"\n",
"print(f\"📊 参数搜索配置:\")\n",
"print(f\" 数据采样比例: {PARAM_SEARCH_CONFIG['sample_ratio']*100}%\")\n",
"print(f\" 交叉验证折数: {PARAM_SEARCH_CONFIG['cv_folds']}\")\n",
"print(f\" 最大参数组合: {PARAM_SEARCH_CONFIG['max_combinations']}\")\n",
"print(f\" 参数空间大小: {np.prod([len(v) for v in param_grid.values()])} 种组合\")\n",
"\n",
"# 获取少量数据用于参数搜索\n",
"print(f\"\\n📦 准备少量数据集...\")\n",
"\n",
"# 从验证集获取少量数据进行参数搜索\n",
"data_dir = 'data/concatenated_data'\n",
"val_files = [f for f in os.listdir(data_dir) if f.endswith('_val_concatenated.npz')]\n",
"\n",
"# 只使用前2个文件进行快速参数搜索\n",
"sample_files = val_files[:2]\n",
"print(f\" 使用文件数: {len(sample_files)} / {len(val_files)}\")\n",
"\n",
"# 加载样本数据\n",
"sample_X_list = []\n",
"sample_y_list = []\n",
"\n",
"for file in sample_files:\n",
" file_path = os.path.join(data_dir, file)\n",
" try:\n",
" data = np.load(file_path)\n",
" features = data['features']\n",
" labels = data['labels']\n",
" \n",
" # 进一步采样以减少数据量\n",
" n_samples = min(1000, len(features)) # 每个文件最多1000样本\n",
" indices = np.random.choice(len(features), n_samples, replace=False)\n",
" \n",
" sample_X_list.append(features[indices])\n",
" sample_y_list.append(labels[indices])\n",
" \n",
" print(f\" 加载文件: {file} - {n_samples} 样本\")\n",
" except Exception as e:\n",
" print(f\" ⚠️ 加载失败: {file} - {e}\")\n",
"\n",
"if len(sample_X_list) == 0:\n",
" raise ValueError(\"无法加载任何数据文件进行参数搜索\")\n",
"\n",
"# 合并数据\n",
"sample_X = np.vstack(sample_X_list)\n",
"sample_y = np.hstack(sample_y_list)\n",
"\n",
"# 应用PCA变换如果已经拟合\n",
"if GLOBAL_PCA['is_fitted']:\n",
" sample_X = apply_pca_transform(sample_X)\n",
"\n",
"print(f\" 总样本数量: {len(sample_X)}\")\n",
"print(f\" 特征维度: {sample_X.shape[1]}\")\n",
"\n",
"# 只保留前3个类别以简化问题\n",
"mask = sample_y < 3\n",
"sample_X = sample_X[mask]\n",
"sample_y = sample_y[mask]\n",
"\n",
"print(f\" 简化后样本数量: {len(sample_X)}\")\n",
"print(f\" 标签分布: {np.bincount(sample_y)}\")\n",
"\n",
"# 生成参数组合\n",
"print(f\"\\n🎯 生成参数组合...\")\n",
"param_names = list(param_grid.keys())\n",
"param_values = list(param_grid.values())\n",
"\n",
"# 随机采样参数组合\n",
"np.random.seed(42)\n",
"all_combinations = list(itertools.product(*param_values))\n",
"np.random.shuffle(all_combinations)\n",
"selected_combinations = all_combinations[:PARAM_SEARCH_CONFIG['max_combinations']]\n",
"\n",
"print(f\" 实际测试组合数: {len(selected_combinations)}\")\n",
"\n",
"# 参数搜索函数\n",
"def evaluate_params(params_dict, X, y, cv_folds=3):\n",
" \"\"\"评估参数组合的性能\"\"\"\n",
" try:\n",
" skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)\n",
" scores = []\n",
" \n",
" for train_idx, val_idx in skf.split(X, y):\n",
" X_train_fold, X_val_fold = X[train_idx], X[val_idx]\n",
" y_train_fold, y_val_fold = y[train_idx], y[val_idx]\n",
" \n",
" # 创建LightGBM数据集\n",
" train_data = lgb.Dataset(X_train_fold, label=y_train_fold)\n",
" val_data = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data)\n",
" \n",
" # 训练模型\n",
" model = lgb.train(\n",
" params_dict,\n",
" train_data,\n",
" valid_sets=[val_data],\n",
" num_boost_round=50, # 少量轮数快速评估\n",
" callbacks=[lgb.early_stopping(10), lgb.log_evaluation(0)]\n",
" )\n",
" \n",
" # 预测和评估\n",
" y_pred = model.predict(X_val_fold)\n",
" if len(np.unique(y)) > 2: # 多分类\n",
" y_pred_class = np.argmax(y_pred, axis=1)\n",
" else: # 二分类\n",
" y_pred_class = (y_pred > 0.5).astype(int)\n",
" \n",
" score = accuracy_score(y_val_fold, y_pred_class)\n",
" scores.append(score)\n",
" \n",
" return np.mean(scores), np.std(scores)\n",
" except Exception as e:\n",
" print(f\" ⚠️ 参数组合评估失败: {e}\")\n",
" return 0.0, 1.0\n",
"\n",
"# 开始参数搜索\n",
"print(f\"\\n🚀 开始参数搜索...\")\n",
"best_score = 0\n",
"best_params = None\n",
"best_std = 1.0\n",
"results = []\n",
"\n",
"for i, combination in enumerate(selected_combinations):\n",
" params_dict = dict(zip(param_names, combination))\n",
" \n",
" # 添加固定参数\n",
" if len(np.unique(sample_y)) > 2:\n",
" params_dict.update({\n",
" 'objective': 'multiclass',\n",
" 'num_class': len(np.unique(sample_y)),\n",
" 'metric': 'multi_logloss',\n",
" })\n",
" else:\n",
" params_dict.update({\n",
" 'objective': 'binary',\n",
" 'metric': 'binary_logloss',\n",
" })\n",
" \n",
" params_dict.update({\n",
" 'boosting_type': 'gbdt',\n",
" 'verbosity': -1,\n",
" 'seed': 42\n",
" })\n",
" \n",
" print(f\"\\n🔧 测试组合 {i+1}/{len(selected_combinations)}:\")\n",
" key_params = {k: v for k, v in params_dict.items() \n",
" if k not in ['objective', 'metric', 'boosting_type', 'verbosity', 'seed', 'num_class']}\n",
" print(f\" 参数: {key_params}\")\n",
" \n",
" # 评估参数\n",
" mean_score, std_score = evaluate_params(params_dict, sample_X, sample_y, PARAM_SEARCH_CONFIG['cv_folds'])\n",
" \n",
" results.append({\n",
" 'params': params_dict.copy(),\n",
" 'mean_score': mean_score,\n",
" 'std_score': std_score\n",
" })\n",
" \n",
" print(f\" 性能: {mean_score:.4f} ± {std_score:.4f}\")\n",
" \n",
" # 更新最佳参数\n",
" if mean_score > best_score:\n",
" best_score = mean_score\n",
" best_params = params_dict.copy()\n",
" best_std = std_score\n",
" print(f\" ✨ 新的最佳参数!\")\n",
"\n",
"print(f\"\\n🏆 参数搜索完成!\")\n",
"print(f\"=\" * 80)\n",
"print(f\"🎯 最佳参数组合:\")\n",
"for key, value in best_params.items():\n",
" if key not in ['objective', 'metric', 'boosting_type', 'verbosity', 'seed', 'num_class']:\n",
" print(f\" {key}: {value}\")\n",
"\n",
"print(f\"\\n📈 最佳性能: {best_score:.4f} ± {best_std:.4f}\")\n",
"\n",
"# 保存结果\n",
"BEST_PARAMS = best_params\n",
"PARAM_SEARCH_RESULTS = results\n",
"\n",
"print(f\"\\n💾 参数搜索结果已保存:\")\n",
"print(f\" 最佳参数变量: BEST_PARAMS\")\n",
"print(f\" 所有结果变量: PARAM_SEARCH_RESULTS\")\n",
"print(f\" 共测试了 {len(results)} 种参数组合\")\n",
"\n",
"# 显示前5个最佳结果\n",
"print(f\"\\n🔝 Top 5 参数组合:\")\n",
"sorted_results = sorted(results, key=lambda x: x['mean_score'], reverse=True)\n",
"for i, result in enumerate(sorted_results[:5]):\n",
" key_params = {k: v for k, v in result['params'].items() \n",
" if k not in ['objective', 'metric', 'boosting_type', 'verbosity', 'seed', 'num_class']}\n",
" print(f\" {i+1}. 分数: {result['mean_score']:.4f} ± {result['std_score']:.4f}\")\n",
" print(f\" 参数: {key_params}\")\n",
"\n",
"print(f\"\\n✅ 参数搜索完成!可以使用 BEST_PARAMS 进行后续训练\")"
]
},
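  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `FileNotFoundError` above comes from a relative path that only resolves when the notebook's working directory is the repo root. A small sketch of a more forgiving lookup follows; the candidate paths are the two used elsewhere in this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: resolve the data directory from candidate roots instead of\n",
    "# relying on the current working directory (paths as used in this notebook).\n",
    "import os\n",
    "\n",
    "CANDIDATE_DIRS = [\n",
    "    'data/concatenated_data',\n",
    "    r'f:\\BRAIN-TO-TEXT\\nejm-brain-to-text\\data\\concatenated_data',\n",
    "]\n",
    "data_dir = next((p for p in CANDIDATE_DIRS if os.path.isdir(p)), None)\n",
    "if data_dir is None:\n",
    "    raise FileNotFoundError(f'concatenated_data not found in {CANDIDATE_DIRS}')\n",
    "print('resolved data_dir:', data_dir)"
   ]
  },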
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"🔍 参数搜索(使用正确路径)...\n",
"================================================================================\n",
"📊 参数搜索配置:\n",
" 交叉验证折数: 3\n",
" 最大参数组合: 8\n",
" 参数空间大小: 16 种组合\n",
"\n",
"📦 准备少量数据集...\n",
" 使用文件数: 2 / 41\n",
" ⚠️ 加载失败: t15.2023.08.13_val_concatenated.npz - 'features is not a file in the archive'\n",
" ⚠️ 加载失败: t15.2023.08.18_val_concatenated.npz - 'features is not a file in the archive'\n"
]
},
{
"ename": "ValueError",
"evalue": "无法加载任何数据文件进行参数搜索",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[21], line 64\u001b[0m\n\u001b[0;32m 61\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m ⚠️ 加载失败: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m - \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 63\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(sample_X_list) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m---> 64\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m无法加载任何数据文件进行参数搜索\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 66\u001b[0m \u001b[38;5;66;03m# 合并数据\u001b[39;00m\n\u001b[0;32m 67\u001b[0m sample_X \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mvstack(sample_X_list)\n",
"\u001b[1;31mValueError\u001b[0m: 无法加载任何数据文件进行参数搜索"
]
}
],
"source": [
"print(\"🔍 参数搜索(使用正确路径)...\")\n",
"print(\"=\" * 80)\n",
"\n",
"import itertools\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.metrics import accuracy_score\n",
"import lightgbm as lgb\n",
"import numpy as np\n",
"import os\n",
"\n",
"# 参数搜索配置\n",
"PARAM_SEARCH_CONFIG = {\n",
" 'cv_folds': 3, # 3折交叉验证\n",
" 'max_combinations': 8 # 减少到8种参数组合以加快速度\n",
"}\n",
"\n",
"# 定义参数搜索空间(简化版)\n",
"param_grid = {\n",
" 'learning_rate': [0.05, 0.1],\n",
" 'num_leaves': [31, 63],\n",
" 'feature_fraction': [0.8, 1.0],\n",
" 'max_depth': [6, 8],\n",
"}\n",
"\n",
"print(f\"📊 参数搜索配置:\")\n",
"print(f\" 交叉验证折数: {PARAM_SEARCH_CONFIG['cv_folds']}\")\n",
"print(f\" 最大参数组合: {PARAM_SEARCH_CONFIG['max_combinations']}\")\n",
"print(f\" 参数空间大小: {np.prod([len(v) for v in param_grid.values()])} 种组合\")\n",
"\n",
"# 获取少量数据用于参数搜索\n",
"print(f\"\\n📦 准备少量数据集...\")\n",
"\n",
"# 使用绝对路径\n",
"data_dir = r'f:\\BRAIN-TO-TEXT\\nejm-brain-to-text\\data\\concatenated_data'\n",
"val_files = [f for f in os.listdir(data_dir) if f.endswith('_val_concatenated.npz')]\n",
"\n",
"# 只使用前2个文件进行快速参数搜索\n",
"sample_files = val_files[:2]\n",
"print(f\" 使用文件数: {len(sample_files)} / {len(val_files)}\")\n",
"\n",
"# 加载样本数据\n",
"sample_X_list = []\n",
"sample_y_list = []\n",
"\n",
"for file in sample_files:\n",
" file_path = os.path.join(data_dir, file)\n",
" try:\n",
" data = np.load(file_path)\n",
" features = data['features']\n",
" labels = data['labels']\n",
" \n",
" # 进一步采样以减少数据量\n",
" n_samples = min(500, len(features)) # 每个文件最多500样本\n",
" indices = np.random.choice(len(features), n_samples, replace=False)\n",
" \n",
" sample_X_list.append(features[indices])\n",
" sample_y_list.append(labels[indices])\n",
" \n",
" print(f\" 加载文件: {file} - {n_samples} 样本\")\n",
" except Exception as e:\n",
" print(f\" ⚠️ 加载失败: {file} - {e}\")\n",
"\n",
"if len(sample_X_list) == 0:\n",
" raise ValueError(\"无法加载任何数据文件进行参数搜索\")\n",
"\n",
"# 合并数据\n",
"sample_X = np.vstack(sample_X_list)\n",
"sample_y = np.hstack(sample_y_list)\n",
"\n",
"# 应用PCA变换如果已经拟合\n",
"if GLOBAL_PCA['is_fitted']:\n",
" sample_X = apply_pca_transform(sample_X)\n",
"\n",
"print(f\" 总样本数量: {len(sample_X)}\")\n",
"print(f\" 特征维度: {sample_X.shape[1]}\")\n",
"\n",
"# 简化为二分类问题标签0 vs 其他\n",
"sample_y_binary = (sample_y == 0).astype(int)\n",
"print(f\" 二分类标签分布: {np.bincount(sample_y_binary)}\")\n",
"\n",
"# 生成参数组合\n",
"print(f\"\\n🎯 生成参数组合...\")\n",
"param_names = list(param_grid.keys())\n",
"param_values = list(param_grid.values())\n",
"\n",
"# 随机采样参数组合\n",
"np.random.seed(42)\n",
"all_combinations = list(itertools.product(*param_values))\n",
"np.random.shuffle(all_combinations)\n",
"selected_combinations = all_combinations[:PARAM_SEARCH_CONFIG['max_combinations']]\n",
"\n",
"print(f\" 实际测试组合数: {len(selected_combinations)}\")\n",
"\n",
"# 参数搜索函数\n",
"def evaluate_params(params_dict, X, y, cv_folds=3):\n",
" \"\"\"评估参数组合的性能\"\"\"\n",
" try:\n",
" skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)\n",
" scores = []\n",
" \n",
" for train_idx, val_idx in skf.split(X, y):\n",
" X_train_fold, X_val_fold = X[train_idx], X[val_idx]\n",
" y_train_fold, y_val_fold = y[train_idx], y[val_idx]\n",
" \n",
" # 创建LightGBM数据集\n",
" train_data = lgb.Dataset(X_train_fold, label=y_train_fold)\n",
" val_data = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data)\n",
" \n",
" # 训练模型\n",
" model = lgb.train(\n",
" params_dict,\n",
" train_data,\n",
" valid_sets=[val_data],\n",
" num_boost_round=30, # 少量轮数快速评估\n",
" callbacks=[lgb.early_stopping(5), lgb.log_evaluation(0)]\n",
" )\n",
" \n",
" # 预测和评估\n",
" y_pred = model.predict(X_val_fold)\n",
" y_pred_class = (y_pred > 0.5).astype(int)\n",
" \n",
" score = accuracy_score(y_val_fold, y_pred_class)\n",
" scores.append(score)\n",
" \n",
" return np.mean(scores), np.std(scores)\n",
" except Exception as e:\n",
" print(f\" ⚠️ 参数组合评估失败: {e}\")\n",
" return 0.0, 1.0\n",
"\n",
"# 开始参数搜索\n",
"print(f\"\\n🚀 开始参数搜索...\")\n",
"best_score = 0\n",
"best_params = None\n",
"best_std = 1.0\n",
"results = []\n",
"\n",
"for i, combination in enumerate(selected_combinations):\n",
" params_dict = dict(zip(param_names, combination))\n",
" \n",
" # 添加固定参数\n",
" params_dict.update({\n",
" 'objective': 'binary',\n",
" 'metric': 'binary_logloss',\n",
" 'boosting_type': 'gbdt',\n",
" 'verbosity': -1,\n",
" 'seed': 42\n",
" })\n",
" \n",
" print(f\"\\n🔧 测试组合 {i+1}/{len(selected_combinations)}:\")\n",
" key_params = {k: v for k, v in params_dict.items() \n",
" if k not in ['objective', 'metric', 'boosting_type', 'verbosity', 'seed']}\n",
" print(f\" 参数: {key_params}\")\n",
" \n",
" # 评估参数\n",
" mean_score, std_score = evaluate_params(params_dict, sample_X, sample_y_binary, PARAM_SEARCH_CONFIG['cv_folds'])\n",
" \n",
" results.append({\n",
" 'params': params_dict.copy(),\n",
" 'mean_score': mean_score,\n",
" 'std_score': std_score\n",
" })\n",
" \n",
" print(f\" 性能: {mean_score:.4f} ± {std_score:.4f}\")\n",
" \n",
" # 更新最佳参数\n",
" if mean_score > best_score:\n",
" best_score = mean_score\n",
" best_params = params_dict.copy()\n",
" best_std = std_score\n",
" print(f\" ✨ 新的最佳参数!\")\n",
"\n",
"print(f\"\\n🏆 参数搜索完成!\")\n",
"print(f\"=\" * 80)\n",
"print(f\"🎯 最佳参数组合:\")\n",
"for key, value in best_params.items():\n",
" if key not in ['objective', 'metric', 'boosting_type', 'verbosity', 'seed']:\n",
" print(f\" {key}: {value}\")\n",
"\n",
"print(f\"\\n📈 最佳性能: {best_score:.4f} ± {best_std:.4f}\")\n",
"\n",
"# 保存结果\n",
"BEST_PARAMS_SEARCH = best_params\n",
"PARAM_SEARCH_RESULTS = results\n",
"\n",
"print(f\"\\n💾 参数搜索结果已保存:\")\n",
"print(f\" 最佳参数变量: BEST_PARAMS_SEARCH\")\n",
"print(f\" 所有结果变量: PARAM_SEARCH_RESULTS\")\n",
"print(f\" 共测试了 {len(results)} 种参数组合\")\n",
"\n",
"# 显示所有结果\n",
"print(f\"\\n🔝 所有参数组合结果:\")\n",
"sorted_results = sorted(results, key=lambda x: x['mean_score'], reverse=True)\n",
"for i, result in enumerate(sorted_results):\n",
" key_params = {k: v for k, v in result['params'].items() \n",
" if k not in ['objective', 'metric', 'boosting_type', 'verbosity', 'seed']}\n",
" print(f\" {i+1}. 分数: {result['mean_score']:.4f} ± {result['std_score']:.4f}\")\n",
" print(f\" 参数: {key_params}\")\n",
"\n",
"print(f\"\\n✅ 参数搜索完成!推荐使用最佳参数进行后续训练\")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"检查文件: t15.2023.08.13_val_concatenated.npz\n",
"文件中的键: ['neural_logits_concatenated', 'confidence_scores', 'pred_seq', 'block_num', 'trial_num', 'session', 'sentence_label', 'seq_class_ids', 'seq_len']\n"
]
},
{
"ename": "ValueError",
"evalue": "Object arrays cannot be loaded when allow_pickle=False",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[22], line 18\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[38;5;66;03m# 显示数据形状\u001b[39;00m\n\u001b[0;32m 17\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m data\u001b[38;5;241m.\u001b[39mkeys():\n\u001b[1;32m---> 18\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdata[key]\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mif\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28mhasattr\u001b[39m(\u001b[43mdata\u001b[49m\u001b[43m[\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m]\u001b[49m,\u001b[38;5;250m \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mshape\u001b[39m\u001b[38;5;124m'\u001b[39m)\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01melse\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28mtype\u001b[39m(data[key])\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 20\u001b[0m data\u001b[38;5;241m.\u001b[39mclose()\n",
"File \u001b[1;32md:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\numpy\\lib\\_npyio_impl.py:258\u001b[0m, in \u001b[0;36mNpzFile.__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 256\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m magic \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mformat\u001b[39m\u001b[38;5;241m.\u001b[39mMAGIC_PREFIX:\n\u001b[0;32m 257\u001b[0m \u001b[38;5;28mbytes\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mzip\u001b[38;5;241m.\u001b[39mopen(key)\n\u001b[1;32m--> 258\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mformat\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_array\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mbytes\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 259\u001b[0m \u001b[43m \u001b[49m\u001b[43mallow_pickle\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mallow_pickle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 260\u001b[0m \u001b[43m \u001b[49m\u001b[43mpickle_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpickle_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 261\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_header_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_header_size\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 262\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 263\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mzip\u001b[38;5;241m.\u001b[39mread(key)\n",
"File \u001b[1;32md:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\numpy\\lib\\format.py:822\u001b[0m, in \u001b[0;36mread_array\u001b[1;34m(fp, allow_pickle, pickle_kwargs, max_header_size)\u001b[0m\n\u001b[0;32m 819\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m dtype\u001b[38;5;241m.\u001b[39mhasobject:\n\u001b[0;32m 820\u001b[0m \u001b[38;5;66;03m# The array contained Python objects. We need to unpickle the data.\u001b[39;00m\n\u001b[0;32m 821\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m allow_pickle:\n\u001b[1;32m--> 822\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mObject arrays cannot be loaded when \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 823\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mallow_pickle=False\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 824\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m pickle_kwargs \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 825\u001b[0m pickle_kwargs \u001b[38;5;241m=\u001b[39m {}\n",
"\u001b[1;31mValueError\u001b[0m: Object arrays cannot be loaded when allow_pickle=False"
]
}
],
"source": [
"# 检查数据文件结构\n",
"import numpy as np\n",
"import os\n",
"\n",
"data_dir = r'f:\\BRAIN-TO-TEXT\\nejm-brain-to-text\\data\\concatenated_data'\n",
"val_files = [f for f in os.listdir(data_dir) if f.endswith('_val_concatenated.npz')]\n",
"\n",
"# 检查第一个文件的键\n",
"test_file = val_files[0]\n",
"file_path = os.path.join(data_dir, test_file)\n",
"\n",
"print(f\"检查文件: {test_file}\")\n",
"data = np.load(file_path)\n",
"print(f\"文件中的键: {list(data.keys())}\")\n",
"\n",
"# 显示数据形状\n",
"for key in data.keys():\n",
" print(f\" {key}: {data[key].shape if hasattr(data[key], 'shape') else type(data[key])}\")\n",
"\n",
"data.close()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"检查文件: t15.2023.08.13_val_concatenated.npz\n",
"文件中的键: ['neural_logits_concatenated', 'confidence_scores', 'pred_seq', 'block_num', 'trial_num', 'session', 'sentence_label', 'seq_class_ids', 'seq_len']\n",
" neural_logits_concatenated: (35,)\n",
" seq_class_ids: (35, 500)\n",
" seq_class_ids 前10个值: [[37 34 40 ... 0 0 0]\n",
" [16 5 40 ... 0 0 0]\n",
" [23 1 31 ... 0 0 0]\n",
" ...\n",
" [10 17 29 ... 0 0 0]\n",
" [16 2 38 ... 0 0 0]\n",
" [36 33 9 ... 0 0 0]]\n"
]
}
],
"source": [
"# 检查数据文件结构(修复版)\n",
"import numpy as np\n",
"import os\n",
"\n",
"data_dir = r'f:\\BRAIN-TO-TEXT\\nejm-brain-to-text\\data\\concatenated_data'\n",
"val_files = [f for f in os.listdir(data_dir) if f.endswith('_val_concatenated.npz')]\n",
"\n",
"# 检查第一个文件的键\n",
"test_file = val_files[0]\n",
"file_path = os.path.join(data_dir, test_file)\n",
"\n",
"print(f\"检查文件: {test_file}\")\n",
"data = np.load(file_path, allow_pickle=True)\n",
"print(f\"文件中的键: {list(data.keys())}\")\n",
"\n",
"# 检查neural_logits_concatenated和seq_class_ids的形状\n",
"if 'neural_logits_concatenated' in data:\n",
" neural_logits = data['neural_logits_concatenated']\n",
" print(f\" neural_logits_concatenated: {neural_logits.shape}\")\n",
"\n",
"if 'seq_class_ids' in data:\n",
" seq_class_ids = data['seq_class_ids']\n",
" print(f\" seq_class_ids: {seq_class_ids.shape}\")\n",
" print(f\" seq_class_ids 前10个值: {seq_class_ids[:10]}\")\n",
"\n",
"data.close()"
]
},
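  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Since `neural_logits_concatenated` is an object array of per-trial matrices, the sketch below prints the shape of the first few trials to confirm the time-step × class layout before any flattening (same directory and key as in the cell above)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: inspect per-trial shapes inside the object array\n",
    "# (same path and key as the cell above).\n",
    "import os\n",
    "import numpy as np\n",
    "\n",
    "data_dir = r'f:\\BRAIN-TO-TEXT\\nejm-brain-to-text\\data\\concatenated_data'\n",
    "fname = sorted(f for f in os.listdir(data_dir)\n",
    "               if f.endswith('_val_concatenated.npz'))[0]\n",
    "with np.load(os.path.join(data_dir, fname), allow_pickle=True) as data:\n",
    "    trials = data['neural_logits_concatenated']\n",
    "    print(f'{fname}: {len(trials)} trials')\n",
    "    for i, trial in enumerate(trials[:5]):\n",
    "        print(f'  trial {i}: shape={getattr(trial, \"shape\", None)}')"
   ]
  },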
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"🔍 LightGBM参数搜索最终版...\n",
"================================================================================\n",
"📊 参数搜索配置:\n",
" 交叉验证折数: 3\n",
" 最大参数组合: 6\n",
" 最大样本数: 1000\n",
" 参数空间大小: 8 种组合\n",
"\n",
"📦 准备少量数据集...\n",
" 使用文件: t15.2023.08.13_val_concatenated.npz\n",
" 总样本数量: 962\n",
" 特征维度: 7209\n",
" 二分类标签分布: [732 230]\n",
"\n",
"🎯 生成参数组合...\n",
" 实际测试组合数: 6\n",
"\n",
"🚀 开始参数搜索...\n",
"\n",
"🔧 测试组合 1/6:\n",
" 参数: {'learning_rate': 0.05, 'num_leaves': 31, 'max_depth': 6}\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's binary_logloss: 0.553502\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's binary_logloss: 0.553875\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's binary_logloss: 0.549469\n",
" 性能: 0.7609 ± 0.0011\n",
" ✨ 新的最佳参数!\n",
"\n",
"🔧 测试组合 2/6:\n",
" 参数: {'learning_rate': 0.05, 'num_leaves': 31, 'max_depth': 8}\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's binary_logloss: 0.554291\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's binary_logloss: 0.55475\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's binary_logloss: 0.55238\n",
" 性能: 0.7609 ± 0.0011\n",
"\n",
"🔧 测试组合 3/6:\n",
" 参数: {'learning_rate': 0.05, 'num_leaves': 63, 'max_depth': 6}\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's binary_logloss: 0.553502\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's binary_logloss: 0.553875\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's binary_logloss: 0.549469\n",
" 性能: 0.7609 ± 0.0011\n",
"\n",
"🔧 测试组合 4/6:\n",
" 参数: {'learning_rate': 0.05, 'num_leaves': 63, 'max_depth': 8}\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's binary_logloss: 0.554291\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's binary_logloss: 0.55475\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's binary_logloss: 0.55238\n",
" 性能: 0.7609 ± 0.0011\n",
"\n",
"🔧 测试组合 5/6:\n",
" 参数: {'learning_rate': 0.1, 'num_leaves': 31, 'max_depth': 6}\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's binary_logloss: 0.556997\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's binary_logloss: 0.557785\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's binary_logloss: 0.551978\n",
" 性能: 0.7609 ± 0.0011\n",
"\n",
"🔧 测试组合 6/6:\n",
" 参数: {'learning_rate': 0.1, 'num_leaves': 31, 'max_depth': 8}\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's binary_logloss: 0.55862\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's binary_logloss: 0.559016\n",
"Training until validation scores don't improve for 5 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's binary_logloss: 0.557107\n",
" 性能: 0.7609 ± 0.0011\n",
"\n",
"🏆 参数搜索完成!\n",
"================================================================================\n",
"🎯 最佳参数组合:\n",
" learning_rate: 0.05\n",
" num_leaves: 31\n",
" max_depth: 6\n",
"\n",
"📈 最佳性能: 0.7609 ± 0.0011\n",
"\n",
"💾 参数搜索结果已保存:\n",
" 最佳参数变量: BEST_PARAMS_FINAL\n",
" 所有结果变量: PARAM_SEARCH_RESULTS_FINAL\n",
" 共测试了 6 种参数组合\n",
"\n",
"🔝 所有参数组合结果:\n",
" 1. 分数: 0.7609 ± 0.0011\n",
" 参数: {'learning_rate': 0.05, 'num_leaves': 31, 'max_depth': 6}\n",
" 2. 分数: 0.7609 ± 0.0011\n",
" 参数: {'learning_rate': 0.05, 'num_leaves': 31, 'max_depth': 8}\n",
" 3. 分数: 0.7609 ± 0.0011\n",
" 参数: {'learning_rate': 0.05, 'num_leaves': 63, 'max_depth': 6}\n",
" 4. 分数: 0.7609 ± 0.0011\n",
" 参数: {'learning_rate': 0.05, 'num_leaves': 63, 'max_depth': 8}\n",
" 5. 分数: 0.7609 ± 0.0011\n",
" 参数: {'learning_rate': 0.1, 'num_leaves': 31, 'max_depth': 6}\n",
" 6. 分数: 0.7609 ± 0.0011\n",
" 参数: {'learning_rate': 0.1, 'num_leaves': 31, 'max_depth': 8}\n",
"\n",
"✅ 参数搜索完成!可以在后续训练中使用这些优化参数\n"
]
}
],
"source": [
"print(\"🔍 LightGBM参数搜索最终版...\")\n",
"print(\"=\" * 80)\n",
"\n",
"import itertools\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.metrics import accuracy_score\n",
"import lightgbm as lgb\n",
"import numpy as np\n",
"import os\n",
"\n",
"# 参数搜索配置\n",
"PARAM_SEARCH_CONFIG = {\n",
" 'cv_folds': 3, # 3折交叉验证\n",
" 'max_combinations': 6, # 减少到6种参数组合\n",
" 'max_samples': 1000 # 最多使用1000个样本\n",
"}\n",
"\n",
"# 定义参数搜索空间(精简版)\n",
"param_grid = {\n",
" 'learning_rate': [0.05, 0.1],\n",
" 'num_leaves': [31, 63],\n",
" 'max_depth': [6, 8],\n",
"}\n",
"\n",
"print(f\"📊 参数搜索配置:\")\n",
"print(f\" 交叉验证折数: {PARAM_SEARCH_CONFIG['cv_folds']}\")\n",
"print(f\" 最大参数组合: {PARAM_SEARCH_CONFIG['max_combinations']}\")\n",
"print(f\" 最大样本数: {PARAM_SEARCH_CONFIG['max_samples']}\")\n",
"print(f\" 参数空间大小: {np.prod([len(v) for v in param_grid.values()])} 种组合\")\n",
"\n",
"# 获取少量数据用于参数搜索\n",
"print(f\"\\n📦 准备少量数据集...\")\n",
"\n",
"# 使用绝对路径\n",
"data_dir = r'f:\\BRAIN-TO-TEXT\\nejm-brain-to-text\\data\\concatenated_data'\n",
"val_files = [f for f in os.listdir(data_dir) if f.endswith('_val_concatenated.npz')]\n",
"\n",
"# 只使用第一个文件进行快速参数搜索\n",
"test_file = val_files[0]\n",
"file_path = os.path.join(data_dir, test_file)\n",
"\n",
"print(f\" 使用文件: {test_file}\")\n",
"\n",
"# 加载数据\n",
"data = np.load(file_path, allow_pickle=True)\n",
"neural_logits = data['neural_logits_concatenated']\n",
"seq_class_ids = data['seq_class_ids']\n",
"\n",
"# 处理数据\n",
"all_features = []\n",
"all_labels = []\n",
"\n",
"for i, logits in enumerate(neural_logits):\n",
" if logits is not None and hasattr(logits, 'shape') and len(logits.shape) > 0:\n",
" # 获取对应的标签序列\n",
" labels = seq_class_ids[i]\n",
" \n",
" # 只取前面的一部分数据\n",
" max_len = min(len(logits), len(labels), 50) # 最多50个时间步\n",
" \n",
" for j in range(max_len):\n",
" if labels[j] != 0: # 跳过padding标签\n",
" all_features.append(logits[j].flatten())\n",
" all_labels.append(labels[j])\n",
"\n",
"data.close()\n",
"\n",
"if len(all_features) == 0:\n",
" raise ValueError(\"没有找到有效的特征数据\")\n",
"\n",
"# 转换为numpy数组\n",
"sample_X = np.array(all_features)\n",
"sample_y = np.array(all_labels)\n",
"\n",
"# 限制样本数量\n",
"if len(sample_X) > PARAM_SEARCH_CONFIG['max_samples']:\n",
" indices = np.random.choice(len(sample_X), PARAM_SEARCH_CONFIG['max_samples'], replace=False)\n",
" sample_X = sample_X[indices]\n",
" sample_y = sample_y[indices]\n",
"\n",
"# 应用PCA变换如果已经拟合\n",
"if GLOBAL_PCA['is_fitted']:\n",
" sample_X = apply_pca_transform(sample_X)\n",
"\n",
"print(f\" 总样本数量: {len(sample_X)}\")\n",
"print(f\" 特征维度: {sample_X.shape[1]}\")\n",
"\n",
"# 简化为二分类问题标签40 vs 其他\n",
"sample_y_binary = (sample_y == 40).astype(int)\n",
"print(f\" 二分类标签分布: {np.bincount(sample_y_binary)}\")\n",
"\n",
"# 生成参数组合\n",
"print(f\"\\n🎯 生成参数组合...\")\n",
"param_names = list(param_grid.keys())\n",
"param_values = list(param_grid.values())\n",
"\n",
"# 获取所有参数组合\n",
"all_combinations = list(itertools.product(*param_values))\n",
"selected_combinations = all_combinations[:PARAM_SEARCH_CONFIG['max_combinations']]\n",
"\n",
"print(f\" 实际测试组合数: {len(selected_combinations)}\")\n",
"\n",
"# 参数搜索函数\n",
"def evaluate_params(params_dict, X, y, cv_folds=3):\n",
" \"\"\"评估参数组合的性能\"\"\"\n",
" try:\n",
" skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)\n",
" scores = []\n",
" \n",
" for train_idx, val_idx in skf.split(X, y):\n",
" X_train_fold, X_val_fold = X[train_idx], X[val_idx]\n",
" y_train_fold, y_val_fold = y[train_idx], y[val_idx]\n",
" \n",
" # 创建LightGBM数据集\n",
" train_data = lgb.Dataset(X_train_fold, label=y_train_fold)\n",
" val_data = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data)\n",
" \n",
" # 训练模型\n",
" model = lgb.train(\n",
" params_dict,\n",
" train_data,\n",
" valid_sets=[val_data],\n",
" num_boost_round=20, # 少量轮数快速评估\n",
" callbacks=[lgb.early_stopping(5), lgb.log_evaluation(0)]\n",
" )\n",
" \n",
" # 预测和评估\n",
" y_pred = model.predict(X_val_fold)\n",
" y_pred_class = (y_pred > 0.5).astype(int)\n",
" \n",
" score = accuracy_score(y_val_fold, y_pred_class)\n",
" scores.append(score)\n",
" \n",
" return np.mean(scores), np.std(scores)\n",
" except Exception as e:\n",
" print(f\" ⚠️ 参数组合评估失败: {e}\")\n",
" return 0.0, 1.0\n",
"\n",
"# 开始参数搜索\n",
"print(f\"\\n🚀 开始参数搜索...\")\n",
"best_score = 0\n",
"best_params = None\n",
"best_std = 1.0\n",
"results = []\n",
"\n",
"for i, combination in enumerate(selected_combinations):\n",
" params_dict = dict(zip(param_names, combination))\n",
" \n",
" # 添加固定参数\n",
" params_dict.update({\n",
" 'objective': 'binary',\n",
" 'metric': 'binary_logloss',\n",
" 'boosting_type': 'gbdt',\n",
" 'verbosity': -1,\n",
" 'seed': 42,\n",
" 'feature_fraction': 0.8,\n",
" 'bagging_fraction': 0.8,\n",
" 'min_data_in_leaf': 20\n",
" })\n",
" \n",
" print(f\"\\n🔧 测试组合 {i+1}/{len(selected_combinations)}:\")\n",
" key_params = {k: v for k, v in params_dict.items() \n",
" if k in param_names}\n",
" print(f\" 参数: {key_params}\")\n",
" \n",
" # 评估参数\n",
" mean_score, std_score = evaluate_params(params_dict, sample_X, sample_y_binary, PARAM_SEARCH_CONFIG['cv_folds'])\n",
" \n",
" results.append({\n",
" 'params': params_dict.copy(),\n",
" 'mean_score': mean_score,\n",
" 'std_score': std_score\n",
" })\n",
" \n",
" print(f\" 性能: {mean_score:.4f} ± {std_score:.4f}\")\n",
" \n",
" # 更新最佳参数\n",
" if mean_score > best_score:\n",
" best_score = mean_score\n",
" best_params = params_dict.copy()\n",
" best_std = std_score\n",
" print(f\" ✨ 新的最佳参数!\")\n",
"\n",
"print(f\"\\n🏆 参数搜索完成!\")\n",
"print(f\"=\" * 80)\n",
"print(f\"🎯 最佳参数组合:\")\n",
"for key, value in best_params.items():\n",
" if key in param_names:\n",
" print(f\" {key}: {value}\")\n",
"\n",
"print(f\"\\n📈 最佳性能: {best_score:.4f} ± {best_std:.4f}\")\n",
"\n",
"# 保存结果\n",
"BEST_PARAMS_FINAL = best_params\n",
"PARAM_SEARCH_RESULTS_FINAL = results\n",
"\n",
"print(f\"\\n💾 参数搜索结果已保存:\")\n",
"print(f\" 最佳参数变量: BEST_PARAMS_FINAL\")\n",
"print(f\" 所有结果变量: PARAM_SEARCH_RESULTS_FINAL\")\n",
"print(f\" 共测试了 {len(results)} 种参数组合\")\n",
"\n",
"# 显示所有结果\n",
"print(f\"\\n🔝 所有参数组合结果:\")\n",
"sorted_results = sorted(results, key=lambda x: x['mean_score'], reverse=True)\n",
"for i, result in enumerate(sorted_results):\n",
" key_params = {k: v for k, v in result['params'].items() \n",
" if k in param_names}\n",
" print(f\" {i+1}. 分数: {result['mean_score']:.4f} ± {result['std_score']:.4f}\")\n",
" print(f\" 参数: {key_params}\")\n",
"\n",
"print(f\"\\n✅ 参数搜索完成!可以在后续训练中使用这些优化参数\")"
]
},
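  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The improved search below scores folds with accuracy and macro F1. With roughly 500 samples spread over 10 classes, some folds predict no samples at all for a class, which triggers sklearn's `UndefinedMetricWarning` (visible in the output). A hedged sketch of a scoring helper that handles this explicitly via `zero_division=0`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: fold scoring for multiclass probabilities; zero_division=0\n",
    "# suppresses UndefinedMetricWarning when a class gets no predictions.\n",
    "import numpy as np\n",
    "from sklearn.metrics import accuracy_score, f1_score\n",
    "\n",
    "def fold_scores(y_true, y_proba):\n",
    "    \"\"\"Assumes y_true is encoded 0..K-1 and y_proba has shape (n, K).\"\"\"\n",
    "    y_pred = np.argmax(y_proba, axis=1)\n",
    "    acc = accuracy_score(y_true, y_pred)\n",
    "    f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)\n",
    "    return acc, f1"
   ]
  },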
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"🔍 改进版参数搜索 - 扩大搜索空间和复杂度...\n",
"================================================================================\n",
"📊 改进的参数搜索配置:\n",
" 交叉验证折数: 5\n",
" 最大参数组合: 20\n",
" 最大样本数: 2000\n",
" 训练轮数: 100\n",
" 参数空间大小: 72000 种组合\n",
"\n",
"📦 准备更大的数据集...\n",
" 使用文件数: 3 / 41\n",
" 处理文件: t15.2023.08.13_val_concatenated.npz\n",
" 累计样本数: 962\n",
" 处理文件: t15.2023.08.18_val_concatenated.npz\n",
" 累计样本数: 2155\n",
" 处理文件: t15.2023.08.20_val_concatenated.npz\n",
" 累计样本数: 3414\n",
" 最终样本数量: 2000\n",
" 特征维度: 7209\n",
" 有效标签: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(6), np.int64(7), np.int64(9), np.int64(10), np.int64(11), np.int64(13)]\n",
" 过滤后样本数量: 492\n",
" 类别分布: [ 34 49 147 21 33 25 60 50 48 25]\n",
"\n",
"🎯 生成随机参数组合...\n",
" 生成了 20 个随机参数组合\n",
"\n",
"🚀 开始改进的参数搜索...\n",
"\n",
"🔧 测试组合 1/20:\n",
" 参数: {'learning_rate': np.float64(0.15), 'num_leaves': np.int64(255), 'max_depth': np.int64(9), 'feature_fraction': np.float64(1.0), 'bagging_fraction': np.float64(0.6), 'min_data_in_leaf': np.int64(5), 'lambda_l1': np.float64(1.0), 'lambda_l2': np.float64(0.1)}\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.11651\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\SoftWare\\Anaconda3\\envs\\b2txt25\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.14504\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.14196\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.2164\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.16782\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" 准确率: 0.2825 ± 0.0299\n",
" F1分数: 0.0830 ± 0.0224\n",
" ✨ 新的最佳参数F1分数: 0.0830\n",
"\n",
"🔧 测试组合 2/20:\n",
" 参数: {'learning_rate': np.float64(0.1), 'num_leaves': np.int64(63), 'max_depth': np.int64(9), 'feature_fraction': np.float64(1.0), 'bagging_fraction': np.float64(0.6), 'min_data_in_leaf': np.int64(50), 'lambda_l1': np.float64(1.0), 'lambda_l2': np.float64(0.1)}\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.08376\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.09276\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[3]\tvalid_0's multi_logloss: 2.10016\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.12363\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[5]\tvalid_0's multi_logloss: 2.07434\n",
" 准确率: 0.2988 ± 0.0035\n",
" F1分数: 0.0460 ± 0.0004\n",
"\n",
"🔧 测试组合 3/20:\n",
" 参数: {'learning_rate': np.float64(0.2), 'num_leaves': np.int64(31), 'max_depth': np.int64(12), 'feature_fraction': np.float64(0.8), 'bagging_fraction': np.float64(0.8), 'min_data_in_leaf': np.int64(10), 'lambda_l1': np.float64(0.0), 'lambda_l2': np.float64(0.0)}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.12531\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.2417\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[5]\tvalid_0's multi_logloss: 2.13152\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.20646\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.07364\n",
" 准确率: 0.2623 ± 0.0388\n",
" F1分数: 0.1037 ± 0.0407\n",
" ✨ 新的最佳参数F1分数: 0.1037\n",
"\n",
"🔧 测试组合 4/20:\n",
" 参数: {'learning_rate': np.float64(0.15), 'num_leaves': np.int64(31), 'max_depth': np.int64(-1), 'feature_fraction': np.float64(1.0), 'bagging_fraction': np.float64(0.6), 'min_data_in_leaf': np.int64(5), 'lambda_l1': np.float64(1.0), 'lambda_l2': np.float64(1.0)}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.099\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.09604\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.09527\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.1556\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[3]\tvalid_0's multi_logloss: 2.15148\n",
" 准确率: 0.2947 ± 0.0131\n",
" F1分数: 0.0580 ± 0.0173\n",
"\n",
"🔧 测试组合 5/20:\n",
" 参数: {'learning_rate': np.float64(0.05), 'num_leaves': np.int64(127), 'max_depth': np.int64(12), 'feature_fraction': np.float64(1.0), 'bagging_fraction': np.float64(0.9), 'min_data_in_leaf': np.int64(10), 'lambda_l1': np.float64(0.1), 'lambda_l2': np.float64(1.0)}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[5]\tvalid_0's multi_logloss: 2.09017\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.09111\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[4]\tvalid_0's multi_logloss: 2.09994\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.12214\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.10879\n",
" 准确率: 0.2967 ± 0.0034\n",
" F1分数: 0.0458 ± 0.0004\n",
"\n",
"🔧 测试组合 6/20:\n",
" 参数: {'learning_rate': np.float64(0.1), 'num_leaves': np.int64(127), 'max_depth': np.int64(12), 'feature_fraction': np.float64(1.0), 'bagging_fraction': np.float64(0.6), 'min_data_in_leaf': np.int64(20), 'lambda_l1': np.float64(0.0), 'lambda_l2': np.float64(1.0)}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.081\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.09223\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[3]\tvalid_0's multi_logloss: 2.08461\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.10818\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.11166\n",
" 准确率: 0.2988 ± 0.0035\n",
" F1分数: 0.0461 ± 0.0004\n",
"\n",
"🔧 测试组合 7/20:\n",
" 参数: {'learning_rate': np.float64(0.2), 'num_leaves': np.int64(15), 'max_depth': np.int64(6), 'feature_fraction': np.float64(1.0), 'bagging_fraction': np.float64(0.6), 'min_data_in_leaf': np.int64(50), 'lambda_l1': np.float64(0.1), 'lambda_l2': np.float64(0.1)}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.08483\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.08921\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.08247\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.13921\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.08953\n",
" 准确率: 0.2967 ± 0.0123\n",
" F1分数: 0.0530 ± 0.0152\n",
"\n",
"🔧 测试组合 8/20:\n",
" 参数: {'learning_rate': np.float64(0.05), 'num_leaves': np.int64(15), 'max_depth': np.int64(6), 'feature_fraction': np.float64(0.6), 'bagging_fraction': np.float64(0.8), 'min_data_in_leaf': np.int64(50), 'lambda_l1': np.float64(1.0), 'lambda_l2': np.float64(1.0)}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[4]\tvalid_0's multi_logloss: 2.08538\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[4]\tvalid_0's multi_logloss: 2.08843\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[14]\tvalid_0's multi_logloss: 2.09424\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[5]\tvalid_0's multi_logloss: 2.11347\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[16]\tvalid_0's multi_logloss: 2.09417\n",
" 准确率: 0.2988 ± 0.0035\n",
" F1分数: 0.0460 ± 0.0004\n",
"\n",
"🔧 测试组合 9/20:\n",
" 参数: {'learning_rate': np.float64(0.15), 'num_leaves': np.int64(255), 'max_depth': np.int64(9), 'feature_fraction': np.float64(0.8), 'bagging_fraction': np.float64(0.6), 'min_data_in_leaf': np.int64(50), 'lambda_l1': np.float64(0.1), 'lambda_l2': np.float64(0.1)}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.0937\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[3]\tvalid_0's multi_logloss: 2.09824\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.08626\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[3]\tvalid_0's multi_logloss: 2.12937\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.07891\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" 准确率: 0.3028 ± 0.0065\n",
" F1分数: 0.0588 ± 0.0162\n",
"\n",
"🔧 测试组合 10/20:\n",
" 参数: {'learning_rate': np.float64(0.05), 'num_leaves': np.int64(127), 'max_depth': np.int64(-1), 'feature_fraction': np.float64(0.9), 'bagging_fraction': np.float64(0.8), 'min_data_in_leaf': np.int64(10), 'lambda_l1': np.float64(0.1), 'lambda_l2': np.float64(0.1)}\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[7]\tvalid_0's multi_logloss: 2.08931\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[6]\tvalid_0's multi_logloss: 2.0929\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[11]\tvalid_0's multi_logloss: 2.08181\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.10389\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.09144\n",
" 准确率: 0.2968 ± 0.0049\n",
" F1分数: 0.0510 ± 0.0106\n",
"\n",
"🔧 测试组合 11/20:\n",
" 参数: {'learning_rate': np.float64(0.15), 'num_leaves': np.int64(127), 'max_depth': np.int64(3), 'feature_fraction': np.float64(0.8), 'bagging_fraction': np.float64(1.0), 'min_data_in_leaf': np.int64(5), 'lambda_l1': np.float64(0.0), 'lambda_l2': np.float64(0.1)}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.12579\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.16556\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.12374\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.17234\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.15226\n",
" 准确率: 0.2825 ± 0.0092\n",
" F1分数: 0.0699 ± 0.0227\n",
"\n",
"🔧 测试组合 12/20:\n",
" 参数: {'learning_rate': np.float64(0.2), 'num_leaves': np.int64(31), 'max_depth': np.int64(3), 'feature_fraction': np.float64(1.0), 'bagging_fraction': np.float64(1.0), 'min_data_in_leaf': np.int64(50), 'lambda_l1': np.float64(0.0), 'lambda_l2': np.float64(0.0)}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[3]\tvalid_0's multi_logloss: 2.07078\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.09673\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.08357\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.14388\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.08769\n",
" 准确率: 0.2886 ± 0.0141\n",
" F1分数: 0.0496 ± 0.0075\n",
"\n",
"🔧 测试组合 13/20:\n",
" 参数: {'learning_rate': np.float64(0.2), 'num_leaves': np.int64(255), 'max_depth': np.int64(3), 'feature_fraction': np.float64(0.6), 'bagging_fraction': np.float64(0.9), 'min_data_in_leaf': np.int64(5), 'lambda_l1': np.float64(0.0), 'lambda_l2': np.float64(1.0)}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.13601\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.08359\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.06893\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.15841\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.1325\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" 准确率: 0.2967 ± 0.0096\n",
" F1分数: 0.0585 ± 0.0161\n",
"\n",
"🔧 测试组合 14/20:\n",
" 参数: {'learning_rate': np.float64(0.1), 'num_leaves': np.int64(63), 'max_depth': np.int64(3), 'feature_fraction': np.float64(1.0), 'bagging_fraction': np.float64(0.9), 'min_data_in_leaf': np.int64(20), 'lambda_l1': np.float64(0.0), 'lambda_l2': np.float64(1.0)}\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[3]\tvalid_0's multi_logloss: 2.08199\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.09063\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[3]\tvalid_0's multi_logloss: 2.08862\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[2]\tvalid_0's multi_logloss: 2.103\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.11438\n",
" 准确率: 0.2988 ± 0.0035\n",
" F1分数: 0.0460 ± 0.0004\n",
"\n",
"🔧 测试组合 15/20:\n",
" 参数: {'learning_rate': np.float64(0.2), 'num_leaves': np.int64(31), 'max_depth': np.int64(6), 'feature_fraction': np.float64(0.6), 'bagging_fraction': np.float64(1.0), 'min_data_in_leaf': np.int64(20), 'lambda_l1': np.float64(0.0), 'lambda_l2': np.float64(0.1)}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.10926\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.08871\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.08299\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.14121\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.10462\n",
" 准确率: 0.2906 ± 0.0242\n",
" F1分数: 0.0563 ± 0.0152\n",
"\n",
"🔧 测试组合 16/20:\n",
" 参数: {'learning_rate': np.float64(0.01), 'num_leaves': np.int64(255), 'max_depth': np.int64(9), 'feature_fraction': np.float64(1.0), 'bagging_fraction': np.float64(1.0), 'min_data_in_leaf': np.int64(10), 'lambda_l1': np.float64(1.0), 'lambda_l2': np.float64(1.0)}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[13]\tvalid_0's multi_logloss: 2.09256\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[15]\tvalid_0's multi_logloss: 2.09021\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[18]\tvalid_0's multi_logloss: 2.09268\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.11506\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[5]\tvalid_0's multi_logloss: 2.11271\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" 准确率: 0.2988 ± 0.0035\n",
" F1分数: 0.0460 ± 0.0004\n",
"\n",
"🔧 测试组合 17/20:\n",
" 参数: {'learning_rate': np.float64(0.01), 'num_leaves': np.int64(63), 'max_depth': np.int64(-1), 'feature_fraction': np.float64(0.9), 'bagging_fraction': np.float64(0.8), 'min_data_in_leaf': np.int64(20), 'lambda_l1': np.float64(0.0), 'lambda_l2': np.float64(0.0)}\n",
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[26]\tvalid_0's multi_logloss: 2.08426\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[23]\tvalid_0's multi_logloss: 2.08892\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[23]\tvalid_0's multi_logloss: 2.08765\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[5]\tvalid_0's multi_logloss: 2.11129\n"
]
},
   {
    "name": "stdout",
    "output_type": "stream",
    "text": [
     "Training until validation scores don't improve for 10 rounds\n",
     "Early stopping, best iteration is:\n",
     "[14]\tvalid_0's multi_logloss: 2.09748\n",
     "   Accuracy: 0.2988 ± 0.0035\n",
     "   F1 score: 0.0460 ± 0.0004\n",
     "\n",
     "🔧 Testing combination 18/20:\n",
     "   Parameters: {'learning_rate': 0.05, 'num_leaves': 63, 'max_depth': 3, 'feature_fraction': 0.9, 'bagging_fraction': 0.9, 'min_data_in_leaf': 10, 'lambda_l1': 0.1, 'lambda_l2': 0.0}\n"
    ]
   },
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[6]\tvalid_0's multi_logloss: 2.08507\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[5]\tvalid_0's multi_logloss: 2.10854\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[3]\tvalid_0's multi_logloss: 2.07903\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.11723\n"
]
},
   {
    "name": "stdout",
    "output_type": "stream",
    "text": [
     "Training until validation scores don't improve for 10 rounds\n",
     "Early stopping, best iteration is:\n",
     "[1]\tvalid_0's multi_logloss: 2.08765\n",
     "   Accuracy: 0.3028 ± 0.0106\n",
     "   F1 score: 0.0549 ± 0.0180\n",
     "\n",
     "🔧 Testing combination 19/20:\n",
     "   Parameters: {'learning_rate': 0.1, 'num_leaves': 15, 'max_depth': 12, 'feature_fraction': 0.6, 'bagging_fraction': 1.0, 'min_data_in_leaf': 50, 'lambda_l1': 0.1, 'lambda_l2': 0.0}\n"
    ]
   },
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[5]\tvalid_0's multi_logloss: 2.08528\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[3]\tvalid_0's multi_logloss: 2.09118\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.10493\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[3]\tvalid_0's multi_logloss: 2.11337\n"
]
},
   {
    "name": "stdout",
    "output_type": "stream",
    "text": [
     "Training until validation scores don't improve for 10 rounds\n",
     "Early stopping, best iteration is:\n",
     "[4]\tvalid_0's multi_logloss: 2.0784\n",
     "   Accuracy: 0.2967 ± 0.0034\n",
     "   F1 score: 0.0458 ± 0.0003\n",
     "\n",
     "🔧 Testing combination 20/20:\n",
     "   Parameters: {'learning_rate': 0.2, 'num_leaves': 63, 'max_depth': -1, 'feature_fraction': 1.0, 'bagging_fraction': 0.6, 'min_data_in_leaf': 50, 'lambda_l1': 1.0, 'lambda_l2': 1.0}\n"
    ]
   },
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.06234\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.10061\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[4]\tvalid_0's multi_logloss: 2.10826\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training until validation scores don't improve for 10 rounds\n",
"Early stopping, best iteration is:\n",
"[1]\tvalid_0's multi_logloss: 2.13882\n"
]
},
   {
    "name": "stdout",
    "output_type": "stream",
    "text": [
     "Training until validation scores don't improve for 10 rounds\n",
     "Early stopping, best iteration is:\n",
     "[3]\tvalid_0's multi_logloss: 2.07953\n",
     "   Accuracy: 0.3008 ± 0.0041\n",
     "   F1 score: 0.0580 ± 0.0169\n",
     "\n",
     "🏆 Improved parameter search complete!\n",
     "================================================================================\n",
     "🎯 Best parameter combination:\n",
     "   learning_rate: 0.2\n",
     "   num_leaves: 31\n",
     "   max_depth: 12\n",
     "   feature_fraction: 0.8\n",
     "   bagging_fraction: 0.8\n",
     "   min_data_in_leaf: 10\n",
     "   lambda_l1: 0.0\n",
     "   lambda_l2: 0.0\n",
     "\n",
     "📈 Best performance:\n",
     "   accuracy: 0.2623 ± 0.0388\n",
     "   f1_macro: 0.1037 ± 0.0407\n",
     "   precision_macro: 0.1395 ± 0.0853\n",
     "   recall_macro: 0.1224 ± 0.0320\n",
     "\n",
     "💾 Improved parameter-search results saved:\n",
     "   Best-parameters variable: BEST_PARAMS_IMPROVED\n",
     "   All-results variable: PARAM_SEARCH_RESULTS_IMPROVED\n",
     "\n",
     "🔝 Top 5 parameter combinations (sorted by F1 score):\n",
     "   1. F1: 0.1037, accuracy: 0.2623\n",
     "      Parameters: {'learning_rate': 0.2, 'num_leaves': 31, 'max_depth': 12, 'feature_fraction': 0.8, 'bagging_fraction': 0.8, 'min_data_in_leaf': 10, 'lambda_l1': 0.0, 'lambda_l2': 0.0}\n",
     "   2. F1: 0.0830, accuracy: 0.2825\n",
     "      Parameters: {'learning_rate': 0.15, 'num_leaves': 255, 'max_depth': 9, 'feature_fraction': 1.0, 'bagging_fraction': 0.6, 'min_data_in_leaf': 5, 'lambda_l1': 1.0, 'lambda_l2': 0.1}\n",
     "   3. F1: 0.0699, accuracy: 0.2825\n",
     "      Parameters: {'learning_rate': 0.15, 'num_leaves': 127, 'max_depth': 3, 'feature_fraction': 0.8, 'bagging_fraction': 1.0, 'min_data_in_leaf': 5, 'lambda_l1': 0.0, 'lambda_l2': 0.1}\n",
     "   4. F1: 0.0588, accuracy: 0.3028\n",
     "      Parameters: {'learning_rate': 0.15, 'num_leaves': 255, 'max_depth': 9, 'feature_fraction': 0.8, 'bagging_fraction': 0.6, 'min_data_in_leaf': 50, 'lambda_l1': 0.1, 'lambda_l2': 0.1}\n",
     "   5. F1: 0.0585, accuracy: 0.2967\n",
     "      Parameters: {'learning_rate': 0.2, 'num_leaves': 255, 'max_depth': 3, 'feature_fraction': 0.6, 'bagging_fraction': 0.9, 'min_data_in_leaf': 5, 'lambda_l1': 0.0, 'lambda_l2': 1.0}\n",
     "\n",
     "✅ Improved parameter search complete! Performance differences across parameter combinations should now be visible\n"
    ]
   }
],
"source": [
"print(\"🔍 改进版参数搜索 - 扩大搜索空间和复杂度...\")\n",
"print(\"=\" * 80)\n",
"\n",
"import itertools\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score\n",
"import lightgbm as lgb\n",
"import numpy as np\n",
"import os\n",
"\n",
"# 改进的参数搜索配置\n",
"IMPROVED_PARAM_CONFIG = {\n",
" 'cv_folds': 5, # 增加到5折交叉验证\n",
" 'max_combinations': 20, # 增加参数组合数\n",
" 'max_samples': 2000, # 增加样本数\n",
" 'num_boost_round': 100, # 增加训练轮数\n",
" 'early_stopping_rounds': 10\n",
"}\n",
"\n",
"# 扩大参数搜索空间\n",
"param_grid = {\n",
" 'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2], # 更大范围的学习率\n",
" 'num_leaves': [15, 31, 63, 127, 255], # 更大范围的叶子数\n",
" 'max_depth': [3, 6, 9, 12, -1], # 包括无限深度\n",
" 'feature_fraction': [0.6, 0.8, 0.9, 1.0], # 特征采样比例\n",
" 'bagging_fraction': [0.6, 0.8, 0.9, 1.0], # 数据采样比例\n",
" 'min_data_in_leaf': [5, 10, 20, 50], # 叶子节点最小样本数\n",
" 'lambda_l1': [0, 0.1, 1.0], # L1正则化\n",
" 'lambda_l2': [0, 0.1, 1.0], # L2正则化\n",
"}\n",
"\n",
"print(f\"📊 改进的参数搜索配置:\")\n",
"print(f\" 交叉验证折数: {IMPROVED_PARAM_CONFIG['cv_folds']}\")\n",
"print(f\" 最大参数组合: {IMPROVED_PARAM_CONFIG['max_combinations']}\")\n",
"print(f\" 最大样本数: {IMPROVED_PARAM_CONFIG['max_samples']}\")\n",
"print(f\" 训练轮数: {IMPROVED_PARAM_CONFIG['num_boost_round']}\")\n",
"print(f\" 参数空间大小: {np.prod([len(v) for v in param_grid.values()])} 种组合\")\n",
"\n",
"# 获取更多数据\n",
"print(f\"\\n📦 准备更大的数据集...\")\n",
"\n",
"data_dir = r'f:\\BRAIN-TO-TEXT\\nejm-brain-to-text\\data\\concatenated_data'\n",
"val_files = [f for f in os.listdir(data_dir) if f.endswith('_val_concatenated.npz')]\n",
"\n",
"# 使用前3个文件\n",
"sample_files = val_files[:3]\n",
"print(f\" 使用文件数: {len(sample_files)} / {len(val_files)}\")\n",
"\n",
"# 加载更多数据\n",
"all_features = []\n",
"all_labels = []\n",
"\n",
"for file in sample_files:\n",
" file_path = os.path.join(data_dir, file)\n",
" try:\n",
" data = np.load(file_path, allow_pickle=True)\n",
" neural_logits = data['neural_logits_concatenated']\n",
" seq_class_ids = data['seq_class_ids']\n",
" \n",
" print(f\" 处理文件: {file}\")\n",
" \n",
" for i, logits in enumerate(neural_logits):\n",
" if logits is not None and hasattr(logits, 'shape') and len(logits.shape) > 0:\n",
" labels = seq_class_ids[i]\n",
" \n",
" # 取更多的时间步\n",
" max_len = min(len(logits), len(labels), 100) # 增加到100个时间步\n",
" \n",
" for j in range(max_len):\n",
" if labels[j] != 0: # 跳过padding标签\n",
" all_features.append(logits[j].flatten())\n",
" all_labels.append(labels[j])\n",
" \n",
" data.close()\n",
" print(f\" 累计样本数: {len(all_features)}\")\n",
" \n",
" except Exception as e:\n",
" print(f\" ⚠️ 加载失败: {file} - {e}\")\n",
"\n",
"if len(all_features) == 0:\n",
" raise ValueError(\"没有找到有效的特征数据\")\n",
"\n",
"# 转换为numpy数组\n",
"sample_X = np.array(all_features)\n",
"sample_y = np.array(all_labels)\n",
"\n",
"# 随机采样\n",
"if len(sample_X) > IMPROVED_PARAM_CONFIG['max_samples']:\n",
" indices = np.random.choice(len(sample_X), IMPROVED_PARAM_CONFIG['max_samples'], replace=False)\n",
" sample_X = sample_X[indices]\n",
" sample_y = sample_y[indices]\n",
"\n",
"# 应用PCA变换\n",
"if GLOBAL_PCA['is_fitted']:\n",
" sample_X = apply_pca_transform(sample_X)\n",
"\n",
"print(f\" 最终样本数量: {len(sample_X)}\")\n",
"print(f\" 特征维度: {sample_X.shape[1]}\")\n",
"\n",
"# 创建多分类任务保留前10个类别\n",
"valid_labels = []\n",
"for label in np.unique(sample_y):\n",
" if np.sum(sample_y == label) >= 20: # 每个类别至少20个样本\n",
" valid_labels.append(label)\n",
"\n",
"valid_labels = sorted(valid_labels)[:10] # 取前10个有效标签\n",
"print(f\" 有效标签: {valid_labels}\")\n",
"\n",
"# 过滤数据,只保留有效标签\n",
"mask = np.isin(sample_y, valid_labels)\n",
"sample_X = sample_X[mask]\n",
"sample_y = sample_y[mask]\n",
"\n",
"# 重新映射标签到0-9\n",
"label_mapping = {old_label: new_label for new_label, old_label in enumerate(valid_labels)}\n",
"sample_y_mapped = np.array([label_mapping[label] for label in sample_y])\n",
"\n",
"print(f\" 过滤后样本数量: {len(sample_X)}\")\n",
"print(f\" 类别分布: {np.bincount(sample_y_mapped)}\")\n",
"\n",
"# 随机参数搜索\n",
"print(f\"\\n🎯 生成随机参数组合...\")\n",
"np.random.seed(42)\n",
"\n",
"def random_params():\n",
" \"\"\"生成随机参数组合\"\"\"\n",
" return {\n",
" 'learning_rate': np.random.choice(param_grid['learning_rate']),\n",
" 'num_leaves': np.random.choice(param_grid['num_leaves']),\n",
" 'max_depth': np.random.choice(param_grid['max_depth']),\n",
" 'feature_fraction': np.random.choice(param_grid['feature_fraction']),\n",
" 'bagging_fraction': np.random.choice(param_grid['bagging_fraction']),\n",
" 'min_data_in_leaf': np.random.choice(param_grid['min_data_in_leaf']),\n",
" 'lambda_l1': np.random.choice(param_grid['lambda_l1']),\n",
" 'lambda_l2': np.random.choice(param_grid['lambda_l2'])\n",
" }\n",
"\n",
"selected_combinations = [random_params() for _ in range(IMPROVED_PARAM_CONFIG['max_combinations'])]\n",
"print(f\" 生成了 {len(selected_combinations)} 个随机参数组合\")\n",
"\n",
"# 改进的参数评估函数\n",
"def evaluate_params_improved(params_dict, X, y, cv_folds=5):\n",
" \"\"\"改进的参数评估函数,返回多个指标\"\"\"\n",
" try:\n",
" skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)\n",
" scores = {\n",
" 'accuracy': [],\n",
" 'f1_macro': [],\n",
" 'precision_macro': [],\n",
" 'recall_macro': []\n",
" }\n",
" \n",
" for train_idx, val_idx in skf.split(X, y):\n",
" X_train_fold, X_val_fold = X[train_idx], X[val_idx]\n",
" y_train_fold, y_val_fold = y[train_idx], y[val_idx]\n",
" \n",
" # 创建LightGBM数据集\n",
" train_data = lgb.Dataset(X_train_fold, label=y_train_fold)\n",
" val_data = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data)\n",
" \n",
" # 训练模型\n",
" model = lgb.train(\n",
" params_dict,\n",
" train_data,\n",
" valid_sets=[val_data],\n",
" num_boost_round=IMPROVED_PARAM_CONFIG['num_boost_round'],\n",
" callbacks=[\n",
" lgb.early_stopping(IMPROVED_PARAM_CONFIG['early_stopping_rounds']), \n",
" lgb.log_evaluation(0)\n",
" ]\n",
" )\n",
" \n",
" # 预测和评估\n",
" y_pred = model.predict(X_val_fold)\n",
" if len(np.unique(y)) > 2: # 多分类\n",
" y_pred_class = np.argmax(y_pred, axis=1)\n",
" else: # 二分类\n",
" y_pred_class = (y_pred > 0.5).astype(int)\n",
" \n",
" # 计算多个指标\n",
" scores['accuracy'].append(accuracy_score(y_val_fold, y_pred_class))\n",
" scores['f1_macro'].append(f1_score(y_val_fold, y_pred_class, average='macro'))\n",
" scores['precision_macro'].append(precision_score(y_val_fold, y_pred_class, average='macro'))\n",
" scores['recall_macro'].append(recall_score(y_val_fold, y_pred_class, average='macro'))\n",
" \n",
" # 计算平均值和标准差\n",
" result = {}\n",
" for metric, values in scores.items():\n",
" result[metric] = {\n",
" 'mean': np.mean(values),\n",
" 'std': np.std(values)\n",
" }\n",
" \n",
" return result\n",
" \n",
" except Exception as e:\n",
" print(f\" ⚠️ 参数组合评估失败: {e}\")\n",
" return {\n",
" 'accuracy': {'mean': 0.0, 'std': 1.0},\n",
" 'f1_macro': {'mean': 0.0, 'std': 1.0},\n",
" 'precision_macro': {'mean': 0.0, 'std': 1.0},\n",
" 'recall_macro': {'mean': 0.0, 'std': 1.0}\n",
" }\n",
"\n",
"# 开始改进的参数搜索\n",
"print(f\"\\n🚀 开始改进的参数搜索...\")\n",
"best_score = 0\n",
"best_params = None\n",
"results = []\n",
"\n",
"for i, params in enumerate(selected_combinations):\n",
" # 添加固定参数\n",
" params_dict = params.copy()\n",
" params_dict.update({\n",
" 'objective': 'multiclass',\n",
" 'num_class': len(np.unique(sample_y_mapped)),\n",
" 'metric': 'multi_logloss',\n",
" 'boosting_type': 'gbdt',\n",
" 'verbosity': -1,\n",
" 'seed': 42\n",
" })\n",
" \n",
" print(f\"\\n🔧 测试组合 {i+1}/{len(selected_combinations)}:\")\n",
" key_params = {k: v for k, v in params.items()}\n",
" print(f\" 参数: {key_params}\")\n",
" \n",
" # 评估参数\n",
" metrics = evaluate_params_improved(params_dict, sample_X, sample_y_mapped, IMPROVED_PARAM_CONFIG['cv_folds'])\n",
" \n",
" result = {\n",
" 'params': params.copy(),\n",
" 'metrics': metrics\n",
" }\n",
" results.append(result)\n",
" \n",
" accuracy = metrics['accuracy']['mean']\n",
" f1 = metrics['f1_macro']['mean']\n",
" \n",
" print(f\" 准确率: {accuracy:.4f} ± {metrics['accuracy']['std']:.4f}\")\n",
" print(f\" F1分数: {f1:.4f} ± {metrics['f1_macro']['std']:.4f}\")\n",
" \n",
" # 使用F1分数作为主要评估指标\n",
" if f1 > best_score:\n",
" best_score = f1\n",
" best_params = params.copy()\n",
" print(f\" ✨ 新的最佳参数F1分数: {f1:.4f}\")\n",
"\n",
"print(f\"\\n🏆 改进的参数搜索完成!\")\n",
"print(f\"=\" * 80)\n",
"print(f\"🎯 最佳参数组合:\")\n",
"for key, value in best_params.items():\n",
" print(f\" {key}: {value}\")\n",
"\n",
"print(f\"\\n📈 最佳性能:\")\n",
"best_result = max(results, key=lambda x: x['metrics']['f1_macro']['mean'])\n",
"for metric, values in best_result['metrics'].items():\n",
" print(f\" {metric}: {values['mean']:.4f} ± {values['std']:.4f}\")\n",
"\n",
"# 保存结果\n",
"BEST_PARAMS_IMPROVED = best_params\n",
"PARAM_SEARCH_RESULTS_IMPROVED = results\n",
"\n",
"print(f\"\\n💾 改进的参数搜索结果已保存:\")\n",
"print(f\" 最佳参数变量: BEST_PARAMS_IMPROVED\")\n",
"print(f\" 所有结果变量: PARAM_SEARCH_RESULTS_IMPROVED\")\n",
"\n",
"# 显示前5个最佳结果\n",
"print(f\"\\n🔝 Top 5 参数组合 (按F1分数排序):\")\n",
"sorted_results = sorted(results, key=lambda x: x['metrics']['f1_macro']['mean'], reverse=True)\n",
"for i, result in enumerate(sorted_results[:5]):\n",
" f1 = result['metrics']['f1_macro']['mean']\n",
" accuracy = result['metrics']['accuracy']['mean']\n",
" print(f\" {i+1}. F1: {f1:.4f}, 准确率: {accuracy:.4f}\")\n",
" print(f\" 参数: {result['params']}\")\n",
"\n",
"print(f\"\\n✅ 改进的参数搜索完成!现在应该能看到不同参数组合的性能差异\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"OPTIMIZED_PARAMS = {\n",
" 'learning_rate': 0.2, # 较高学习率\n",
" 'num_leaves': 31, # 适中的叶子数\n",
" 'max_depth': 12, # 较深的树\n",
" 'feature_fraction': 0.8, # 特征采样\n",
" 'bagging_fraction': 0.8, # 数据采样\n",
" 'min_data_in_leaf': 10, # 较小的叶子节点样本数\n",
" 'lambda_l1': 0.0, # 无L1正则化\n",
" 'lambda_l2': 0.0, # 无L2正则化\n",
" 'objective': 'multiclass',\n",
" 'boosting_type': 'gbdt',\n",
" 'verbosity': -1,\n",
" 'seed': 42\n",
"}"
]
}
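  ,
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The winning combination from the search above is frozen into `OPTIMIZED_PARAMS` for reuse. The cell below is a minimal usage sketch, assuming `sample_X` and `sample_y_mapped` from the parameter-search cell are still in memory; `num_class` and `metric` are filled in at train time because `OPTIMIZED_PARAMS` deliberately omits them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Minimal sketch: fit a final LightGBM model with the searched parameters.\n",
    "# Assumes sample_X / sample_y_mapped from the parameter-search cell above.\n",
    "final_params = dict(OPTIMIZED_PARAMS)\n",
    "final_params.update({\n",
    "    'num_class': len(np.unique(sample_y_mapped)),  # required for 'multiclass'\n",
    "    'metric': 'multi_logloss'\n",
    "})\n",
    "\n",
    "final_train_set = lgb.Dataset(sample_X, label=sample_y_mapped)\n",
    "final_model = lgb.train(final_params, final_train_set, num_boost_round=100)\n",
    "\n",
    "# Multiclass predict() returns per-class probabilities; argmax gives class ids\n",
    "final_pred = np.argmax(final_model.predict(sample_X), axis=1)\n",
    "print(f\"Training-set accuracy (sanity check only): {(final_pred == sample_y_mapped).mean():.4f}\")"
   ]
  }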
],
"metadata": {
"kaggle": {
"accelerator": "tpu1vmV38",
"dataSources": [
{
"databundleVersionId": 13056355,
"sourceId": 106809,
"sourceType": "competition"
}
],
"dockerImageVersionId": 31091,
"isGpuEnabled": false,
"isInternetEnabled": true,
"language": "python",
"sourceType": "notebook"
},
"kernelspec": {
"display_name": "b2txt25",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.18"
}
},
"nbformat": 4,
"nbformat_minor": 4
}