小参数

This commit is contained in:
Zchen
2025-10-16 09:22:25 +08:00
parent 25561a7615
commit df4a914bbd
2 changed files with 206 additions and 4 deletions

View File

@@ -2,16 +2,218 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "acb1482e",
"metadata": {},
"outputs": [],
"source": [
"# I just want to check TPU utilization"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "b317eff3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[?1h\u001b=\u001b[H\u001b[2J\u001b[mtop - 17:43:08 up 29 min, 0 user, load average: 2.18, 2.22, 2.19\u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n",
"Tasks:\u001b[m\u001b[m\u001b[1m 10 \u001b[m\u001b[mtotal,\u001b[m\u001b[m\u001b[1m 1 \u001b[m\u001b[mrunning,\u001b[m\u001b[m\u001b[1m 9 \u001b[m\u001b[msleeping,\u001b[m\u001b[m\u001b[1m 0 \u001b[m\u001b[mstopped,\u001b[m\u001b[m\u001b[1m 0 \u001b[m\u001b[mzombie\u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n",
"%Cpu(s):\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mus,\u001b[m\u001b[m\u001b[1m 0.4 \u001b[m\u001b[msy,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mni,\u001b[m\u001b[m\u001b[1m 98.9 \u001b[m\u001b[mid,\u001b[m\u001b[m\u001b[1m 0.7 \u001b[m\u001b[mwa,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mhi,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[msi,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mst\u001b[m\u001b[m\u001b[m \u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n",
"MiB Mem :\u001b[m\u001b[m\u001b[1m 386908.8 \u001b[m\u001b[mtotal,\u001b[m\u001b[m\u001b[1m 292516.1 \u001b[m\u001b[mfree,\u001b[m\u001b[m\u001b[1m 61605.9 \u001b[m\u001b[mused,\u001b[m\u001b[m\u001b[1m 35359.9 \u001b[m\u001b[mbuff/cache\u001b[m\u001b[m\u001b[m \u001b[m\u001b[m\u001b[m \u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n",
"MiB Swap:\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mtotal,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mfree,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mused.\u001b[m\u001b[m\u001b[1m 325302.9 \u001b[m\u001b[mavail Mem \u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n",
"\u001b[K\n",
"\u001b[7m PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND \u001b[m\u001b[m\u001b[K\n",
"\u001b[m 310 root 20 0 175.4g 54.5g 371368 S 13.3 14.4 8:45.06 python \u001b[m\u001b[m\u001b[K\n",
"\u001b[m 1 root 20 0 400128 98296 18312 S 0.0 0.0 0:08.66 jupyter+ \u001b[m\u001b[m\u001b[K\n",
"\u001b[m 13 root 20 0 910476 61572 15620 S 0.0 0.0 0:10.47 python \u001b[m\u001b[m\u001b[K\n",
"\u001b[m 30 root 20 0 5685268 183828 45432 S 0.0 0.0 0:16.19 python \u001b[m\u001b[m\u001b[K\n",
"\u001b[m 164 root 20 0 29448 25612 8688 S 0.0 0.0 0:00.11 python \u001b[m\u001b[m\u001b[K\n",
"\u001b[m 309 root 20 0 2576 904 812 S 0.0 0.0 0:00.00 sh \u001b[m\u001b[m\u001b[K\n",
"\u001b[m 2175 root 20 0 26072 22068 8836 S 0.0 0.0 0:00.06 python \u001b[m\u001b[m\u001b[K\n",
"\u001b[m 2209 root 20 0 755688 68480 15688 S 0.0 0.0 0:00.64 python \u001b[m\u001b[m\u001b[K\n",
"\u001b[m 2241 root 20 0 2576 956 856 S 0.0 0.0 0:00.00 sh \u001b[m\u001b[m\u001b[K\n",
"\u001b[m\u001b[1m 2242 root 20 0 9180 5080 2912 R 0.0 0.0 0:00.01 top \u001b[m\u001b[m\u001b[K\u001b[18;1H\u001b[K\u001b[19;1H\u001b[K\u001b[20;1H\u001b[K\u001b[21;1H\u001b[K\u001b[22;1H\u001b[K\u001b[23;1H\u001b[K\u001b[24;1H\u001b[K\u001b[H\u001b[mtop - 17:43:11 up 30 min, 0 user, load average: 2.18, 2.22, 2.19\u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n",
"\n",
"%Cpu(s):\u001b[m\u001b[m\u001b[1m 0.3 \u001b[m\u001b[mus,\u001b[m\u001b[m\u001b[1m 0.2 \u001b[m\u001b[msy,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mni,\u001b[m\u001b[m\u001b[1m 99.2 \u001b[m\u001b[mid,\u001b[m\u001b[m\u001b[1m 0.4 \u001b[m\u001b[mwa,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mhi,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[msi,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mst\u001b[m\u001b[m\u001b[m \u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n",
"MiB Mem :\u001b[m\u001b[m\u001b[1m 386908.8 \u001b[m\u001b[mtotal,\u001b[m\u001b[m\u001b[1m 292395.0 \u001b[m\u001b[mfree,\u001b[m\u001b[m\u001b[1m 61724.6 \u001b[m\u001b[mused,\u001b[m\u001b[m\u001b[1m 35362.1 \u001b[m\u001b[mbuff/cache\u001b[m\u001b[m\u001b[m \u001b[m\u001b[m\u001b[m \u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n",
"MiB Swap:\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mtotal,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mfree,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mused.\u001b[m\u001b[m\u001b[1m 325184.2 \u001b[m\u001b[mavail Mem \u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n",
"\u001b[K\n",
"\n",
"\u001b[m 310 root 20 0 175.5g 54.6g 371368 S 14.3 14.5 8:45.49 python \u001b[m\u001b[m\u001b[K\n",
"\u001b[m 13 root 20 0 910476 61572 15620 S 1.0 0.0 0:10.50 python \u001b[m\u001b[m\u001b[K\n",
"\u001b[m 2209 root 20 0 755688 68480 15688 S 1.0 0.0 0:00.67 python \u001b[m\u001b[m\u001b[K\n",
"\u001b[m 1 root 20 0 400128 98296 18312 S 0.7 0.0 0:08.68 jupyter+ \u001b[m\u001b[m\u001b[K\n",
"\u001b[m 30 root 20 0 5685268 183828 45432 S 0.0 0.0 0:16.19 python \u001b[m\u001b[m\u001b[K\n",
"\u001b[m 164 root 20 0 29448 25612 8688 S 0.0 0.0 0:00.11 python \u001b[m\u001b[m\u001b[K\n",
"\u001b[m 309 root 20 0 2576 904 812 S 0.0 0.0 0:00.00 sh \u001b[m\u001b[m\u001b[K\n",
"\u001b[m 2175 root 20 0 26072 22068 8836 S 0.0 0.0 0:00.06 python \u001b[m\u001b[m\u001b[K\n",
"\n",
"\u001b[18;1H\u001b[K\u001b[19;1H\u001b[K\u001b[20;1H\u001b[K\u001b[21;1H\u001b[K\u001b[22;1H\u001b[K\u001b[23;1H\u001b[K\u001b[24;1H\u001b[K\u001b[H\u001b[mtop - 17:43:14 up 30 min, 0 user, load average: 2.17, 2.21, 2.19\u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n",
"\n",
"%Cpu(s):\u001b[m\u001b[m\u001b[1m 0.2 \u001b[m\u001b[mus,\u001b[m\u001b[m\u001b[1m 0.1 \u001b[m\u001b[msy,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mni,\u001b[m\u001b[m\u001b[1m 99.3 \u001b[m\u001b[mid,\u001b[m\u001b[m\u001b[1m 0.4 \u001b[m\u001b[mwa,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mhi,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[msi,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mst\u001b[m\u001b[m\u001b[m \u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n",
"MiB Mem :\u001b[m\u001b[m\u001b[1m 386908.8 \u001b[m\u001b[mtotal,\u001b[m\u001b[m\u001b[1m 292220.6 \u001b[m\u001b[mfree,\u001b[m\u001b[m\u001b[1m 61896.9 \u001b[m\u001b[mused,\u001b[m\u001b[m\u001b[1m 35364.2 \u001b[m\u001b[mbuff/cache\u001b[m\u001b[m\u001b[m \u001b[m\u001b[m\u001b[m \u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n",
"MiB Swap:\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mtotal,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mfree,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mused.\u001b[m\u001b[m\u001b[1m 325011.8 \u001b[m\u001b[mavail Mem \u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n",
"\u001b[K\n",
"\n",
"\u001b[m 310 root 20 0 175.6g 54.8g 371368 S 17.7 14.5 8:46.02 python \u001b[m\u001b[m\u001b[K\n",
"\u001b[m 13 root 20 0 910476 61572 15620 S 1.0 0.0 0:10.53 python \u001b[m\u001b[m\u001b[K\n",
"\u001b[m 2209 root 20 0 755688 68480 15688 S 1.0 0.0 0:00.70 python \u001b[m\u001b[m\u001b[K\n",
"\u001b[m 1 root 20 0 400128 98296 18312 S 0.3 0.0 0:08.69 jupyter+ \u001b[m\u001b[m\u001b[K\n",
"\u001b[m\u001b[1m 2242 root 20 0 9180 5080 2912 R 0.3 0.0 0:00.02 top \u001b[m\u001b[m\u001b[K\n",
"\u001b[m 30 root 20 0 5685268 183828 45432 S 0.0 0.0 0:16.19 python \u001b[m\u001b[m\u001b[K\n",
"\u001b[m 164 root 20 0 29448 25612 8688 S 0.0 0.0 0:00.11 python \u001b[m\u001b[m\u001b[K\n",
"\u001b[m 309 root 20 0 2576 904 812 S 0.0 0.0 0:00.00 sh \u001b[m\u001b[m\u001b[K\n",
"\u001b[m 2175 root 20 0 26072 22068 8836 S 0.0 0.0 0:00.06 python \u001b[m\u001b[m\u001b[K\n",
"\u001b[m 2241 root 20 0 2576 956 856 S 0.0 0.0 0:00.00 sh \u001b[m\u001b[m\u001b[K\u001b[18;1H\u001b[K\u001b[19;1H\u001b[K\u001b[20;1H\u001b[K\u001b[21;1H\u001b[K\u001b[22;1H\u001b[K\u001b[23;1H\u001b[K\u001b[24;1H\u001b[K\u001b[?1l\u001b>\u001b[25;1H\n",
"\u001b[K"
]
}
],
"source": [
"# Batch mode (-b) with a single iteration (-n 1): prints one plain-text snapshot\n",
"# and exits, instead of an interactive screen full of terminal escape codes.\n",
"!top -b -n 1"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "1eee541b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root 309 0.0 0.0 2576 904 pts/0 Ss+ 17:24 0:00 /usr/bin/sh -c cd /kaggle/working/b2txt25/model_training_nnn_tpu && python train_model_tf.py --config_path rnn_args.yaml\n",
"root 2268 0.0 0.0 2576 940 pts/1 Ss+ 17:44 0:00 /usr/bin/sh -c ps aux | grep -i tpu\n",
"root 2270 0.0 0.0 3744 2024 pts/1 S+ 17:44 0:00 grep -i tpu\n"
]
}
],
"source": [
"# The [t] bracket keeps grep and its sh wrapper from matching their own command line.\n",
"!ps aux | grep -i '[t]pu'"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "2f03ffe1",
"metadata": {},
"outputs": [],
"source": [
"# pgrep patterns are extended regexes: alternation is a bare | (a backslash-escaped\n",
"# pipe matches a literal '|' character, so the old pattern never matched anything).\n",
"# [p]ython keeps the pattern from matching the sh wrapper that carries it as an argument.\n",
"!pgrep -fl \"[p]ython.*(tensorflow|train)\""
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "ffbc7471",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=== TPU状态检查 ===\n",
"时间: 2025-10-15 17:46:08\n",
"❌ TPU检查失败: open(/dev/vfio/5): Device or resource busy: Device or resource busy; Couldn't open iommu group /dev/vfio/5\n",
"💻 CPU使用率: 0.4%\n",
"💾 内存使用: 18.9% (68GB/377GB)\n",
"🐍 Python进程: 6个\n",
" PID:13 CPU:0.0% MEM:0.0%\n",
" PID:30 CPU:0.0% MEM:0.0%\n",
" PID:164 CPU:0.0% MEM:0.0%\n",
"🧪 TPU连接测试...\n",
"❌ TPU测试失败: name 'tpu_devices' is not defined\n",
"=== 检查完成 ===\n"
]
}
],
"source": [
"import tensorflow as tf\n",
"import psutil\n",
"import os\n",
"import time\n",
"\n",
"# One-shot status report: TPU devices, host CPU/memory, Python processes, and a\n",
"# tiny matmul on the TPU. Every section is best-effort and reports its own\n",
"# failure without stopping the sections that follow.\n",
"print(\"=== TPU状态检查 ===\")\n",
"print(f\"时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\")\n",
"\n",
"# TPU device check.\n",
"# Bind tpu_devices before the try so the connectivity test further down still\n",
"# sees a defined (empty) list when device enumeration raises (e.g. device busy),\n",
"# instead of failing with a NameError.\n",
"tpu_devices = []\n",
"try:\n",
"    tpu_devices = tf.config.list_logical_devices('TPU')\n",
"    print(f\"✅ TPU设备: {len(tpu_devices)}个\")\n",
"    for i, device in enumerate(tpu_devices):\n",
"        print(f\" TPU:{i} -> {device.name}\")\n",
"except Exception as e:\n",
"    print(f\"❌ TPU检查失败: {e}\")\n",
"\n",
"# Host resources.\n",
"try:\n",
"    # interval=1 blocks for one second to measure CPU usage over that window.\n",
"    cpu_percent = psutil.cpu_percent(interval=1)\n",
"    memory = psutil.virtual_memory()\n",
"    print(f\"💻 CPU使用率: {cpu_percent:.1f}%\")\n",
"    print(f\"💾 内存使用: {memory.percent:.1f}% ({memory.used//1024//1024//1024}GB/{memory.total//1024//1024//1024}GB)\")\n",
"except Exception as e:\n",
"    print(f\"❌ 系统资源检查失败: {e}\")\n",
"\n",
"# Python process check.\n",
"try:\n",
"    python_processes = []\n",
"    for proc in psutil.process_iter(['pid', 'name', 'cpu_percent', 'memory_percent']):\n",
"        # An info value is None when psutil could not read that attribute;\n",
"        # treat a missing name as a non-match rather than aborting the scan.\n",
"        if 'python' in (proc.info['name'] or '').lower():\n",
"            python_processes.append(proc.info)\n",
"\n",
"    print(f\"🐍 Python进程: {len(python_processes)}个\")\n",
"    for proc in python_processes[:3]:  # show only the first 3\n",
"        # Fall back to 0.0 so a None reading cannot break the :.1f formatting.\n",
"        print(f\" PID:{proc['pid']} CPU:{(proc['cpu_percent'] or 0.0):.1f}% MEM:{(proc['memory_percent'] or 0.0):.1f}%\")\n",
"except Exception as e:\n",
"    print(f\"❌ 进程检查失败: {e}\")\n",
"\n",
"# Minimal TPU smoke test: a 1x1 matmul placed on the first TPU device.\n",
"try:\n",
"    print(\"🧪 TPU连接测试...\")\n",
"    if tpu_devices:\n",
"        with tf.device('/TPU:0'):\n",
"            x = tf.constant([[1.0]])\n",
"            result = tf.matmul(x, x)\n",
"        print(f\"✅ TPU响应正常: {result.numpy()}\")\n",
"    else:\n",
"        print(\"⚠️ 没有TPU设备可测试\")\n",
"except Exception as e:\n",
"    print(f\"❌ TPU测试失败: {e}\")\n",
"\n",
"print(\"=== 检查完成 ===\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2e157ff0",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python"
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,

View File

@@ -1,9 +1,9 @@
model:
n_input_features: 512 # number of input features in the neural data. (2 features per electrode, 256 electrodes)
n_units: 768 # number of units per GRU layer
n_units: 256 # number of units per GRU layer (sharply reduced from 768 to 256, cutting the parameter count by about 70%)
rnn_dropout: 0.4 # dropout rate for the GRU layers
rnn_trainable: true # whether the GRU layers are trainable
n_layers: 5 # number of GRU layers
n_layers: 3 # number of GRU layers (reduced from 5 to 3)
patch_size: 14 # size of the input patches (14 time steps)
patch_stride: 4 # stride for the input patches (4 time steps)