1742 lines
361 KiB
Plaintext
1742 lines
361 KiB
Plaintext
![]() |
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 13,
|
|||
|
"id": "0d316ehisw2",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"=== 验证PyTorch unfold行为 ===\n",
|
|||
|
"原始张量形状: torch.Size([1, 1, 1, 30])\n",
|
|||
|
"原始张量内容: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0]\n",
|
|||
|
"Unfold后形状: torch.Size([1, 1, 1, 5, 14])\n",
|
|||
|
"输出时间步数: 5\n",
|
|||
|
"\n",
|
|||
|
"每个输出时间步对应的原始时间步:\n",
|
|||
|
"输出 0: 原始 [ 0:14] -> [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0]\n",
|
|||
|
"输出 1: 原始 [ 4:18] -> [4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0]\n",
|
|||
|
"输出 2: 原始 [ 8:22] -> [8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0]\n",
|
|||
|
"输出 3: 原始 [12:26] -> [12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0]\n",
|
|||
|
"输出 4: 原始 [16:30] -> [16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0]\n",
|
|||
|
"\n",
|
|||
|
"=== 验证转换函数 ===\n",
|
|||
|
"测试案例:\n",
|
|||
|
"输出范围 中心位置 保守范围 可能范围 实际长度(ms) \n",
|
|||
|
"--------------------------------------------------------------------------------\n",
|
|||
|
"0-0 6.5-6.5 0-13 4-8 80\n",
|
|||
|
"1-1 10.5-10.5 4-17 8-12 80\n",
|
|||
|
"0-1 6.5-10.5 0-17 4-12 160\n",
|
|||
|
"206-207 830.5-834.5 824-841 828-836 160\n",
|
|||
|
"5-10 26.5-46.5 20-53 24-48 480\n",
|
|||
|
"\n",
|
|||
|
"=== 实际数据验证 ===\n",
|
|||
|
"转换结果统计 (基于 4000 个segments):\n",
|
|||
|
"输出时间步长度: 平均 1.6, 中位数 2.0\n",
|
|||
|
"简单映射长度: 平均 3.4, 中位数 5.0\n",
|
|||
|
"保守映射长度: 平均 16.4, 中位数 18.0\n",
|
|||
|
"可能映射长度: 平均 7.4, 中位数 9.0\n",
|
|||
|
"\n",
|
|||
|
"原始时间步 vs 输出时间步的比例:\n",
|
|||
|
"简单映射: 平均比例 1.8x\n",
|
|||
|
"保守映射: 平均比例 11.3x\n",
|
|||
|
"可能映射: 平均比例 4.7x\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABdEAAAPdCAYAAABlRyFLAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjEsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvc2/+5QAAAAlwSFlzAAAPYQAAD2EBqD+naQABAABJREFUeJzs3Xl4TNf/B/D3bNlFEhFrSKp2tYegtlprCVG1lCIV+04pal8Su9ZeS4XWUrS+WlrUWkXVGktCiygimkVWSSaznN8f88ttxiSRkGQk8349zzzmnrudz2TMvfdzzz1HJoQQICIiIiIiIiIiIiIiE3JzV4CIiIiIiIiIiIiI6E3FJDoRERERERERERERURaYRCciIiIiIiIiIiIiygKT6EREREREREREREREWWASnYiIiIiIiIiIiIgoC0yiExERERERERERERFlgUl0IiIiIiIiIiIiIqIsMIlORERERERERERERJQFJtGJKM9otVrExcVBr9ebuypERESvRaPRmLsKRERERZJWqzV3FYoUnU4HIYS5q0FU5DGJThYhLi4OsbGxeP78OVJTU3P0SkhIQFJSEgAgLS0tx+ulv9LS0nJVx88//xxLly596XK3b9/G2LFjERYWluUy//vf/7Bt27Yc7ffOnTvYt29fjusZHh6OuLi4TOdduHABzs7O+Pvvv1+6ncjISDx79sykPDo6Gg8fPsxRXVJTU3Hs2DFER0ebzIuPj891Mv/p06d499138ccff+RqPQB4/Pix0b5Xr14NtVqd6+30798fEydOzPV6RESUe/v374eHh4fJ7/WxY8fg4eGBBw8eAADUajWSk5Nf+6I/NDQUO3bsyNU6ISEh+PPPPzOdFxsbm+s6fPnll/joo49yvV5GFy5cwIYNG3K8/M8//4xbt269dLm4uDgcO3YMycnJmc4XQuDJkyfS9P3797F169Yc1yOdRqNB/fr1ERQUlOt1iYgoZ65fv45hw4YhPDzcZN7ChQtRp04dxMfH53q7Pj4+2L9/f15UsUCdP38ea9asMTmXSE1NRUBAAKKiol66jfj4eNStWxc3btwwKv/mm29Qr1496bwlt2JiYuDj44OLFy++dNmNGzdi6dKl2SbtZ8+ejevXr2c5PyIiAseOHYNOpzOZ9yrnNidPnkTjxo2zPH8gyitMopNFGD16NFxcXODg4ABbW9scvYoXL45FixYBAFq1apXj9dJfPj4+0v6joqLw9OlT6RUZGWlSx+vXr2eZnM7o999/x5EjR+Du7p7lMqdPn8b333+fo89m/fr1+OSTT/D06dMcLT948GDUqVMH586dM5lnbW0NALCxsXnpdvbt24eKFSvi+fPnRuVHjhyBp6dnji624+Pj0a5dO1y9etWoXK/Xo0ePHujRo0emB+b0ZbRardHL1dUVT548waZNm4zKnz9/nu0JXmRkJKpWrYotW7YAMCRbPvvsM2zfvv2lMWQUFxeH77//HnZ2drlaj4iI/vP06VM8ePDA6JXZTVvAcJP8n3/+gUqlMipv1qwZSpUqhd69e0Oj0WDp0qWwt7eHSqWCTCYzeXXv3j1HdTt79iymT5+eq3hWr16NCRMmmJSfOHEClSpVwsmTJ7Nc98XjnFarhaOjI3bv3o0HDx5IZWq1GvHx8SYNAGxtbbFu3TqT7R47dgxz5szJcQwzZ87EqFGjcPLkSezZswf79u3Dvn37sHv3bty9e1da7tGjR2jXrl2Wx9yDBw/i7bffls4RQkJC4O/vjzt37uS4LgBw/PhxXL16FeXLl8/VekRElHM3btzAd999h5IlSxqVx8bG4osvvkBKSgqmTp2a7TZ+//13REREGJUFBwebJFrv37+f4+tZc5k5cyb279+PBw8e4O+//8b169eh0+lw8OBBo6S0EAIpKSmZbqN48eKoXr06hgwZYpTE/uqrr6BSqVChQgWTddLS0oxyEU+fPkViYqLJcj/99BOcnJxeGsf27d
uRkpICmUyW5TKrVq0yOr6/6LfffkO7du1MGr09ffoUNWvWxLJly7JcV6fTmZzbVKhQAX/++Sf27dsnlaWlpSExMdEk30D0OphEJ4vw5Zdf4smTJ4iJiUFsbKz0unv3LooVK4Zly5YZlcfGxiIyMhKTJ08GYEgOf/DBBwgLC8vRq0uXLihevLi0f29vb7i7u8PDwwPu7u6oUaMGAEOyNb3lOgAolUpp+sVWcREREZDJZBgyZAj++usvowv5F1tOOzo6wsrKKkefzfTp0yGEwMKFC3O0/I4dO1CtWjW0bt0aR48eRXJysvTIu0KhMPpXp9Ph+fPnmd6lDgkJQdu2bWFvb29UfuTIETRp0gQ1a9Z8aV3Sk/a2trZG5WPHjsXp06fRo0cPqS4v6tSpE1QqlckrLCwMX3/9tVGZg4MDOnTokGU9Vq5cCaVSKSVR3NzcMHr0aMycOTPTVvIAcO/ePdy5cwf379+XkjyrVq2CRqNBt27dTBJAYWFh+PvvvxEaGvrSz4WIyJJ99NFH8PT0NHrNmzcv02WVSiVkMhnk8v9OiZ89ewa9Xo+goCBUqlQJERERGDlyJO7fv4/w8HBERUUZvXx9fbM85h4+fBhKpVI6piuVSgDA3bt3pVdISEi2Lcesra1NjnN37tzBhx9+iMqVK6NOnTqZrvfHH39kepz75JNPIISAp6enVGZjYwMnJyecOHHCaBs2NjaZ3hhXqVTSMfhlHjx4gGvXruHTTz/F0aNHMW/ePPTv3x8bNmzA2rVrjRLg6ecEL54bpAsMDET9+vWlc4QuXbqgYcOGGD16dJYt4m7cuIG///4bYWFhRsfbKlWq4O233zY53t6/fx+3b9/O8VNxRERkTKPR4ODBgzh16hQqVaqEM2fO4KeffpLmjx8/Hu+99x7OnTuHQ4cOYebMmVluq1u3bvj111+NylQqldFxGwAGDRqEVatW5W0geeibb77B77//juDgYNSuXRtt2rRB+/btkZiYiGXLliEuLg6lSpWSzkns7Oykp+JfNH/+fFSpUkVKhN+4cQMXLlzAl19+afK5AIanx8qUKQMPDw94eHigbNmymDt3LvR6vZR7SL+el8lkUtmLLeYDAwMhk8lw9uxZzJ49W8pFVKtWzWSfL8tHWFtbQ6FQGDViSE5ORrdu3aDVatGmTZss101v1JDx9fbbbwMABg4cKJVZW1vD0dER8+fPz3JbRLmlNHcFiApCiRIlMi2fMmUK3NzcMHbsWJNWaBkpFAo4ODjA3d0dOp1OSmC/KC0tDSqVCvb29tKFMmA40K9cuRKjR4/GmjVrpIS1k5OTlEAHDC2s0i/0nZycjO6wp1+sBgcHG91hdnZ2NrmQzaxuUVFRePz4MaytrU0OrosXL0ajRo1w+/Zto3K9Xg+1Wo1atWpJn0+JEiVw6NAhLFu2DC1btkSDBg1MWo2/2Eo+KioKrq6uAAx3l5OSknDt2jW0a9dOukNdoUIFpKWlYf/+/ejYsaPRY+JKpRL+/v4mMWVm/vz52LBhA7Zv357t4+pKpRKjRo2SnjbYsGEDkpKS8Omnn0rL9O7dG3369EHnzp2zvDh/9OgR1q5diylTphh9zz7//HPs2LEDQ4cOxffff2/yNxk2bBiOHz+e6TYbNWqUZb0rVqz4yo/pERFZAqVSicGDB2Pz5s0ADDeylUolgoODoVaroVKpoFAoIJfL8fjxY8hkMly7dg3Jyclo3LgxGjRogJ49e2Lp0qXYuXOntF0XF5dM92dlZWV0zM/I2toaer1eOk4rFAo8fvwY3t7e0jKpqano3LkzvvvuuxzF9/DhQ3Ts2BGVK1fG0aNH4ejomOXnABi6YXN0dER0dDRGjx6NRYsWwcPDAwBw6NAhbNmyBXv37oVarTZqAJBe38wuyK2srEzOm27duoXNmzdj5cqVRuXr16+Hq6sr3n//fXTp0gXly5fHV199hWPHjknLlCtXDuPHj8eHH34IAJnu88CBAzh//jzOnDljVL
5q1So0bdoUq1atwrhx40zWq1evXpZPpXl6emZaDhguxNndCxFR7qWkpGDixImIiIiAo6MjRo8ejbS0NHTt2hXfffc
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1500x1000 with 6 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"\n",
|
|||
|
"=== 关键发现 ===\n",
|
|||
|
"1. 输出时间步到原始时间步的映射比例约为 4:1\n",
|
|||
|
"2. 简单映射(中心位置)的平均比例: 1.8x\n",
|
|||
|
"3. 保守映射(完整范围)的平均比例: 11.3x\n",
|
|||
|
"4. 平均音素长度: 128 ms\n",
|
|||
|
"5. 中位数音素长度: 160 ms\n",
|
|||
|
"\n",
|
|||
|
"=== 置信度计算原理解释 ===\n",
|
|||
|
"基于之前的代码分析,置信度计算如下:\n",
|
|||
|
"1. 对每个输出时间步t,计算softmax概率: softmax(log_probs[t])\n",
|
|||
|
"2. 取该时间步预测的音素token对应的概率作为置信度\n",
|
|||
|
"3. 对连续的相同音素时间步,计算平均置信度\n",
|
|||
|
"4. 这个置信度反映了模型对该音素预测的确信程度\n",
|
|||
|
"\n",
|
|||
|
"例如您的例子: confidence = 0.9999608993530273\n",
|
|||
|
"这表示模型对该音素'JH'的预测有极高的置信度(99.996%)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# =============================================================================\n",
|
|||
|
"# 验证时间戳复原算法的正确性\n",
|
|||
|
"# =============================================================================\n",
|
|||
|
"\n",
|
|||
|
"import torch\n",
|
|||
|
"\n",
|
|||
|
"# 创建一个模拟测试来验证我们的理解\n",
|
|||
|
"def test_unfold_behavior():\n",
|
|||
|
" \"\"\"测试PyTorch unfold的实际行为来验证我们的理解\"\"\"\n",
|
|||
|
" \n",
|
|||
|
" print(\"=== 验证PyTorch unfold行为 ===\")\n",
|
|||
|
" \n",
|
|||
|
" # 创建一个简单的序列用于测试\n",
|
|||
|
" original_length = 30\n",
|
|||
|
" patch_size = 14\n",
|
|||
|
" patch_stride = 4\n",
|
|||
|
" \n",
|
|||
|
" # 创建一个简单的测试张量 [batch=1, features=1, height=1, time_steps=30]\n",
|
|||
|
" test_tensor = torch.arange(original_length).float().unsqueeze(0).unsqueeze(0).unsqueeze(0)\n",
|
|||
|
" print(f\"原始张量形状: {test_tensor.shape}\")\n",
|
|||
|
" print(f\"原始张量内容: {test_tensor.squeeze().tolist()}\")\n",
|
|||
|
" \n",
|
|||
|
" # 应用unfold操作 (模拟RNN模型中的操作)\n",
|
|||
|
" unfolded = test_tensor.unfold(3, patch_size, patch_stride)\n",
|
|||
|
" print(f\"Unfold后形状: {unfolded.shape}\")\n",
|
|||
|
" \n",
|
|||
|
" # 分析每个输出时间步对应的输入\n",
|
|||
|
" num_patches = unfolded.shape[3]\n",
|
|||
|
" print(f\"输出时间步数: {num_patches}\")\n",
|
|||
|
" print()\n",
|
|||
|
" \n",
|
|||
|
" print(\"每个输出时间步对应的原始时间步:\")\n",
|
|||
|
" for i in range(min(10, num_patches)): # 只显示前10个\n",
|
|||
|
" patch = unfolded[0, 0, 0, i, :].tolist()\n",
|
|||
|
" start_idx = i * patch_stride\n",
|
|||
|
" end_idx = start_idx + patch_size - 1\n",
|
|||
|
" print(f\"输出 {i:2d}: 原始 [{start_idx:2d}:{end_idx+1:2d}] -> {patch}\")\n",
|
|||
|
" \n",
|
|||
|
" return unfolded\n",
|
|||
|
"\n",
|
|||
|
"# 运行测试\n",
|
|||
|
"unfolded_result = test_unfold_behavior()\n",
|
|||
|
"\n",
|
|||
|
"# 验证我们的转换函数\n",
|
|||
|
"def validate_conversion_function():\n",
|
|||
|
" \"\"\"验证我们的时间戳转换函数\"\"\"\n",
|
|||
|
" \n",
|
|||
|
" print(f\"\\n=== 验证转换函数 ===\")\n",
|
|||
|
" \n",
|
|||
|
" # 测试几个具体案例\n",
|
|||
|
" test_cases = [\n",
|
|||
|
" (0, 0), # 第一个输出时间步\n",
|
|||
|
" (1, 1), # 第二个输出时间步\n",
|
|||
|
" (0, 1), # 跨越两个输出时间步\n",
|
|||
|
" (206, 207), # 您提供的例子\n",
|
|||
|
" (5, 10), # 较长的segment\n",
|
|||
|
" ]\n",
|
|||
|
" \n",
|
|||
|
" print(\"测试案例:\")\n",
|
|||
|
" print(f\"{'输出范围':10s} {'中心位置':15s} {'保守范围':15s} {'可能范围':15s} {'实际长度(ms)':12s}\")\n",
|
|||
|
" print(\"-\" * 80)\n",
|
|||
|
" \n",
|
|||
|
" for output_start, output_end in test_cases:\n",
|
|||
|
" conversion = convert_output_timestamp_to_original(output_start, output_end)\n",
|
|||
|
" \n",
|
|||
|
" output_range = f\"{output_start}-{output_end}\"\n",
|
|||
|
" center_pos = f\"{conversion['center_positions'][0]:.1f}-{conversion['center_positions'][1]:.1f}\"\n",
|
|||
|
" conservative = f\"{conversion['conservative_mapping'][0]}-{conversion['conservative_mapping'][1]}\"\n",
|
|||
|
" likely = f\"{conversion['likely_mapping'][0]}-{conversion['likely_mapping'][1]}\"\n",
|
|||
|
" duration_ms = conversion['output_duration'] * PATCH_STRIDE * ORIGINAL_BIN_MS\n",
|
|||
|
" \n",
|
|||
|
" print(f\"{output_range:10s} {center_pos:15s} {conservative:15s} {likely:15s} {duration_ms:8d}\")\n",
|
|||
|
"\n",
|
|||
|
"validate_conversion_function()\n",
|
|||
|
"\n",
|
|||
|
"# 如果有实际数据,进行更详细的验证\n",
|
|||
|
"if 'phoneme_data' in locals() and phoneme_data is not None:\n",
|
|||
|
" print(f\"\\n=== 实际数据验证 ===\")\n",
|
|||
|
" \n",
|
|||
|
" # 统计转换结果的分布\n",
|
|||
|
" all_output_durations = []\n",
|
|||
|
" all_simple_durations = []\n",
|
|||
|
" all_conservative_durations = []\n",
|
|||
|
" all_likely_durations = []\n",
|
|||
|
" \n",
|
|||
|
" for phoneme, segments in phoneme_data.items():\n",
|
|||
|
" for segment in segments[:100]: # 每个音素取前100个进行统计\n",
|
|||
|
" output_start = segment['start_time']\n",
|
|||
|
" output_end = segment['end_time']\n",
|
|||
|
" \n",
|
|||
|
" conversion = convert_output_timestamp_to_original(output_start, output_end)\n",
|
|||
|
" \n",
|
|||
|
" output_duration = conversion['output_duration']\n",
|
|||
|
" simple_duration = conversion['simple_mapping'][1] - conversion['simple_mapping'][0] + 1\n",
|
|||
|
" conservative_duration = conversion['conservative_mapping'][1] - conversion['conservative_mapping'][0] + 1\n",
|
|||
|
" likely_duration = conversion['likely_mapping'][1] - conversion['likely_mapping'][0] + 1\n",
|
|||
|
" \n",
|
|||
|
" all_output_durations.append(output_duration)\n",
|
|||
|
" all_simple_durations.append(simple_duration)\n",
|
|||
|
" all_conservative_durations.append(conservative_duration)\n",
|
|||
|
" all_likely_durations.append(likely_duration)\n",
|
|||
|
" \n",
|
|||
|
" print(f\"转换结果统计 (基于 {len(all_output_durations)} 个segments):\")\n",
|
|||
|
" print(f\"输出时间步长度: 平均 {np.mean(all_output_durations):.1f}, 中位数 {np.median(all_output_durations):.1f}\")\n",
|
|||
|
" print(f\"简单映射长度: 平均 {np.mean(all_simple_durations):.1f}, 中位数 {np.median(all_simple_durations):.1f}\")\n",
|
|||
|
" print(f\"保守映射长度: 平均 {np.mean(all_conservative_durations):.1f}, 中位数 {np.median(all_conservative_durations):.1f}\")\n",
|
|||
|
" print(f\"可能映射长度: 平均 {np.mean(all_likely_durations):.1f}, 中位数 {np.median(all_likely_durations):.1f}\")\n",
|
|||
|
" \n",
|
|||
|
" # 转换比例分析\n",
|
|||
|
" simple_ratios = [s/o for s, o in zip(all_simple_durations, all_output_durations) if o > 0]\n",
|
|||
|
" conservative_ratios = [c/o for c, o in zip(all_conservative_durations, all_output_durations) if o > 0]\n",
|
|||
|
" likely_ratios = [l/o for l, o in zip(all_likely_durations, all_output_durations) if o > 0]\n",
|
|||
|
" \n",
|
|||
|
" print(f\"\\n原始时间步 vs 输出时间步的比例:\")\n",
|
|||
|
" print(f\"简单映射: 平均比例 {np.mean(simple_ratios):.1f}x\")\n",
|
|||
|
" print(f\"保守映射: 平均比例 {np.mean(conservative_ratios):.1f}x\") \n",
|
|||
|
" print(f\"可能映射: 平均比例 {np.mean(likely_ratios):.1f}x\")\n",
|
|||
|
" \n",
|
|||
|
" # 创建可视化\n",
|
|||
|
" plt.figure(figsize=(15, 10))\n",
|
|||
|
" \n",
|
|||
|
" # 子图1: 长度分布对比\n",
|
|||
|
" plt.subplot(2, 3, 1)\n",
|
|||
|
" plt.hist([all_output_durations, all_simple_durations, all_conservative_durations, all_likely_durations], \n",
|
|||
|
" bins=30, alpha=0.7, label=['输出', '简单', '保守', '可能'])\n",
|
|||
|
" plt.xlabel('时间步长度')\n",
|
|||
|
" plt.ylabel('频次')\n",
|
|||
|
" plt.title('不同映射方法的长度分布')\n",
|
|||
|
" plt.legend()\n",
|
|||
|
" plt.grid(True, alpha=0.3)\n",
|
|||
|
" \n",
|
|||
|
" # 子图2: 比例分布\n",
|
|||
|
" plt.subplot(2, 3, 2)\n",
|
|||
|
" plt.hist([simple_ratios, conservative_ratios, likely_ratios], \n",
|
|||
|
" bins=30, alpha=0.7, label=['简单', '保守', '可能'])\n",
|
|||
|
" plt.xlabel('原始/输出 时间步比例')\n",
|
|||
|
" plt.ylabel('频次')\n",
|
|||
|
" plt.title('时间步长度比例分布')\n",
|
|||
|
" plt.legend()\n",
|
|||
|
" plt.grid(True, alpha=0.3)\n",
|
|||
|
" \n",
|
|||
|
" # 子图3: 输出时长 vs 简单映射时长散点图\n",
|
|||
|
" plt.subplot(2, 3, 3)\n",
|
|||
|
" plt.scatter(all_output_durations, all_simple_durations, alpha=0.5, s=10)\n",
|
|||
|
" plt.xlabel('输出时间步长度')\n",
|
|||
|
" plt.ylabel('简单映射时间步长度')\n",
|
|||
|
" plt.title('输出 vs 简单映射长度')\n",
|
|||
|
" plt.plot([0, max(all_output_durations)], [0, max(all_output_durations)*PATCH_STRIDE], 'r--', alpha=0.7, label=f'{PATCH_STRIDE}x线')\n",
|
|||
|
" plt.legend()\n",
|
|||
|
" plt.grid(True, alpha=0.3)\n",
|
|||
|
" \n",
|
|||
|
" # 子图4: 输出时长 vs 保守映射时长散点图\n",
|
|||
|
" plt.subplot(2, 3, 4)\n",
|
|||
|
" plt.scatter(all_output_durations, all_conservative_durations, alpha=0.5, s=10, color='orange')\n",
|
|||
|
" plt.xlabel('输出时间步长度')\n",
|
|||
|
" plt.ylabel('保守映射时间步长度')\n",
|
|||
|
" plt.title('输出 vs 保守映射长度')\n",
|
|||
|
" plt.grid(True, alpha=0.3)\n",
|
|||
|
" \n",
|
|||
|
" # 子图5: 输出时长 vs 可能映射时长散点图\n",
|
|||
|
" plt.subplot(2, 3, 5)\n",
|
|||
|
" plt.scatter(all_output_durations, all_likely_durations, alpha=0.5, s=10, color='green')\n",
|
|||
|
" plt.xlabel('输出时间步长度')\n",
|
|||
|
" plt.ylabel('可能映射时间步长度')\n",
|
|||
|
" plt.title('输出 vs 可能映射长度')\n",
|
|||
|
" plt.grid(True, alpha=0.3)\n",
|
|||
|
" \n",
|
|||
|
" # 子图6: 实际时长分布(毫秒)\n",
|
|||
|
" plt.subplot(2, 3, 6)\n",
|
|||
|
" actual_durations_ms = [d * PATCH_STRIDE * ORIGINAL_BIN_MS for d in all_output_durations]\n",
|
|||
|
" plt.hist(actual_durations_ms, bins=50, alpha=0.7, color='purple')\n",
|
|||
|
" plt.xlabel('实际时长 (ms)')\n",
|
|||
|
" plt.ylabel('频次')\n",
|
|||
|
" plt.title('音素segment实际时长分布')\n",
|
|||
|
" plt.grid(True, alpha=0.3)\n",
|
|||
|
" \n",
|
|||
|
" plt.tight_layout()\n",
|
|||
|
" plt.show()\n",
|
|||
|
" \n",
|
|||
|
" # 关键洞察\n",
|
|||
|
" print(f\"\\n=== 关键发现 ===\")\n",
|
|||
|
" print(f\"1. 输出时间步到原始时间步的映射比例约为 {PATCH_STRIDE}:1\")\n",
|
|||
|
" print(f\"2. 简单映射(中心位置)的平均比例: {np.mean(simple_ratios):.1f}x\")\n",
|
|||
|
" print(f\"3. 保守映射(完整范围)的平均比例: {np.mean(conservative_ratios):.1f}x\")\n",
|
|||
|
" print(f\"4. 平均音素长度: {np.mean(actual_durations_ms):.0f} ms\")\n",
|
|||
|
" print(f\"5. 中位数音素长度: {np.median(actual_durations_ms):.0f} ms\")\n",
|
|||
|
"\n",
|
|||
|
"print(f\"\\n=== 置信度计算原理解释 ===\")\n",
|
|||
|
"print(\"基于之前的代码分析,置信度计算如下:\")\n",
|
|||
|
"print(\"1. 对每个输出时间步t,计算softmax概率: softmax(log_probs[t])\")\n",
|
|||
|
"print(\"2. 取该时间步预测的音素token对应的概率作为置信度\")\n",
|
|||
|
"print(\"3. 对连续的相同音素时间步,计算平均置信度\")\n",
|
|||
|
"print(\"4. 这个置信度反映了模型对该音素预测的确信程度\")\n",
|
|||
|
"print()\n",
|
|||
|
"print(\"例如您的例子: confidence = 0.9999608993530273\")\n",
|
|||
|
"print(\"这表示模型对该音素'JH'的预测有极高的置信度(99.996%)\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"id": "beufhfhmaao",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# =============================================================================\n",
|
|||
|
"# 滑动窗口时间戳复原算法\n",
|
|||
|
"# =============================================================================\n",
|
|||
|
"\n",
|
|||
|
"# 首先理解PyTorch unfold的工作原理\n",
|
|||
|
"print(\"=== PyTorch unfold操作分析 ===\")\n",
|
|||
|
"print(\"unfold(dimension, size, step) 的含义:\")\n",
|
|||
|
"print(\"- dimension: 在哪个维度上进行unfold\")\n",
|
|||
|
"print(\"- size: 每个窗口的大小 (patch_size = 14)\")\n",
|
|||
|
"print(\"- step: 窗口间的步长 (patch_stride = 4)\")\n",
|
|||
|
"print()\n",
|
|||
|
"\n",
|
|||
|
"# 配置参数\n",
|
|||
|
"PATCH_SIZE = 14 # 滑动窗口大小\n",
|
|||
|
"PATCH_STRIDE = 4 # 滑动窗口步长\n",
|
|||
|
"ORIGINAL_BIN_MS = 20 # 原始时间bin大小(ms)\n",
|
|||
|
"\n",
|
|||
|
"print(f\"配置参数:\")\n",
|
|||
|
"print(f\"- PATCH_SIZE: {PATCH_SIZE}\")\n",
|
|||
|
"print(f\"- PATCH_STRIDE: {PATCH_STRIDE}\")\n",
|
|||
|
"print(f\"- ORIGINAL_BIN_MS: {ORIGINAL_BIN_MS}\")\n",
|
|||
|
"print()\n",
|
|||
|
"\n",
|
|||
|
"# 模拟unfold操作来理解输出时间步与原始时间步的对应关系\n",
|
|||
|
"def analyze_unfold_mapping(original_length, patch_size, patch_stride):\n",
|
|||
|
" \"\"\"分析unfold操作后的时间步映射关系\"\"\"\n",
|
|||
|
" \n",
|
|||
|
" # 计算输出长度 (模拟unfold的行为)\n",
|
|||
|
" output_length = (original_length - patch_size) // patch_stride + 1\n",
|
|||
|
" \n",
|
|||
|
" print(f\"原始序列长度: {original_length}\")\n",
|
|||
|
" print(f\"输出序列长度: {output_length}\")\n",
|
|||
|
" print()\n",
|
|||
|
" \n",
|
|||
|
" # 为每个输出时间步计算对应的原始时间步范围\n",
|
|||
|
" mappings = []\n",
|
|||
|
" for output_idx in range(output_length):\n",
|
|||
|
" start_idx = output_idx * patch_stride\n",
|
|||
|
" end_idx = start_idx + patch_size - 1\n",
|
|||
|
" center_idx = (start_idx + end_idx) / 2\n",
|
|||
|
" \n",
|
|||
|
" mappings.append({\n",
|
|||
|
" 'output_idx': output_idx,\n",
|
|||
|
" 'original_start': start_idx,\n",
|
|||
|
" 'original_end': end_idx,\n",
|
|||
|
" 'original_center': center_idx,\n",
|
|||
|
" 'original_range': f\"[{start_idx}:{end_idx+1}]\"\n",
|
|||
|
" })\n",
|
|||
|
" \n",
|
|||
|
" return mappings, output_length\n",
|
|||
|
"\n",
|
|||
|
"# 分析一个示例序列\n",
|
|||
|
"print(\"=== 滑动窗口映射分析 ===\")\n",
|
|||
|
"original_length = 100 # 假设原始序列有100个时间步\n",
|
|||
|
"mappings, output_length = analyze_unfold_mapping(original_length, PATCH_SIZE, PATCH_STRIDE)\n",
|
|||
|
"\n",
|
|||
|
"print(\"前10个输出时间步对应的原始时间步范围:\")\n",
|
|||
|
"for i, mapping in enumerate(mappings[:10]):\n",
|
|||
|
" print(f\"输出 {mapping['output_idx']:2d} <- 原始 {mapping['original_range']:8s} (中心: {mapping['original_center']:4.1f})\")\n",
|
|||
|
"\n",
|
|||
|
"print(f\"\\n后5个输出时间步:\")\n",
|
|||
|
"for mapping in mappings[-5:]:\n",
|
|||
|
" print(f\"输出 {mapping['output_idx']:2d} <- 原始 {mapping['original_range']:8s} (中心: {mapping['original_center']:4.1f})\")\n",
|
|||
|
"\n",
|
|||
|
"def convert_output_timestamp_to_original(output_start, output_end, patch_size=PATCH_SIZE, patch_stride=PATCH_STRIDE):\n",
|
|||
|
" \"\"\"\n",
|
|||
|
" 将输出时间戳转换为原始数据时间戳\n",
|
|||
|
" \n",
|
|||
|
" Args:\n",
|
|||
|
" output_start: 输出序列中的开始时间步\n",
|
|||
|
" output_end: 输出序列中的结束时间步\n",
|
|||
|
" patch_size: 滑动窗口大小\n",
|
|||
|
" patch_stride: 滑动窗口步长\n",
|
|||
|
" \n",
|
|||
|
" Returns:\n",
|
|||
|
" dict: 包含原始时间戳信息的字典\n",
|
|||
|
" \"\"\"\n",
|
|||
|
" \n",
|
|||
|
" # 计算输出时间步对应的原始时间步中心位置\n",
|
|||
|
" original_start_center = output_start * patch_stride + (patch_size - 1) / 2\n",
|
|||
|
" original_end_center = output_end * patch_stride + (patch_size - 1) / 2\n",
|
|||
|
" \n",
|
|||
|
" # 由于每个输出时间步对应一个patch,我们需要考虑如何处理重叠\n",
|
|||
|
" # 最简单的方法:使用中心位置\n",
|
|||
|
" original_start_simple = int(round(original_start_center))\n",
|
|||
|
" original_end_simple = int(round(original_end_center))\n",
|
|||
|
" \n",
|
|||
|
" # 更精确的方法:考虑实际的音素边界\n",
|
|||
|
" # 输出时间步的范围对应的原始patch范围\n",
|
|||
|
" patch_start_first = output_start * patch_stride\n",
|
|||
|
" patch_end_first = patch_start_first + patch_size - 1\n",
|
|||
|
" \n",
|
|||
|
" patch_start_last = output_end * patch_stride \n",
|
|||
|
" patch_end_last = patch_start_last + patch_size - 1\n",
|
|||
|
" \n",
|
|||
|
" # 保守估计:音素可能存在于重叠区域的任何位置\n",
|
|||
|
" conservative_start = patch_start_first\n",
|
|||
|
" conservative_end = patch_end_last\n",
|
|||
|
" \n",
|
|||
|
" # 最可能的范围:基于中心位置但考虑patch边界\n",
|
|||
|
" likely_start = max(patch_start_first, int(original_start_center - patch_stride/2))\n",
|
|||
|
" likely_end = min(patch_end_last, int(original_end_center + patch_stride/2))\n",
|
|||
|
" \n",
|
|||
|
" return {\n",
|
|||
|
" 'output_range': (output_start, output_end),\n",
|
|||
|
" 'output_duration': output_end - output_start + 1,\n",
|
|||
|
" 'simple_mapping': (original_start_simple, original_end_simple),\n",
|
|||
|
" 'conservative_mapping': (conservative_start, conservative_end),\n",
|
|||
|
" 'likely_mapping': (likely_start, likely_end),\n",
|
|||
|
" 'center_positions': (original_start_center, original_end_center),\n",
|
|||
|
" 'patch_ranges': {\n",
|
|||
|
" 'first_patch': (patch_start_first, patch_end_first),\n",
|
|||
|
" 'last_patch': (patch_start_last, patch_end_last)\n",
|
|||
|
" }\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
"print(f\"\\n=== 时间戳转换示例 ===\")\n",
|
|||
|
"\n",
|
|||
|
"# 使用您提供的例子\n",
|
|||
|
"example_output_start = 206\n",
|
|||
|
"example_output_end = 207\n",
|
|||
|
"\n",
|
|||
|
"conversion = convert_output_timestamp_to_original(example_output_start, example_output_end)\n",
|
|||
|
"\n",
|
|||
|
"print(f\"示例: 输出时间戳 {example_output_start}-{example_output_end}\")\n",
|
|||
|
"print(f\"输出持续时间: {conversion['output_duration']} 个输出时间步\")\n",
|
|||
|
"print(f\"实际时长: {conversion['output_duration'] * PATCH_STRIDE * ORIGINAL_BIN_MS} ms\")\n",
|
|||
|
"print()\n",
|
|||
|
"print(f\"转换结果:\")\n",
|
|||
|
"print(f\"1. 简单映射(中心位置): {conversion['simple_mapping']}\")\n",
|
|||
|
"print(f\"2. 保守映射(完整范围): {conversion['conservative_mapping']}\") \n",
|
|||
|
"print(f\"3. 可能映射(调整范围): {conversion['likely_mapping']}\")\n",
|
|||
|
"print()\n",
|
|||
|
"print(f\"详细信息:\")\n",
|
|||
|
"print(f\"- 中心位置: {conversion['center_positions']}\")\n",
|
|||
|
"print(f\"- 第一个patch范围: {conversion['patch_ranges']['first_patch']}\")\n",
|
|||
|
"print(f\"- 最后一个patch范围: {conversion['patch_ranges']['last_patch']}\")\n",
|
|||
|
"\n",
|
|||
|
"# 如果有phoneme_data,分析实际数据\n",
|
|||
|
"if 'phoneme_data' in locals() and phoneme_data is not None:\n",
|
|||
|
" print(f\"\\n=== 实际数据转换分析 ===\")\n",
|
|||
|
" \n",
|
|||
|
" # 随机选择一些segments进行分析\n",
|
|||
|
" sample_conversions = []\n",
|
|||
|
" \n",
|
|||
|
" count = 0\n",
|
|||
|
" for phoneme, segments in phoneme_data.items():\n",
|
|||
|
" for segment in segments[:3]: # 每个音素取前3个\n",
|
|||
|
" if count >= 10: # 只分析前10个\n",
|
|||
|
" break\n",
|
|||
|
" \n",
|
|||
|
" output_start = segment['start_time']\n",
|
|||
|
" output_end = segment['end_time']\n",
|
|||
|
" \n",
|
|||
|
" conversion = convert_output_timestamp_to_original(output_start, output_end)\n",
|
|||
|
" conversion['phoneme'] = phoneme\n",
|
|||
|
" conversion['session'] = segment.get('session', 'unknown')\n",
|
|||
|
" conversion['trial_num'] = segment.get('trial_num', -1)\n",
|
|||
|
" \n",
|
|||
|
" sample_conversions.append(conversion)\n",
|
|||
|
" count += 1\n",
|
|||
|
" \n",
|
|||
|
" if count >= 10:\n",
|
|||
|
" break\n",
|
|||
|
" \n",
|
|||
|
" print(f\"前10个segment的时间戳转换:\")\n",
|
|||
|
" print(f\"{'音素':4s} {'输出':8s} {'简单映射':12s} {'保守映射':12s} {'可能映射':12s} {'实际时长(ms)':10s}\")\n",
|
|||
|
" print(\"-\" * 70)\n",
|
|||
|
" \n",
|
|||
|
" for conv in sample_conversions:\n",
|
|||
|
" output_range = f\"{conv['output_range'][0]}-{conv['output_range'][1]}\"\n",
|
|||
|
" simple = f\"{conv['simple_mapping'][0]}-{conv['simple_mapping'][1]}\"\n",
|
|||
|
" conservative = f\"{conv['conservative_mapping'][0]}-{conv['conservative_mapping'][1]}\"\n",
|
|||
|
" likely = f\"{conv['likely_mapping'][0]}-{conv['likely_mapping'][1]}\"\n",
|
|||
|
" duration_ms = conv['output_duration'] * PATCH_STRIDE * ORIGINAL_BIN_MS\n",
|
|||
|
" \n",
|
|||
|
" print(f\"{conv['phoneme']:4s} {output_range:8s} {simple:12s} {conservative:12s} {likely:12s} {duration_ms:6d}\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"id": "a530adfb",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"\n",
|
|||
|
"# 设置中文字体支持\n",
|
|||
|
"plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans', 'Arial Unicode MS', 'sans-serif']\n",
|
|||
|
"plt.rcParams['axes.unicode_minus'] = False # 解决负号显示问题\n",
|
|||
|
"\n",
|
|||
|
"print(\"已启用matplotlib中文支持\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 8,
|
|||
|
"id": "mks6harmjq",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"=== 滑动窗口参数分析 ===\n",
|
|||
|
"输入patch大小: 14 时间步\n",
|
|||
|
"滑动窗口步长: 4 时间步\n",
|
|||
|
"相邻窗口重叠: 10 时间步\n",
|
|||
|
"原始数据分辨率: 20 ms/bin\n",
|
|||
|
"输出数据分辨率: 80 ms/output_step\n",
|
|||
|
"\n",
|
|||
|
"=== 时间步长度统计 ===\n",
|
|||
|
"总segment数量: 214218\n",
|
|||
|
"输出时间步长度:\n",
|
|||
|
" 最小: 1 步\n",
|
|||
|
" 最大: 12 步\n",
|
|||
|
" 平均: 2.1 步\n",
|
|||
|
" 中位数: 2.0 步\n",
|
|||
|
"\n",
|
|||
|
"对应的实际时长(毫秒):\n",
|
|||
|
" 最小: 80 ms\n",
|
|||
|
" 最大: 960 ms\n",
|
|||
|
" 平均: 170.4 ms\n",
|
|||
|
" 中位数: 160.0 ms\n",
|
|||
|
"\n",
|
|||
|
"输出时间步长度分布 (前10个):\n",
|
|||
|
" 2 步 (160ms): 78068 个segment (36.4%)\n",
|
|||
|
" 1 步 ( 80ms): 74821 个segment (34.9%)\n",
|
|||
|
" 3 步 (240ms): 30313 个segment (14.2%)\n",
|
|||
|
" 4 步 (320ms): 21718 个segment (10.1%)\n",
|
|||
|
" 5 步 (400ms): 8254 个segment (3.9%)\n",
|
|||
|
" 6 步 (480ms): 977 个segment (0.5%)\n",
|
|||
|
" 7 步 (560ms): 59 个segment (0.0%)\n",
|
|||
|
" 8 步 (640ms): 7 个segment (0.0%)\n",
|
|||
|
"12 步 (960ms): 1 个segment (0.0%)\n",
|
|||
|
"\n",
|
|||
|
"=== 示例分析 ===\n",
|
|||
|
"示例segment: start_time=206, end_time=207\n",
|
|||
|
"输出时间步数: 2\n",
|
|||
|
"实际时长: 160 ms\n",
|
|||
|
"对应原始数据范围: bin 824 到 bin 841\n",
|
|||
|
"原始数据时间步数: 18\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABKUAAAMWCAYAAAAgRDUeAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjEsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvc2/+5QAAAAlwSFlzAAAPYQAAD2EBqD+naQABAABJREFUeJzs3XucjHX/x/H37Nnu2tbag9PGYuummxKLdKNFKUKhg0Phbosk7sRd3UVRVCqkHyJCjkVJJ7ecQm4dSA4tihDrNE47s7t29jDX74+9d25jZ1manZm1r+fjMY/u6/pe1zWf6/u113zuz3UyGYZhCAAAAAAAAPAgP28HAAAAAAAAgPKHohQAAAAAAAA8jqIUAAAAAAAAPI6iFAAAAAAAADyOohQAAAAAAAA8jqIUAAAAAAAAPI6iFAAAAAAAADyOohQAAAAAAAA8jqIUgCLy8vK8HUKJ5ebmejsEAAAAt0hPT9fvv//u7TAAwGMoSgHl1Pbt29W/f3+lpaUVaRszZoxuvPFGpaenX9Y2O3furKVLl7orREnS0aNHVatWLa1bt65I27Fjx1SnTh19+umnkqT8/HxlZmYqJydHhmFc8XeePXtWM2bM0NmzZy8rzlWrVik/P79I25kzZy47hrVr16pZs2bKysq6rPUMw9CRI0cc07///rtmzZp12d+fm5urm2++WbNnz77sdQEAwJVp166d+vTpc9Flzp07pyVLljh9Pv74Y9ntdp05c8blycWsrCxlZ2dfUUwXy8UKpaam6ocffnDZdjl50OHDhx3/+9ixY5o8eXLJAz1Pu3bt9Prrr1/RugA8i6IUUE7t2LFDH374oWJiYpzmnzlzRhMnTtS5c+f07LPPFrv+t99+q6NHjzrN27ZtW5HE4/fff9exY8ec5p09e1YHDhxw+pxfSDmfn5+fDh486LLYU6VKFXXs2FH9+vXTgQMHtGnTJoWHhys4OFh+fn4ymUxOn8jIyIt1icPJkyf16KOPXlYStX79et1+++2y2+1O848dO6YbbrhBb775ZrHr5ufnKy8vz+lz7bXX6ocfftCSJUsc83JycmS1WpWZmVnstr744gvVrVtXv/zyi6SCJDElJUV79uwp8b5I0urVq7V161bVqFHjstYDAABX7qGHHtK3336rXbt2FbtMenq67rvvPj3//PN67bXX1L9/f40ePVp+fn5q1KiRAgMDi+RAYWFh+uKLLxzbcFcuVuidd97RU089VWT+mjVrVKdOHa1du/aS+75jxw7Vrl1bX3/9tSTJbDbrySef1MqVKy+57vl2796t1atXq3Llype1HgDvCPB2AAA8Kzc3VytWrNA333yjOnXqaMOGDcrKylKnTp0kSf/4xz/Upk0bTZs2TTfffLNGjBihl19+uch2unTpogkTJujhhx92zAsMDJSfn3Otu2/fvvrb3/6msWPHOuZNmjRJL774otNyN954o37++eci3xMQEOD030JWq1X5+fl66623dPr0aR08eFBNmzbV3r17FRwcrODgYJlMJsfyc+bMKfaM2blz51SxYkV99dVXuuOOOxzflZaW5kjACgtDf/3rX11uIzg4WP7+/goMDHTMy8rKUpcuXZSXl6e2bdu6XE+SwsLCZLPZXLb16dOnyBnTZ555Rq+99prL5V999VXdfPPNuuGGGyRJd999t5o0aaJBgwbp66+/duqTQjt27FBISIgCAgIc7ZMmTdJ1112nunXr6sCBA07L2+125eTkKDQ0VNdee22x+wUAAFzbuHGj/va3vxXbXr9+/SLzAgMDlZOTo6CgIEnStGnTdNttt+nGG29Uhw4dJEnff/+9y3wsNzdX4eHhjml35GLnCw4OVoUKFZzm7dmzR/fdd58SExN14403FrtuobFjx6pGjRpKTk6WJDVo0EDdu3fXU089pS1btig4OLjIOrt375a/v79TDjN+/HhFRkaqVatWLnOY3Nxc+fv7q27dupeMCUDpoy
gFlDPnzp3T0KFDdfToUUVERGjQoEHKyclRp06d9OGHH+qbb77R1q1bFRUVpWXLlik5OVm7du3S//3f/6lKlSqO7YSEhCg0NPSS3xcUFORIngoFBASoTp062rt3ryTp2Wef1apVq3TgwAEdP35cgYGBCggIkJ+fnywWi6SCK66Cg4NVt25dVa5cWYMHD9bOnTv1448/6sMPP3Rsu06dOi7jCAsLKxLH+fuSn5/vSHb8/f0lFdyOWJjU5eTkKCwsrMjVYcWx2Wzq3r279u/frzVr1hRbzCrsjzlz5qhjx46SpMGDB+vOO+90JJiHDx9W3759NW3aNFWvXt1lUiZJy5Yt06ZNm7Rhwwan+ZMmTVKLFi00adIkDRkypMh6jRo1KvbsZ0JCQrFx9+nTh9v7AAC4AoUFnPXr16tBgwZF2vPz8x35iCS9+eabmjRpkiQ55TOHDh3S9u3bNWXKFElSXFxcib7fHbnYxfzxxx+68847lZiYqK+//loREREXXX7Lli1avHixZs+e7XSCb9y4cfrrX/+qZ599VhMmTCiy3t133619+/a53Ob1119f7Pe1bt1a33zzzUVjAuAZFKWAciYiIkK//vqr4uPjNXv2bMcVPOvXr9djjz2mhIQEp0SjcePG2rRpk6677jr1799fr7zySpGrkC7X+clGoYCAAL333nuaMGGC4wzf+Wf5Bg8erLy8PC1atEidO3dWRESEI6Gz2+2OhOlCoaGhxRajChXuy/nJn1Tw3K0ruX0tIyND9913n7Zs2XLJgpRUcFl8SEiI47/fffedmjRp4jijGRoaqu+//16hoaGKjIx0WUA6ffq0BgwYoPvvv7/ImddmzZrpmWee0dNPP62qVavq/vvvd2pPT093XOllMpmUnJwsf39/rVq1Sv/+97915MgR/f3vf7/sfgAAAK4VPt+pYsWKOn36tBYvXqynn37acTVSYmKievbsqVGjRkmSoqKinK50KvTll18qJiZGt9xyi7Kyshy/5xcyDEPZ2dmOYpM7crHi7N27V3fccYeqVKmiFStWXLIglZOTo379+qlJkybq1auXU1utWrU0fvx49e/fX/Hx8Ro6dKhT+/bt2xUYGCh/f3/5+fmpb9+++v7777Vt2zb99NNP+s9//lNkHQC+hWdKAeXQ7t27dfr0aTVr1kx5eXlavHix2rdvr9dff13/+c9/dObMGcdn/fr1+v7775WUlKRDhw4Ve5XO5bjwkvLCeWPGjFFWVpbS09N15swZnTp1ynH266uvvlJWVpYjCTr/0vRjx46pUqVKLj8zZ850GUN+fr7OnTtX5BlQkootuBmGIZvNdsm3Ez788MP66aeftHbt2ksWpAqZzWaFhYXJ399fu3bt0hNPPOF4DkTNmjUlSdddd50CAwOLXO1kGIYee+wxSdL//d//udz+K6+8ol69eqlHjx4aPXq0U2ErLCzMcdn77Nmz9f333zvOuK5du1bz588v0T4AAICSadq0qc6cOaMGDRpoz549euutt9S4cWP9+OOPkgqKVufnG0OHDnX5zKdly5apY8eO8vPzU0REhOP3/MKPn5+fQkNDHc+Vckcu5kpWVpaSk5NLXJCSpOeee0779u3T7NmzXeZgjz32mJ5//nk9/fTTGjhwoNMD20NDQx054dq1azV37lxNmzZNQUFB2rp16xU/KB2A53ClFFAO/fvf/1b79u3Vv39/1a5dW23btlVycrI2b96szZs3S5JuueUWPfLII5IKrvxp0qSJ04PPc3JydN999xXZdr9+/dSvXz+neU2bNnWaNplM2rdvn1Pi0aJFC0kFVz0dOHBAtWvXLvH+FF4xtWPHDqciUN26dRUSEuJynQ0bNjieWVCoZcuWTtPx8fEu1124cKEefPBB7dy503EWsfAthrt371ZKSop69+4tf39/7d6922nd3NxchYaGOt1maLfbVa1aNWVkZCgkJEQNGzbUk08+qQEDBkgquH0vPj5ev//+u2rUqFGkkD
Z8+HAtW7ZM//d//6fjx4/r1KlTLuN+9tlnFRQUpBdffFHLli3TkiVLnG7P+/nnn/XEE0/omWeeUWhoqA4fPqxz584
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1200x800 with 4 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"\n",
|
|||
|
"=== 总结 ===\n",
|
|||
|
"1. RNN使用14时间步的滑动窗口,步长为4\n",
|
|||
|
"2. segment中的时间戳是输出时间步,不是原始时间步\n",
|
|||
|
"3. 每个输出时间步对应原始数据的4个时间步(80ms)\n",
|
|||
|
"4. 输出时间戳206-207表示2个输出步长,对应160ms实际时长\n",
|
|||
|
"5. 这2个输出步长实际上对应原始数据中更多的时间步(由于滑动窗口重叠)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
# =============================================================================
# Analyze the relationship between the sliding window and output timestamps
# =============================================================================

# Known from the config file:
PATCH_SIZE = 14            # each input patch spans 14 raw time steps
PATCH_STRIDE = 4           # sliding-window stride, in raw time steps
ORIGINAL_BIN_SIZE_MS = 20  # raw data resolution: 20 ms per bin

print("=== 滑动窗口参数分析 ===")
print(f"输入patch大小: {PATCH_SIZE} 时间步")
print(f"滑动窗口步长: {PATCH_STRIDE} 时间步")
print(f"相邻窗口重叠: {PATCH_SIZE - PATCH_STRIDE} 时间步")
print(f"原始数据分辨率: {ORIGINAL_BIN_SIZE_MS} ms/bin")
print(f"输出数据分辨率: {ORIGINAL_BIN_SIZE_MS * PATCH_STRIDE} ms/output_step")

# Distribution of segment lengths (segments are expressed in OUTPUT time steps)
if 'phoneme_data' in locals() and phoneme_data is not None:
    durations = []
    output_durations_ms = []
    original_durations_ms = []

    for phoneme, segments in phoneme_data.items():
        for segment in segments:
            # Length in output (strided) time steps; bounds are inclusive.
            output_duration = segment['end_time'] - segment['start_time'] + 1
            durations.append(output_duration)

            # Duration covered by the strides alone (ms).
            output_duration_ms = output_duration * ORIGINAL_BIN_SIZE_MS * PATCH_STRIDE
            output_durations_ms.append(output_duration_ms)

            # BUGFIX: span of RAW data actually seen by these output steps.
            # Because adjacent windows overlap, the raw span is
            # (output_duration - 1) * PATCH_STRIDE + PATCH_SIZE bins —
            # the old expression (output_duration * PATCH_STRIDE *
            # ORIGINAL_BIN_SIZE_MS) merely duplicated output_duration_ms.
            # This matches the example computation below
            # (original_end_bin - original_start_bin + 1).
            original_duration_ms = ((output_duration - 1) * PATCH_STRIDE + PATCH_SIZE) * ORIGINAL_BIN_SIZE_MS
            original_durations_ms.append(original_duration_ms)

    print(f"\n=== 时间步长度统计 ===")
    print(f"总segment数量: {len(durations)}")
    print(f"输出时间步长度:")
    print(f"  最小: {min(durations)} 步")
    print(f"  最大: {max(durations)} 步")
    print(f"  平均: {np.mean(durations):.1f} 步")
    print(f"  中位数: {np.median(durations):.1f} 步")

    print(f"\n对应的实际时长(毫秒):")
    print(f"  最小: {min(output_durations_ms)} ms")
    print(f"  最大: {max(output_durations_ms)} ms")
    print(f"  平均: {np.mean(output_durations_ms):.1f} ms")
    print(f"  中位数: {np.median(output_durations_ms):.1f} ms")

    # Length distribution (most common segment lengths).
    from collections import Counter
    duration_counts = Counter(durations)
    print(f"\n输出时间步长度分布 (前10个):")
    for length, count in duration_counts.most_common(10):
        actual_ms = length * ORIGINAL_BIN_SIZE_MS * PATCH_STRIDE
        print(f"{length:2d} 步 ({actual_ms:3d}ms): {count:4d} 个segment ({count/len(durations)*100:.1f}%)")

    # Worked example for a concrete segment.
    print(f"\n=== 示例分析 ===")
    example_segment = {'start_time': 206, 'end_time': 207}
    output_steps = example_segment['end_time'] - example_segment['start_time'] + 1
    actual_duration_ms = output_steps * ORIGINAL_BIN_SIZE_MS * PATCH_STRIDE
    original_start_bin = example_segment['start_time'] * PATCH_STRIDE
    original_end_bin = example_segment['end_time'] * PATCH_STRIDE + PATCH_SIZE - 1

    print(f"示例segment: start_time={example_segment['start_time']}, end_time={example_segment['end_time']}")
    print(f"输出时间步数: {output_steps}")
    print(f"实际时长: {actual_duration_ms} ms")
    print(f"对应原始数据范围: bin {original_start_bin} 到 bin {original_end_bin}")
    print(f"原始数据时间步数: {original_end_bin - original_start_bin + 1}")

    # Visualize the length distributions (4-panel figure).
    plt.figure(figsize=(12, 8))

    # Panel 1: histogram of output-step lengths.
    plt.subplot(2, 2, 1)
    plt.hist(durations, bins=50, alpha=0.7, edgecolor='black')
    plt.xlabel('输出时间步数')
    plt.ylabel('频次')
    plt.title('输出时间步长度分布')
    plt.grid(True, alpha=0.3)

    # Panel 2: histogram of actual durations in ms.
    plt.subplot(2, 2, 2)
    plt.hist(output_durations_ms, bins=50, alpha=0.7, color='orange', edgecolor='black')
    plt.xlabel('实际时长 (ms)')
    plt.ylabel('频次')
    plt.title('实际时长分布')
    plt.grid(True, alpha=0.3)

    # Panel 3: scatter of length vs frequency.
    plt.subplot(2, 2, 3)
    unique_durations = list(duration_counts.keys())
    counts = [duration_counts[d] for d in unique_durations]
    plt.scatter(unique_durations, counts, alpha=0.7)
    plt.xlabel('输出时间步数')
    plt.ylabel('出现次数')
    plt.title('时长频次散点图')
    plt.grid(True, alpha=0.3)

    # Panel 4: empirical CDF of segment lengths.
    plt.subplot(2, 2, 4)
    sorted_durations = sorted(durations)
    cumulative = np.arange(1, len(sorted_durations) + 1) / len(sorted_durations)
    plt.plot(sorted_durations, cumulative)
    plt.xlabel('输出时间步数')
    plt.ylabel('累积概率')
    plt.title('时长累积分布函数')
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

else:
    print("phoneme_data 未加载,无法进行详细分析")

print(f"\n=== 总结 ===")
print("1. RNN使用14时间步的滑动窗口,步长为4")
print("2. segment中的时间戳是输出时间步,不是原始时间步")
print("3. 每个输出时间步对应原始数据的4个时间步(80ms)")
print("4. 输出时间戳206-207表示2个输出步长,对应160ms实际时长")
print("5. 这2个输出步长实际上对应原始数据中更多的时间步(由于滑动窗口重叠)")
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 9,
|
|||
|
"id": "ea2c9908",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"找到 4 个PKL文件:\n",
|
|||
|
" ctc_results_20251008_235600.pkl (25.6 MB)\n",
|
|||
|
" ctc_results_20251009_202457.pkl (23.9 MB)\n",
|
|||
|
" phoneme_dataset_20251008_235600.pkl (21.1 MB)\n",
|
|||
|
" phoneme_dataset_20251009_202457.pkl (19.1 MB)\n",
|
|||
|
"\n",
|
|||
|
"分析最新的phoneme dataset: phoneme_dataset_20251009_202457.pkl\n",
|
|||
|
"=== 分析文件: phoneme_dataset_20251009_202457.pkl ===\n",
|
|||
|
"数据类型: <class 'dict'>\n",
|
|||
|
"字典键数量: 40\n",
|
|||
|
"字典键: ['B', 'R', 'IH', 'NG', ' | ', 'T', 'K', 'L', 'OW', 'S']...\n",
|
|||
|
"总segment数量: 214218\n",
|
|||
|
"\n",
|
|||
|
"音素分布 (前20个):\n",
|
|||
|
" ' | ': 50607 segments\n",
|
|||
|
" 'T': 13049 segments\n",
|
|||
|
" 'AH': 12663 segments\n",
|
|||
|
" 'IH': 10561 segments\n",
|
|||
|
" 'N': 9368 segments\n",
|
|||
|
" 'S': 7054 segments\n",
|
|||
|
" 'R': 6865 segments\n",
|
|||
|
" 'L': 6429 segments\n",
|
|||
|
" 'IY': 6278 segments\n",
|
|||
|
" 'D': 6260 segments\n",
|
|||
|
" 'AY': 5337 segments\n",
|
|||
|
" 'DH': 5259 segments\n",
|
|||
|
" 'K': 5178 segments\n",
|
|||
|
" 'M': 5125 segments\n",
|
|||
|
" 'UW': 4802 segments\n",
|
|||
|
" 'EH': 4589 segments\n",
|
|||
|
" 'AE': 4373 segments\n",
|
|||
|
" 'Z': 4348 segments\n",
|
|||
|
" 'W': 4100 segments\n",
|
|||
|
" 'AA': 3403 segments\n",
|
|||
|
"\n",
|
|||
|
"=== Segment结构分析 ===\n",
|
|||
|
"示例segment结构:\n",
|
|||
|
" phoneme: B (<class 'str'>)\n",
|
|||
|
" start_time: 17 (<class 'int'>)\n",
|
|||
|
" end_time: 18 (<class 'int'>)\n",
|
|||
|
" confidence: 0.8151801824569702 (<class 'numpy.float64'>)\n",
|
|||
|
" session: t15.2023.08.11 (<class 'str'>)\n",
|
|||
|
" block_num: 2 (<class 'numpy.int64'>)\n",
|
|||
|
" trial_num: 0 (<class 'numpy.int64'>)\n",
|
|||
|
" corpus: 50-Word (<class 'str'>)\n",
|
|||
|
"\n",
|
|||
|
"字段统计:\n",
|
|||
|
" 不同session数: 1\n",
|
|||
|
" 不同corpus数: 1\n",
|
|||
|
" 平均持续时间: 1.7 time steps\n",
|
|||
|
" 平均置信度: 0.976\n",
|
|||
|
"\n",
|
|||
|
"=== Session分布分析 ===\n",
|
|||
|
"Session统计:\n",
|
|||
|
" t15.2023.08.11:\n",
|
|||
|
" Segments: 4771\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 34\n",
|
|||
|
" t15.2023.08.13:\n",
|
|||
|
" Segments: 8607\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2023.08.18:\n",
|
|||
|
" Segments: 5281\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2023.08.20:\n",
|
|||
|
" Segments: 7333\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2023.08.25:\n",
|
|||
|
" Segments: 2319\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2023.08.27:\n",
|
|||
|
" Segments: 4064\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2023.09.01:\n",
|
|||
|
" Segments: 7875\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2023.09.03:\n",
|
|||
|
" Segments: 8962\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2023.09.24:\n",
|
|||
|
" Segments: 6282\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2023.09.29:\n",
|
|||
|
" Segments: 4035\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2023.10.01:\n",
|
|||
|
" Segments: 5869\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2023.10.06:\n",
|
|||
|
" Segments: 4510\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2023.10.08:\n",
|
|||
|
" Segments: 7583\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2023.10.13:\n",
|
|||
|
" Segments: 3903\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2023.10.15:\n",
|
|||
|
" Segments: 6246\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2023.10.20:\n",
|
|||
|
" Segments: 2584\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2023.10.22:\n",
|
|||
|
" Segments: 4172\n",
|
|||
|
" Trials: 49\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2023.11.03:\n",
|
|||
|
" Segments: 3734\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2023.11.04:\n",
|
|||
|
" Segments: 1732\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2023.11.17:\n",
|
|||
|
" Segments: 2443\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2023.11.19:\n",
|
|||
|
" Segments: 1358\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2023.11.26:\n",
|
|||
|
" Segments: 5678\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2023.12.03:\n",
|
|||
|
" Segments: 6631\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2023.12.08:\n",
|
|||
|
" Segments: 6024\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2023.12.10:\n",
|
|||
|
" Segments: 3732\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2023.12.17:\n",
|
|||
|
" Segments: 3976\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2023.12.29:\n",
|
|||
|
" Segments: 5764\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2024.02.25:\n",
|
|||
|
" Segments: 5424\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2024.03.03:\n",
|
|||
|
" Segments: 4121\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2024.03.08:\n",
|
|||
|
" Segments: 5090\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2024.03.15:\n",
|
|||
|
" Segments: 7502\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2024.03.17:\n",
|
|||
|
" Segments: 7940\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2024.04.25:\n",
|
|||
|
" Segments: 6605\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 39\n",
|
|||
|
" t15.2024.04.28:\n",
|
|||
|
" Segments: 2575\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 34\n",
|
|||
|
" t15.2024.05.10:\n",
|
|||
|
" Segments: 3012\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2024.06.14:\n",
|
|||
|
" Segments: 2524\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2024.07.19:\n",
|
|||
|
" Segments: 5249\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2024.07.21:\n",
|
|||
|
" Segments: 4687\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2024.07.28:\n",
|
|||
|
" Segments: 4603\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2025.01.10:\n",
|
|||
|
" Segments: 3078\n",
|
|||
|
" Trials: 49\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2025.01.12:\n",
|
|||
|
" Segments: 4819\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2025.03.14:\n",
|
|||
|
" Segments: 1811\n",
|
|||
|
" Trials: 25\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2025.03.16:\n",
|
|||
|
" Segments: 2812\n",
|
|||
|
" Trials: 49\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2025.03.30:\n",
|
|||
|
" Segments: 4856\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" t15.2025.04.13:\n",
|
|||
|
" Segments: 2042\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABKUAAAJOCAYAAABm7rQwAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjEsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvc2/+5QAAAAlwSFlzAAAPYQAAD2EBqD+naQAAhF9JREFUeJzs3XdcVuX/x/H3DShDBURNxUxRKcVZLty40hy4sjRMrXBkVlZiajkyc+XKHLmSMr9qmn61TNFcmJGFAweOUsmtuIBAAeH8/vDr/ZPARNRzA76ej8f9eHSu65zD50MC+uY6120xDMMQAAAAAAAAYCI7WxcAAAAAAACARw+hFAAAAAAAAExHKAUAAAAAAADTEUoBAAAAAADAdIRSAAAAAAAAMB2hFAAAAAAAAExHKAUAAAAAAADTEUoBAAAAAADAdIRSAAAAAAAAMB2hFAAAyNHat2+vvn373tc9Vq9erd9+++2er7t27ZouX75sPd67d6++/fbbe77P+fPnVbVqVa1Zs+aer82sEydOKC4uLt14QkKCXn75ZYWEhKSb+/DDDzV06FAZhvHQ6vqnpKQkDR8+XH/88ce/nhcXF6e//vrLpKoAAMDDQCgFAAByhJSUFF2+fFnXrl3TjRs3rK+SJUvqv//9b5qxGzduKD4+XgkJCenuERERoUOHDunPP/+0vj755BN99dVXacYOHTqkiIiIf61p3rx58vb21sWLFyVJoaGheu2113TlypV76m3lypXav3+/ypUrd2+flEyKiYnRc889p++//z7dnIuLi65fv66ePXvq6tWr1vG4uDhNnz5dhQsXlsVieSh1ZSQ4OFgff/zxXQO677//Xs8995xiYmJMqgwAADxoFsPMX30BAABk0aFDh1ShQgXZ2dkpT548/3quYRhKTk7W8OHDNXLkSOv41atXVaRIETk7Oytv3rzW8djYWDk4OMjFxcU6divYunbtmhwcHNJ9jOTkZJUrV04NGjTQN998I0m6fv26nnrqKTVv3lzz5s3L8JpDhw7J2dk5zT39/f1VpEgRzZ8/P901qampSkxMVMGCBVWsWLF/7ftOOnfurGLFiunzzz/XjRs3dPr0aTk6OsrR0VEWi0UnT55UcHCw3nnnHeXPn1+S9Pnnnys4OFg7duyQg4ODUlNTlZCQoEKFCsnZ2Vl//PGHXn/9df3yyy+ys7NT+/bt9cUXX1ivl6Tp06drwoQJunbtmnr37q2PP/5YdnZ3/p3ohQsXVKlSJTk7OyspKUm7du1S8eLF73j+m2++qfPnz2dpdRoAALA9QikAAJAjGIahpKQkOTo6av369dq2bZs+/vjjNOds2rRJoaGheuedd+Tq6qqUlJR0gdKtlVS3AhlJatasmerXr28NsFJSUpSQkCAXFxfZ29tnWM9nn32mQYMG6eDBgypTpox1fPny5ercubNWrlyp9u3bp7nm1KlTKlmyZJb6HzFiRJqALbPWr1+vN954Q/v375ejo6OioqLk5eWVpRokacOGDapbt66qVaumPHny6K233tKlS5c0atQodevWzRrGLViwQK+99ppGjBihGjVq6O2331avXr30/vvvZ3jf1NRUtWrVSlFRUdqxY4datmwpi8WidevWydXVNcNrEhMTVbFiRc2ePVtNmzbNck8AAMA2CKUAAECO8/PPP6thw4Zas2aNnnvuOet4586dFRUVpV9//fWOYdL69evVvn17OTk5WVft/HOlVEpKiq5fv67w8HBVrFgx3T2OHj2qKlWqqH///ho/fny6+ZdeekmrV6/W2rVr1aBBA+u4YRhKSEiQo6Oj7O3tlZSUpKpVq6pevXqaP3++lixZorx586pjx4739fm5XfPmzfXaa6+pS5cukm6GcpcuXZK7u7vy5s2b6Ufzrl+/rqtXr8rNzU0LFy7U6NGjtXfvXrm7u0uShgwZolmzZunq1asyDEMlS5aUv7+/Zs6cKelmYNixY0ddvHgxw5
Vn/fr108KFCxUaGqqnn35aJ06ckK+vr4oVK6a1a9eqaNGiGda1ZMkSffXVV1q7dm0WPjsAAMCWCKUAAECOEBkZKUnWx+5efvll1atXz7rJ+eXLl1WvXj3NmTNHDRo0UGpqqpKSkuTq6qonnnhC165d09WrV+Xo6JjuEbL27dvL19dXgwcPTjN+69G5YsWKWcObxMRENW7cWLGxsQoPD5eTk1O6Wq9du6Z27dopNDRU06ZNU+/evTPsaeTIkZo1a5YiIyNVqFAhvfjii8qfP3+Gj/FlxeXLl1WmTBmdPXtWzs7OaeYWLlyo7du3p7vG399frVq1SjPWu3dvFSxY0BrARUdHKyYmJs0eWBMnTtRHH32kuLg4HTx4UD4+Pvr5559Vr149STcDOXd3d61bt0516tSxXpeQkKDAwEB99913WrNmjSpVqqTBgwdrwIABcnFxkZ+fn1JSUvTZZ59Zg7XbJSQkqESJEjp27JgKFiyY9U8WAAAwXfpfUwEAAGRDAwYMUHR0dJrVPT///LN+/vln6znVq1fX7NmzNXv2bGug1K5dO40aNUqbNm1Shw4drKuUbomNjZXFYtGePXv0xRdfpPmYqampun79ui5cuCB3d3elpKSoR48e2r9/v+bNm6ejR4/ecUXWxIkTNXz4cPXp00dLly7V8uXL04Qm69at08cff6w5c+bo2rVrOnXqlJKSkpSYmKhTp06luVfx4sXv+HH+TWRkpHWPpn/avn27duzYoTfeeMM6Nnr0aJUuXVqtWrVSoUKFNHHiRL3yyiuKjY1N8/GLFCmiIkWKpLnf2rVrVbduXUnS6dOnJUlVqlSxzlssFnl5eemPP/6whlLff/+93n77bV29elU//vijmjZtqjNnzuirr75SQECAmjdvrrCwML344ovq2rWrpk2bptdff10vvPCCHB0dJd3cqL1ixYo6ePCg9eMDAICcgVAKAADkCOvXr7+v61u3bq2kpCRJNx/Pi4yM1OjRo/Xnn39qxowZeuaZZxQVFaVx48bp2rVrmjNnjgoUKJDmHl999ZVWrFihVatW6c0339TRo0f/9WP++uuv8vPz05EjR9IEUjt27FCXLl2UmpqqwMDAdNctWrQozXFUVJRKlSp1zz2fPXv2jhuFOzs7q1SpUmk+/hdffGFdiebk5GRdBebg4PCvodjWrVu1adMm6yN0165dk729fbrPX/78+RUdHS1J6t69uxYuXKgaNWooJCRE3t7ekmR9hPLWI36lSpXStm3b9Mknn2jixIn64IMP1LZtW2soJd0M7c6cOZP5TwwAAMgW7vz2JwAAANnQ66+/LovFctfX7Suo/qlnz56qUqWKvv32Wx07dkzNmzeXo6OjKlWqJEn64IMPdPHixXTXvfrqqzpw4ICee+457du3Tzdu3NCQIUNUvnx5GYZhfYWGhkqSSpYsqQEDBlj3VZJubhTevHlzNW/eXAULFtR3332n5ORkJScn66WXXlLv3r2tx/v375ekNAHMvbCzs1NqamqGc/b29tq8ebOqVatmfUVGRlofbbz1ebybhIQE9erVSy1btlTLli2t9WYUYlksFl27dk2SNG7cOE2bNk2//vqrNZCS/j+Muv1j58mTRyNHjtTRo0e1fPly6z5Wt6SkpGRpJRkAALAtQikAAJCjODs7q1GjRrpy5UqGr19++UWSMtzr6ZZx48bpjz/+0PHjxzV8+HAVKVJE/fv319GjR/Xll1/q/PnzqlKlijZv3pzu2lsBirOzs+zt7bV//35Vrlw5zTmnT5+Wg4ODihUrlmZ89erVatmypZo2bapFixbJzs5OdnZ2cnBwkIODgzUIunV8K2jJaGPwzLjbCqJnnnlGwcHB1ldW3pXvnXfe0eXLl63vuidJjz32mJKSknThwoU05166dEn58uWTJHl6eurNN9+8pzCpaNGiqlWrVrrxf1sRBgAAsi8e3wMAADnKrRDnn6tlbrn1yNidgpylS5dqy5YtCg8P165du5
Samqrq1atr165datmypTXckqQ2bdooJCRE9evXz/BeMTEx2rx5syZMmJBm/PTp0/L09Ey3oXrr1q01d+5cde/ePct
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1200x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"\n",
|
|||
|
"分析最新的CTC results: ctc_results_20251009_202457.pkl\n",
|
|||
|
"=== 分析文件: ctc_results_20251009_202457.pkl ===\n",
|
|||
|
"数据类型: <class 'list'>\n",
|
|||
|
"列表长度: 4\n",
|
|||
|
"第一个元素类型: <class 'dict'>\n",
|
|||
|
"第一个元素的键: ['session', 'input_layer', 'trial_idx', 'block_num', 'trial_num', 'corpus', 'original_sequence', 'sentence_label', 'n_time_steps', 'predicted_phonemes', 'ctc_score', 'alignment_info']\n",
|
|||
|
"示例CTC结果结构:\n",
|
|||
|
" session: t15.2023.08.11 (<class 'str'>)\n",
|
|||
|
" input_layer: 0 (<class 'int'>)\n",
|
|||
|
" trial_idx: 0 (<class 'int'>)\n",
|
|||
|
" block_num: 2 (<class 'numpy.int64'>)\n",
|
|||
|
" trial_num: 0 (<class 'numpy.int64'>)\n",
|
|||
|
" corpus: 50-Word (<class 'str'>)\n",
|
|||
|
" original_sequence: <class 'numpy.ndarray'> (500,)\n",
|
|||
|
" sentence_label: Bring it closer. (<class 'str'>)\n",
|
|||
|
" n_time_steps: 321 (<class 'numpy.int64'>)\n",
|
|||
|
" predicted_phonemes: ['B', 'R', 'IH', 'NG', ' | ', 'IH', 'T', ' | ', 'K', 'L', 'OW', 'S', 'ER', ' | '] (<class 'list'>)\n",
|
|||
|
" ctc_score: -409.8164916324466 (<class 'float'>)\n",
|
|||
|
" alignment_info: [('B', 17, 18, np.float64(0.8151801824569702)), ('R', 19, 20, np.float64(0.9981182217597961)), ('IH', 21, 21, np.float64(0.9999998807907104)), ('NG', 22, 23, np.float64(0.9939739406108856)), (' | ', 24, 26, np.float64(0.9843189120292664)), ('IH', 42, 42, np.float64(1.0)), ('T', 43, 43, np.float64(0.9999998807907104)), (' | ', 44, 47, np.float64(0.9870144575834274)), ('K', 61, 62, np.float64(0.9999440908432007)), ('L', 63, 64, np.float64(0.9994930922985077)), ('OW', 65, 65, np.float64(0.9999935626983643)), ('S', 66, 66, np.float64(0.9999934434890747)), ('ER', 69, 70, np.float64(0.9999157190322876)), (' | ', 71, 74, np.float64(0.9988113343715668))] (<class 'list'>)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"#!/usr/bin/env python3\n",
|
|||
|
"\"\"\"\n",
|
|||
|
"PKL文件读取和内容展示工具\n",
|
|||
|
"用于查看phoneme_segmented_data目录中的数据结构\n",
|
|||
|
"\"\"\"\n",
|
|||
|
"\n",
|
|||
|
"import pickle\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"from pathlib import Path\n",
|
|||
|
"from collections import defaultdict\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import pandas as pd\n",
|
|||
|
"\n",
|
|||
|
def load_and_analyze_pkl(pkl_path):
    """Load a pickle file and print a structural summary.

    Returns (data, phoneme_counts) when the payload is a dict-of-lists
    phoneme dataset, otherwise (data, None).
    """
    print(f"=== 分析文件: {pkl_path.name} ===")

    with open(pkl_path, 'rb') as fh:
        loaded = pickle.load(fh)

    print(f"数据类型: {type(loaded)}")

    if isinstance(loaded, dict):
        print(f"字典键数量: {len(loaded)}")
        print(f"字典键: {list(loaded.keys())[:10]}...")  # only show the first 10 keys

        # A dict whose values are all lists is a phoneme dataset.
        if all(isinstance(entries, list) for entries in loaded.values()):
            counts = {}
            n_total = 0
            for key, entries in loaded.items():
                counts[key] = len(entries)
                n_total += len(entries)
            print(f"总segment数量: {n_total}")

            # Rank phonemes by segment count, descending.
            ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)

            print(f"\n音素分布 (前20个):")
            for name, n in ranked[:20]:
                print(f"  '{name}': {n:4d} segments")

            return loaded, counts

    elif isinstance(loaded, list):
        print(f"列表长度: {len(loaded)}")
        if loaded:
            print(f"第一个元素类型: {type(loaded[0])}")
            if isinstance(loaded[0], dict):
                print(f"第一个元素的键: {list(loaded[0].keys())}")
        return loaded, None

    return loaded, None
|
|||
|
"\n",
|
|||
|
def analyze_segment_structure(phoneme_data, sample_count=5):
    """Print the field layout and simple stats for a few sample segments."""
    print(f"\n=== Segment结构分析 ===")

    # Collect up to `sample_count` segments from each of the first 3 phonemes.
    sample_segments = []
    for _, segs in list(phoneme_data.items())[:3]:
        sample_segments += segs[:sample_count]

    if not sample_segments:
        return

    print(f"示例segment结构:")
    first = sample_segments[0]
    for key, value in first.items():
        print(f"  {key}: {value} ({type(value)})")

    # Field-level statistics over the sampled segments.
    seen_sessions = set()
    seen_corpora = set()
    span_lengths = []
    conf_values = []

    for seg in sample_segments:
        seen_sessions.add(seg.get('session', 'unknown'))
        seen_corpora.add(seg.get('corpus', 'unknown'))

        t0 = seg.get('start_time', 0)
        t1 = seg.get('end_time', 0)
        if t1 >= t0:
            span_lengths.append(t1 - t0 + 1)  # inclusive bounds

        c = seg.get('confidence', 0)
        if c > 0:
            conf_values.append(c)

    print(f"\n字段统计:")
    print(f"  不同session数: {len(seen_sessions)}")
    print(f"  不同corpus数: {len(seen_corpora)}")
    print(f"  平均持续时间: {np.mean(span_lengths):.1f} time steps")
    print(f"  平均置信度: {np.mean(conf_values):.3f}")
|
|||
|
"\n",
|
|||
|
def analyze_session_distribution(phoneme_data):
    """Aggregate and print per-session segment / trial / phoneme counts."""
    print(f"\n=== Session分布分析 ===")

    # session -> {'segments': int, 'trials': set, 'phonemes': set}
    stats_by_session = {}
    for phoneme, segments in phoneme_data.items():
        for seg in segments:
            key = seg.get('session', 'unknown')
            entry = stats_by_session.setdefault(
                key, {'segments': 0, 'trials': set(), 'phonemes': set()})
            entry['segments'] += 1
            entry['trials'].add(seg.get('trial_num', -1))
            entry['phonemes'].add(phoneme)

    print(f"Session统计:")
    for session in sorted(stats_by_session):
        entry = stats_by_session[session]
        print(f"  {session}:")
        print(f"    Segments: {entry['segments']}")
        print(f"    Trials: {len(entry['trials'])}")
        print(f"    Phonemes: {len(entry['phonemes'])}")
|
|||
|
"\n",
|
|||
|
def plot_phoneme_distribution(phoneme_counts):
    """Draw a bar chart of the 20 most frequent phonemes.

    Parameters
    ----------
    phoneme_counts : dict
        Mapping phoneme -> number of segments.
    """
    # Guard against an empty mapping: zip(*[]) would raise otherwise.
    if not phoneme_counts:
        return

    # Sort descending by count and keep the top 20.
    sorted_phonemes = sorted(phoneme_counts.items(), key=lambda x: x[1], reverse=True)[:20]
    phonemes, counts = zip(*sorted_phonemes)

    plt.figure(figsize=(12, 6))
    bars = plt.bar(range(len(phonemes)), counts)
    plt.xlabel('音素')
    plt.ylabel('Segment数量')
    plt.title('音素分布 (前20个)')
    plt.xticks(range(len(phonemes)), phonemes, rotation=45)

    # Annotate each bar with its count (the enumerate() index was unused).
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + height*0.01,
                 f'{int(height)}', ha='center', va='bottom', fontsize=8)

    plt.tight_layout()
    plt.show()
|
|||
|
"\n",
|
|||
|
"# 主要调用函数\n",
|
|||
|
def analyze_pkl_files(data_dir=None):
    """Analyze all PKL files in a directory of phoneme-segmented data.

    Parameters
    ----------
    data_dir : str | Path | None
        Directory to scan for ``*.pkl`` files. ``None`` keeps the original
        hard-coded ``../phoneme_segmented_data`` location, so existing
        callers are unaffected.
    """
    data_dir = Path(data_dir) if data_dir is not None else Path("../phoneme_segmented_data")

    if not data_dir.exists():
        print(f"目录不存在: {data_dir}")
        return

    # Sorted for a deterministic listing (glob order is filesystem-dependent).
    pkl_files = sorted(data_dir.glob("*.pkl"))
    print(f"找到 {len(pkl_files)} 个PKL文件:")

    for pkl_file in pkl_files:
        size_mb = pkl_file.stat().st_size / 1024 / 1024
        print(f"  {pkl_file.name} ({size_mb:.1f} MB)")

    # Analyze the most recent phoneme dataset (by modification time).
    phoneme_files = [f for f in pkl_files if f.name.startswith("phoneme_dataset_")]
    if phoneme_files:
        latest_phoneme_file = max(phoneme_files, key=lambda x: x.stat().st_mtime)
        print(f"\n分析最新的phoneme dataset: {latest_phoneme_file.name}")

        phoneme_data, phoneme_counts = load_and_analyze_pkl(latest_phoneme_file)

        if phoneme_data and phoneme_counts:
            analyze_segment_structure(phoneme_data)
            analyze_session_distribution(phoneme_data)
            plot_phoneme_distribution(phoneme_counts)

    # Analyze the most recent CTC results (by modification time).
    ctc_files = [f for f in pkl_files if f.name.startswith("ctc_results_")]
    if ctc_files:
        latest_ctc_file = max(ctc_files, key=lambda x: x.stat().st_mtime)
        print(f"\n分析最新的CTC results: {latest_ctc_file.name}")

        ctc_data, _ = load_and_analyze_pkl(latest_ctc_file)

        if isinstance(ctc_data, list) and len(ctc_data) > 0:
            print(f"示例CTC结果结构:")
            sample = ctc_data[0]
            for key, value in sample.items():
                if isinstance(value, np.ndarray):
                    print(f"  {key}: {type(value)} {value.shape}")
                else:
                    print(f"  {key}: {value} ({type(value)})")
|
|||
|
"\n",
|
|||
|
# Run the full analysis over the default data directory
analyze_pkl_files()
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 10,
|
|||
|
"id": "hw37welyuqr",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"数据目录: f:\\BRAIN-TO-TEXT\\nejm-brain-to-text.worktrees\\dev2\\data_analyse\\..\\phoneme_segmented_data\n",
|
|||
|
"目录是否存在: True\n",
|
|||
|
"\n",
|
|||
|
"=== 发现的PKL文件 ===\n",
|
|||
|
"ctc_results_20251008_235600.pkl: 25.6 MB\n",
|
|||
|
"ctc_results_20251009_202457.pkl: 23.9 MB\n",
|
|||
|
"phoneme_dataset_20251008_235600.pkl: 21.1 MB\n",
|
|||
|
"phoneme_dataset_20251009_202457.pkl: 19.1 MB\n",
|
|||
|
"\n",
|
|||
|
"=== 加载最新的Phoneme Dataset ===\n",
|
|||
|
"文件: phoneme_dataset_20251009_202457.pkl\n",
|
|||
|
"数据类型: <class 'dict'>\n",
|
|||
|
"音素数量: 40\n",
|
|||
|
"总segment数量: 214218\n",
|
|||
|
"\n",
|
|||
|
"音素分布统计 (前15个):\n",
|
|||
|
" 1. ' | ': 50607 segments (23.6%)\n",
|
|||
|
" 2. 'T': 13049 segments (6.1%)\n",
|
|||
|
" 3. 'AH': 12663 segments (5.9%)\n",
|
|||
|
" 4. 'IH': 10561 segments (4.9%)\n",
|
|||
|
" 5. 'N': 9368 segments (4.4%)\n",
|
|||
|
" 6. 'S': 7054 segments (3.3%)\n",
|
|||
|
" 7. 'R': 6865 segments (3.2%)\n",
|
|||
|
" 8. 'L': 6429 segments (3.0%)\n",
|
|||
|
" 9. 'IY': 6278 segments (2.9%)\n",
|
|||
|
"10. 'D': 6260 segments (2.9%)\n",
|
|||
|
"11. 'AY': 5337 segments (2.5%)\n",
|
|||
|
"12. 'DH': 5259 segments (2.5%)\n",
|
|||
|
"13. 'K': 5178 segments (2.4%)\n",
|
|||
|
"14. 'M': 5125 segments (2.4%)\n",
|
|||
|
"15. 'UW': 4802 segments (2.2%)\n",
|
|||
|
"\n",
|
|||
|
"=== Segment结构分析 ===\n",
|
|||
|
"Segment字段数量: 8\n",
|
|||
|
"Segment字段: ['phoneme', 'start_time', 'end_time', 'confidence', 'session', 'block_num', 'trial_num', 'corpus']\n",
|
|||
|
"\n",
|
|||
|
"第一个segment详细信息:\n",
|
|||
|
" phoneme: B (<class 'str'>)\n",
|
|||
|
" start_time: 17 (<class 'int'>)\n",
|
|||
|
" end_time: 18 (<class 'int'>)\n",
|
|||
|
" confidence: 0.8151801824569702 (<class 'numpy.float64'>)\n",
|
|||
|
" session: t15.2023.08.11 (<class 'str'>)\n",
|
|||
|
" block_num: 2 (<class 'numpy.int64'>)\n",
|
|||
|
" trial_num: 0 (<class 'numpy.int64'>)\n",
|
|||
|
" corpus: 50-Word (<class 'str'>)\n",
|
|||
|
"\n",
|
|||
|
"=== 字段统计分析 ===\n",
|
|||
|
"start_time: 范围 [17, 195], 平均 56.3\n",
|
|||
|
"end_time: 范围 [18, 195], 平均 57.3\n",
|
|||
|
"confidence: 范围 [0.815, 1.000], 平均 0.975\n",
|
|||
|
"session: 1 个不同session - ['t15.2023.08.11']\n",
|
|||
|
"trial_num: 6 个不同trial - 范围 0 到 7\n",
|
|||
|
"corpus: 1 个不同corpus - ['50-Word']\n",
|
|||
|
"\n",
|
|||
|
"=== Session分布分析 ===\n",
|
|||
|
"总session数: 45\n",
|
|||
|
"t15.2023.08.11:\n",
|
|||
|
" Segments: 4771\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 34\n",
|
|||
|
" Corpus: ['50-Word']\n",
|
|||
|
"t15.2023.08.13:\n",
|
|||
|
" Segments: 8607\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard', '50-Word']\n",
|
|||
|
"t15.2023.08.18:\n",
|
|||
|
" Segments: 5281\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard']\n",
|
|||
|
"t15.2023.08.20:\n",
|
|||
|
" Segments: 7333\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard']\n",
|
|||
|
"t15.2023.08.25:\n",
|
|||
|
" Segments: 2319\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard']\n",
|
|||
|
"t15.2023.08.27:\n",
|
|||
|
" Segments: 4064\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard']\n",
|
|||
|
"t15.2023.09.01:\n",
|
|||
|
" Segments: 7875\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard']\n",
|
|||
|
"t15.2023.09.03:\n",
|
|||
|
" Segments: 8962\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard']\n",
|
|||
|
"t15.2023.09.24:\n",
|
|||
|
" Segments: 6282\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard', '50-Word']\n",
|
|||
|
"t15.2023.09.29:\n",
|
|||
|
" Segments: 4035\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard']\n",
|
|||
|
"t15.2023.10.01:\n",
|
|||
|
" Segments: 5869\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard']\n",
|
|||
|
"t15.2023.10.06:\n",
|
|||
|
" Segments: 4510\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard', '50-Word']\n",
|
|||
|
"t15.2023.10.08:\n",
|
|||
|
" Segments: 7583\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard']\n",
|
|||
|
"t15.2023.10.13:\n",
|
|||
|
" Segments: 3903\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard']\n",
|
|||
|
"t15.2023.10.15:\n",
|
|||
|
" Segments: 6246\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard']\n",
|
|||
|
"t15.2023.10.20:\n",
|
|||
|
" Segments: 2584\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard']\n",
|
|||
|
"t15.2023.10.22:\n",
|
|||
|
" Segments: 4172\n",
|
|||
|
" Trials: 49\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Freq words', 'Switchboard']\n",
|
|||
|
"t15.2023.11.03:\n",
|
|||
|
" Segments: 3734\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Freq words']\n",
|
|||
|
"t15.2023.11.04:\n",
|
|||
|
" Segments: 1732\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Freq words']\n",
|
|||
|
"t15.2023.11.17:\n",
|
|||
|
" Segments: 2443\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Freq words']\n",
|
|||
|
"t15.2023.11.19:\n",
|
|||
|
" Segments: 1358\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Freq words']\n",
|
|||
|
"t15.2023.11.26:\n",
|
|||
|
" Segments: 5678\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Freq words']\n",
|
|||
|
"t15.2023.12.03:\n",
|
|||
|
" Segments: 6631\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Freq words']\n",
|
|||
|
"t15.2023.12.08:\n",
|
|||
|
" Segments: 6024\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Freq words']\n",
|
|||
|
"t15.2023.12.10:\n",
|
|||
|
" Segments: 3732\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Freq words']\n",
|
|||
|
"t15.2023.12.17:\n",
|
|||
|
" Segments: 3976\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Freq words']\n",
|
|||
|
"t15.2023.12.29:\n",
|
|||
|
" Segments: 5764\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Freq words']\n",
|
|||
|
"t15.2024.02.25:\n",
|
|||
|
" Segments: 5424\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard']\n",
|
|||
|
"t15.2024.03.03:\n",
|
|||
|
" Segments: 4121\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard', '50-Word']\n",
|
|||
|
"t15.2024.03.08:\n",
|
|||
|
" Segments: 5090\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard']\n",
|
|||
|
"t15.2024.03.15:\n",
|
|||
|
" Segments: 7502\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard']\n",
|
|||
|
"t15.2024.03.17:\n",
|
|||
|
" Segments: 7940\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard']\n",
|
|||
|
"t15.2024.04.25:\n",
|
|||
|
" Segments: 6605\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 39\n",
|
|||
|
" Corpus: ['Switchboard', '50-Word']\n",
|
|||
|
"t15.2024.04.28:\n",
|
|||
|
" Segments: 2575\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 34\n",
|
|||
|
" Corpus: ['50-Word']\n",
|
|||
|
"t15.2024.05.10:\n",
|
|||
|
" Segments: 3012\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard']\n",
|
|||
|
"t15.2024.06.14:\n",
|
|||
|
" Segments: 2524\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard']\n",
|
|||
|
"t15.2024.07.19:\n",
|
|||
|
" Segments: 5249\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard']\n",
|
|||
|
"t15.2024.07.21:\n",
|
|||
|
" Segments: 4687\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard']\n",
|
|||
|
"t15.2024.07.28:\n",
|
|||
|
" Segments: 4603\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard']\n",
|
|||
|
"t15.2025.01.10:\n",
|
|||
|
" Segments: 3078\n",
|
|||
|
" Trials: 49\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard']\n",
|
|||
|
"t15.2025.01.12:\n",
|
|||
|
" Segments: 4819\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard']\n",
|
|||
|
"t15.2025.03.14:\n",
|
|||
|
" Segments: 1811\n",
|
|||
|
" Trials: 25\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard']\n",
|
|||
|
"t15.2025.03.16:\n",
|
|||
|
" Segments: 2812\n",
|
|||
|
" Trials: 49\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard']\n",
|
|||
|
"t15.2025.03.30:\n",
|
|||
|
" Segments: 4856\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard']\n",
|
|||
|
"t15.2025.04.13:\n",
|
|||
|
" Segments: 2042\n",
|
|||
|
" Trials: 50\n",
|
|||
|
" Phonemes: 40\n",
|
|||
|
" Corpus: ['Switchboard']\n",
|
|||
|
"\n",
|
|||
|
"=== 加载最新的CTC Results ===\n",
|
|||
|
"文件: ctc_results_20251009_202457.pkl\n",
|
|||
|
"数据类型: <class 'list'>\n",
|
|||
|
"条目数量: 4\n",
|
|||
|
"\n",
|
|||
|
"第一个CTC结果结构:\n",
|
|||
|
" session: t15.2023.08.11 (<class 'str'>)\n",
|
|||
|
" input_layer: 0 (<class 'int'>)\n",
|
|||
|
" trial_idx: 0 (<class 'int'>)\n",
|
|||
|
" block_num: 2 (<class 'numpy.int64'>)\n",
|
|||
|
" trial_num: 0 (<class 'numpy.int64'>)\n",
|
|||
|
" corpus: 50-Word (<class 'str'>)\n",
|
|||
|
" original_sequence: <class 'numpy.ndarray'> shape=(500,) dtype=int32\n",
|
|||
|
" sentence_label: Bring it closer. (<class 'str'>)\n",
|
|||
|
" n_time_steps: 321 (<class 'numpy.int64'>)\n",
|
|||
|
" predicted_phonemes: <class 'list'> length=14\n",
|
|||
|
" 第一个元素: B (<class 'str'>)\n",
|
|||
|
" ctc_score: -409.8164916324466 (<class 'float'>)\n",
|
|||
|
" alignment_info: <class 'list'> length=14\n",
|
|||
|
" 第一个元素: ('B', 17, 18, np.float64(0.8151801824569702)) (<class 'tuple'>)\n",
|
|||
|
"\n",
|
|||
|
"前5个CTC结果的共同字段: ['original_sequence', 'trial_idx', 'sentence_label', 'block_num', 'predicted_phonemes', 'corpus', 'input_layer', 'n_time_steps', 'alignment_info', 'session', 'trial_num', 'ctc_score']\n",
|
|||
|
"\n",
|
|||
|
"数组字段形状分布:\n",
|
|||
|
" original_sequence: 1 种不同形状\n",
|
|||
|
" (500,): 4 次\n",
|
|||
|
"\n",
|
|||
|
"=== 变量总结 ===\n",
|
|||
|
"以下变量已创建,可供后续使用:\n",
|
|||
|
"- pkl_files_info: 所有pkl文件信息列表\n",
|
|||
|
"- phoneme_data: 音素数据字典\n",
|
|||
|
"- phoneme_counts: 各音素的segment数量统计\n",
|
|||
|
"- sorted_phonemes: 按数量排序的音素列表\n",
|
|||
|
"- sample_segments: 示例segment列表\n",
|
|||
|
"- session_summary: session统计信息字典\n",
|
|||
|
"- ctc_data: CTC结果数据列表\n",
|
|||
|
"\n",
|
|||
|
"可以直接使用这些变量进行进一步分析!\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# =============================================================================\n",
|
|||
|
"# 数据结构展示代码 - 保留所有变量供后续调用\n",
|
|||
|
"# =============================================================================\n",
|
|||
|
"\n",
|
|||
|
"import pickle\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"from pathlib import Path\n",
|
|||
|
"import pandas as pd\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"from collections import defaultdict, Counter\n",
|
|||
|
"\n",
|
|||
|
"# 设置数据目录\n",
|
|||
|
"data_dir = Path(\"../phoneme_segmented_data\")\n",
|
|||
|
"print(f\"数据目录: {data_dir.absolute()}\")\n",
|
|||
|
"print(f\"目录是否存在: {data_dir.exists()}\")\n",
|
|||
|
"\n",
|
|||
|
"# 获取所有pkl文件信息\n",
|
|||
|
"pkl_files = list(data_dir.glob(\"*.pkl\")) if data_dir.exists() else []\n",
|
|||
|
"pkl_files_info = []\n",
|
|||
|
"\n",
|
|||
|
"print(f\"\\n=== 发现的PKL文件 ===\")\n",
|
|||
|
"for pkl_file in pkl_files:\n",
|
|||
|
" size_mb = pkl_file.stat().st_size / 1024 / 1024\n",
|
|||
|
" mtime = pkl_file.stat().st_mtime\n",
|
|||
|
" pkl_files_info.append({\n",
|
|||
|
" 'path': pkl_file,\n",
|
|||
|
" 'name': pkl_file.name,\n",
|
|||
|
" 'size_mb': size_mb,\n",
|
|||
|
" 'mtime': mtime\n",
|
|||
|
" })\n",
|
|||
|
" print(f\"{pkl_file.name}: {size_mb:.1f} MB\")\n",
|
|||
|
"\n",
|
|||
|
"# 按修改时间排序,获取最新文件\n",
|
|||
|
"pkl_files_info.sort(key=lambda x: x['mtime'], reverse=True)\n",
|
|||
|
"\n",
|
|||
|
"# =============================================================================\n",
|
|||
|
"# 加载和分析 Phoneme Dataset\n",
|
|||
|
"# =============================================================================\n",
|
|||
|
"\n",
|
|||
|
"phoneme_files = [f for f in pkl_files_info if f['name'].startswith(\"phoneme_dataset_\")]\n",
|
|||
|
"if phoneme_files:\n",
|
|||
|
" latest_phoneme_file = phoneme_files[0]['path']\n",
|
|||
|
" print(f\"\\n=== 加载最新的Phoneme Dataset ===\")\n",
|
|||
|
" print(f\"文件: {latest_phoneme_file.name}\")\n",
|
|||
|
" \n",
|
|||
|
" # 直接加载数据,不使用函数封装\n",
|
|||
|
" with open(latest_phoneme_file, 'rb') as f:\n",
|
|||
|
" phoneme_data = pickle.load(f)\n",
|
|||
|
" \n",
|
|||
|
" # 基本信息\n",
|
|||
|
" phoneme_data_type = type(phoneme_data)\n",
|
|||
|
" phoneme_keys = list(phoneme_data.keys()) if isinstance(phoneme_data, dict) else None\n",
|
|||
|
" phoneme_keys_count = len(phoneme_keys) if phoneme_keys else 0\n",
|
|||
|
" \n",
|
|||
|
" print(f\"数据类型: {phoneme_data_type}\")\n",
|
|||
|
" print(f\"音素数量: {phoneme_keys_count}\")\n",
|
|||
|
" \n",
|
|||
|
" if isinstance(phoneme_data, dict):\n",
|
|||
|
" # 统计每个音素的segment数量\n",
|
|||
|
" phoneme_counts = {k: len(v) for k, v in phoneme_data.items()}\n",
|
|||
|
" total_segments = sum(phoneme_counts.values())\n",
|
|||
|
" \n",
|
|||
|
" print(f\"总segment数量: {total_segments}\")\n",
|
|||
|
" \n",
|
|||
|
" # 按数量排序的音素列表\n",
|
|||
|
" sorted_phonemes = sorted(phoneme_counts.items(), key=lambda x: x[1], reverse=True)\n",
|
|||
|
" \n",
|
|||
|
" print(f\"\\n音素分布统计 (前15个):\")\n",
|
|||
|
" for i, (phoneme, count) in enumerate(sorted_phonemes[:15]):\n",
|
|||
|
" print(f\"{i+1:2d}. '{phoneme}': {count:4d} segments ({count/total_segments*100:.1f}%)\")\n",
|
|||
|
" \n",
|
|||
|
" # 获取示例segments进行结构分析\n",
|
|||
|
" sample_segments = []\n",
|
|||
|
" sample_phonemes = []\n",
|
|||
|
" \n",
|
|||
|
" for phoneme, segments in list(phoneme_data.items())[:5]: # 取前5个音素\n",
|
|||
|
" for segment in segments[:3]: # 每个音素取前3个segment\n",
|
|||
|
" sample_segments.append(segment)\n",
|
|||
|
" sample_phonemes.append(phoneme)\n",
|
|||
|
" \n",
|
|||
|
" # 分析segment结构\n",
|
|||
|
" if sample_segments:\n",
|
|||
|
" first_segment = sample_segments[0]\n",
|
|||
|
" segment_keys = list(first_segment.keys())\n",
|
|||
|
" \n",
|
|||
|
" print(f\"\\n=== Segment结构分析 ===\")\n",
|
|||
|
" print(f\"Segment字段数量: {len(segment_keys)}\")\n",
|
|||
|
" print(f\"Segment字段: {segment_keys}\")\n",
|
|||
|
" \n",
|
|||
|
" print(f\"\\n第一个segment详细信息:\")\n",
|
|||
|
" for key, value in first_segment.items():\n",
|
|||
|
" value_type = type(value)\n",
|
|||
|
" if isinstance(value, np.ndarray):\n",
|
|||
|
" print(f\" {key}: {value_type} shape={value.shape} dtype={value.dtype}\")\n",
|
|||
|
" if value.size < 10: # 小数组显示具体值\n",
|
|||
|
" print(f\" 值: {value}\")\n",
|
|||
|
" else:\n",
|
|||
|
" print(f\" 范围: [{value.min():.3f}, {value.max():.3f}]\")\n",
|
|||
|
" else:\n",
|
|||
|
" print(f\" {key}: {value} ({value_type})\")\n",
|
|||
|
" \n",
|
|||
|
" # 统计字段分布\n",
|
|||
|
" field_stats = defaultdict(list)\n",
|
|||
|
" for segment in sample_segments:\n",
|
|||
|
" for key, value in segment.items():\n",
|
|||
|
" field_stats[key].append(value)\n",
|
|||
|
" \n",
|
|||
|
" print(f\"\\n=== 字段统计分析 ===\")\n",
|
|||
|
" for key, values in field_stats.items():\n",
|
|||
|
" if key == 'session':\n",
|
|||
|
" unique_sessions = list(set(values))\n",
|
|||
|
" print(f\"{key}: {len(unique_sessions)} 个不同session - {unique_sessions}\")\n",
|
|||
|
" elif key == 'corpus':\n",
|
|||
|
" unique_corpus = list(set(values))\n",
|
|||
|
" print(f\"{key}: {len(unique_corpus)} 个不同corpus - {unique_corpus}\")\n",
|
|||
|
" elif key in ['start_time', 'end_time']:\n",
|
|||
|
" times = [v for v in values if v is not None]\n",
|
|||
|
" if times:\n",
|
|||
|
" print(f\"{key}: 范围 [{min(times)}, {max(times)}], 平均 {np.mean(times):.1f}\")\n",
|
|||
|
" elif key == 'confidence':\n",
|
|||
|
" confs = [v for v in values if v is not None and v > 0]\n",
|
|||
|
" if confs:\n",
|
|||
|
" print(f\"{key}: 范围 [{min(confs):.3f}, {max(confs):.3f}], 平均 {np.mean(confs):.3f}\")\n",
|
|||
|
" elif key == 'trial_num':\n",
|
|||
|
" trials = list(set(values))\n",
|
|||
|
" print(f\"{key}: {len(trials)} 个不同trial - 范围 {min(trials)} 到 {max(trials)}\")\n",
|
|||
|
" \n",
|
|||
|
" # Session分布分析\n",
|
|||
|
" session_stats = defaultdict(lambda: {'segments': 0, 'trials': set(), 'phonemes': set(), 'corpus': set()})\n",
|
|||
|
" \n",
|
|||
|
" for phoneme, segments in phoneme_data.items():\n",
|
|||
|
" for segment in segments:\n",
|
|||
|
" session = segment.get('session', 'unknown')\n",
|
|||
|
" trial_num = segment.get('trial_num', -1)\n",
|
|||
|
" corpus = segment.get('corpus', 'unknown')\n",
|
|||
|
" \n",
|
|||
|
" session_stats[session]['segments'] += 1\n",
|
|||
|
" session_stats[session]['trials'].add(trial_num)\n",
|
|||
|
" session_stats[session]['phonemes'].add(phoneme)\n",
|
|||
|
" session_stats[session]['corpus'].add(corpus)\n",
|
|||
|
" \n",
|
|||
|
" # 转换为更易于访问的格式\n",
|
|||
|
" session_summary = {}\n",
|
|||
|
" for session, stats in session_stats.items():\n",
|
|||
|
" session_summary[session] = {\n",
|
|||
|
" 'segments': stats['segments'],\n",
|
|||
|
" 'trials_count': len(stats['trials']),\n",
|
|||
|
" 'phonemes_count': len(stats['phonemes']),\n",
|
|||
|
" 'corpus_list': list(stats['corpus'])\n",
|
|||
|
" }\n",
|
|||
|
" \n",
|
|||
|
" print(f\"\\n=== Session分布分析 ===\")\n",
|
|||
|
" print(f\"总session数: {len(session_summary)}\")\n",
|
|||
|
" for session in sorted(session_summary.keys()):\n",
|
|||
|
" stats = session_summary[session]\n",
|
|||
|
" print(f\"{session}:\")\n",
|
|||
|
" print(f\" Segments: {stats['segments']:4d}\")\n",
|
|||
|
" print(f\" Trials: {stats['trials_count']:3d}\")\n",
|
|||
|
" print(f\" Phonemes: {stats['phonemes_count']:2d}\")\n",
|
|||
|
" print(f\" Corpus: {stats['corpus_list']}\")\n",
|
|||
|
"\n",
|
|||
|
"else:\n",
|
|||
|
" print(\"未找到phoneme dataset文件\")\n",
|
|||
|
" phoneme_data = None\n",
|
|||
|
" phoneme_counts = None\n",
|
|||
|
" sorted_phonemes = None\n",
|
|||
|
" sample_segments = None\n",
|
|||
|
"\n",
|
|||
|
"# =============================================================================\n",
|
|||
|
"# 加载和分析 CTC Results\n",
|
|||
|
"# =============================================================================\n",
|
|||
|
"\n",
|
|||
|
"ctc_files = [f for f in pkl_files_info if f['name'].startswith(\"ctc_results_\")]\n",
|
|||
|
"if ctc_files:\n",
|
|||
|
" latest_ctc_file = ctc_files[0]['path']\n",
|
|||
|
" print(f\"\\n=== 加载最新的CTC Results ===\")\n",
|
|||
|
" print(f\"文件: {latest_ctc_file.name}\")\n",
|
|||
|
" \n",
|
|||
|
" # 直接加载数据\n",
|
|||
|
" with open(latest_ctc_file, 'rb') as f:\n",
|
|||
|
" ctc_data = pickle.load(f)\n",
|
|||
|
" \n",
|
|||
|
" ctc_data_type = type(ctc_data)\n",
|
|||
|
" ctc_length = len(ctc_data) if hasattr(ctc_data, '__len__') else 0\n",
|
|||
|
" \n",
|
|||
|
" print(f\"数据类型: {ctc_data_type}\")\n",
|
|||
|
" print(f\"条目数量: {ctc_length}\")\n",
|
|||
|
" \n",
|
|||
|
" if isinstance(ctc_data, list) and len(ctc_data) > 0:\n",
|
|||
|
" first_ctc = ctc_data[0]\n",
|
|||
|
" ctc_keys = list(first_ctc.keys()) if isinstance(first_ctc, dict) else None\n",
|
|||
|
" \n",
|
|||
|
" print(f\"\\n第一个CTC结果结构:\")\n",
|
|||
|
" if ctc_keys:\n",
|
|||
|
" for key, value in first_ctc.items():\n",
|
|||
|
" value_type = type(value)\n",
|
|||
|
" if isinstance(value, np.ndarray):\n",
|
|||
|
" print(f\" {key}: {value_type} shape={value.shape} dtype={value.dtype}\")\n",
|
|||
|
" elif isinstance(value, list):\n",
|
|||
|
" print(f\" {key}: {value_type} length={len(value)}\")\n",
|
|||
|
" if len(value) > 0:\n",
|
|||
|
" print(f\" 第一个元素: {value[0]} ({type(value[0])})\")\n",
|
|||
|
" else:\n",
|
|||
|
" print(f\" {key}: {value} ({value_type})\")\n",
|
|||
|
" \n",
|
|||
|
" # 分析前几个CTC结果的共同字段\n",
|
|||
|
" if len(ctc_data) > 1:\n",
|
|||
|
" common_keys = set(ctc_data[0].keys())\n",
|
|||
|
" for i in range(1, min(5, len(ctc_data))):\n",
|
|||
|
" common_keys &= set(ctc_data[i].keys())\n",
|
|||
|
" \n",
|
|||
|
" print(f\"\\n前5个CTC结果的共同字段: {list(common_keys)}\")\n",
|
|||
|
" \n",
|
|||
|
" # 统计数组形状分布\n",
|
|||
|
" shape_stats = defaultdict(list)\n",
|
|||
|
" for i in range(min(10, len(ctc_data))):\n",
|
|||
|
" for key, value in ctc_data[i].items():\n",
|
|||
|
" if isinstance(value, np.ndarray):\n",
|
|||
|
" shape_stats[key].append(value.shape)\n",
|
|||
|
" \n",
|
|||
|
" print(f\"\\n数组字段形状分布:\")\n",
|
|||
|
" for key, shapes in shape_stats.items():\n",
|
|||
|
" unique_shapes = list(set(shapes))\n",
|
|||
|
" print(f\" {key}: {len(unique_shapes)} 种不同形状\")\n",
|
|||
|
" for shape in unique_shapes[:5]: # 显示前5种形状\n",
|
|||
|
" count = shapes.count(shape)\n",
|
|||
|
" print(f\" {shape}: {count} 次\")\n",
|
|||
|
"\n",
|
|||
|
"else:\n",
|
|||
|
" print(\"未找到CTC results文件\")\n",
|
|||
|
" ctc_data = None\n",
|
|||
|
"\n",
|
|||
|
"print(f\"\\n=== 变量总结 ===\")\n",
|
|||
|
"print(\"以下变量已创建,可供后续使用:\")\n",
|
|||
|
"print(\"- pkl_files_info: 所有pkl文件信息列表\")\n",
|
|||
|
"print(\"- phoneme_data: 音素数据字典\")\n",
|
|||
|
"print(\"- phoneme_counts: 各音素的segment数量统计\")\n",
|
|||
|
"print(\"- sorted_phonemes: 按数量排序的音素列表\")\n",
|
|||
|
"print(\"- sample_segments: 示例segment列表\")\n",
|
|||
|
"print(\"- session_summary: session统计信息字典\")\n",
|
|||
|
"print(\"- ctc_data: CTC结果数据列表\")\n",
|
|||
|
"print(\"\\n可以直接使用这些变量进行进一步分析!\")"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"language_info": {
|
|||
|
"name": "python"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 5
|
|||
|
}
|