Files
b2txt25/data_analyse/check_converted_data_structure.py
2025-10-12 09:11:32 +08:00

56 lines
2.0 KiB
Python

#!/usr/bin/env python3
"""
检查转换后的音素数据集的结构
"""
import pickle
import sys
def _print_segment(segment):
    """Print the type, keys and per-key contents of one segment mapping."""
    print(f" 第一个segment类型: {type(segment)}")
    print(f" 第一个segment键: {list(segment.keys())}")

    print(" 第一个segment内容:")
    for key, value in segment.items():
        if key != 'original_timestamps':
            print(f" {key}: {value}")
            continue
        # 'original_timestamps' is reported by type first, then expanded
        # entry by entry when it is a dict.
        print(f" {key}: {type(value)}")
        if isinstance(value, dict):
            for ts_key, ts_value in value.items():
                print(f" {ts_key}: {ts_value}")


def _print_phoneme_data(phoneme_data):
    """Print a summary of the 'phoneme_data' mapping and its first entry."""
    print(f"\nphoneme_data类型: {type(phoneme_data)}")
    print(f"音素数量: {len(phoneme_data)}")

    # Materialise the keys once; they are needed for both the preview and
    # for picking the first phoneme.
    phoneme_names = list(phoneme_data.keys())
    print(f"音素列表: {phoneme_names[:10]}...")

    # An empty mapping has no "first phoneme"; stop here instead of raising
    # IndexError so the caller can still report the remaining sections.
    if not phoneme_names:
        return

    first_phoneme = phoneme_names[0]
    segments = phoneme_data[first_phoneme]
    print(f"\n第一个音素 '{first_phoneme}':")
    print(f" segments类型: {type(segments)}")
    print(f" segments数量: {len(segments)}")

    # len() rather than truthiness: it is the only operation the report
    # already relies on for the segments container.
    if len(segments) > 0:
        _print_segment(segments[0])


def check_data_structure(pkl_path):
    """Load a pickle file and print a report of its structure to stdout.

    The report lists the top-level type and keys. If a 'phoneme_data' key is
    present, it summarises that mapping and the first segment of its first
    phoneme. If a 'conversion_info' key is present, every key/value pair in
    it is printed.

    Args:
        pkl_path: Path of the pickle file to inspect.

    Returns:
        None. All output goes to stdout.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the file is not a valid pickle.
    """
    # NOTE(security): pickle.load can execute arbitrary code contained in the
    # file. Only run this on datasets from a trusted source.
    with open(pkl_path, 'rb') as f:
        data = pickle.load(f)

    print("=== 数据结构分析 ===")
    print(f"数据类型: {type(data)}")
    print(f"顶层键: {list(data.keys())}")

    # Inspect phoneme_data
    if 'phoneme_data' in data:
        _print_phoneme_data(data['phoneme_data'])

    # Inspect conversion_info
    if 'conversion_info' in data:
        print("\nconversion_info:")
        for key, value in data['conversion_info'].items():
            print(f" {key}: {value}")
if __name__ == "__main__":
    # Dataset inspected when no path is given on the command line. The path
    # is relative, so it resolves against the current working directory.
    default_pkl_path = "../phoneme_segmented_data/phoneme_dataset_20251009_202457_with_original_timestamps.pkl"
    # An optional first command-line argument overrides the default, so other
    # converted datasets can be inspected without editing this file.
    pkl_path = sys.argv[1] if len(sys.argv) > 1 else default_pkl_path
    check_data_structure(pkl_path)