Files
b2txt25/data_analyse/safe_inspect.py

87 lines
3.2 KiB
Python
Raw Normal View History

2025-10-12 09:11:32 +08:00
#!/usr/bin/env python3
"""
安全检查音素数据集文件
处理大文件和可能的损坏问题
"""
import pickle
import os
def _safe_len(obj):
    """Return ``len(obj)``, or 0 when *obj* is ``None`` or has no length.

    Truthiness (``if obj``) is deliberately not used: array-like containers
    such as NumPy arrays raise ``ValueError`` in a boolean context, which
    would abort the whole inspection.
    """
    if obj is None:
        return 0
    try:
        return len(obj)
    except TypeError:
        return 0


def _describe_value(value):
    """Return a short description of *value*: its shape, its length, or itself."""
    if hasattr(value, 'shape'):
        return f"shape {value.shape}"
    if hasattr(value, '__len__'):
        return f"length {len(value)}"
    return f"{value}"


def _print_dataset_summary(data, max_listed=20):
    """Print per-phoneme segment counts (first *max_listed* only) and the total."""
    print("\n=== 数据集统计 ===")
    print(f"音素类型数量: {len(data)}")

    total_segments = 0
    for index, (phoneme, segments) in enumerate(data.items()):
        segment_count = _safe_len(segments)
        total_segments += segment_count
        # Only list the first entries so the output stays readable.
        if index < max_listed:
            print(f" {phoneme}: {segment_count} 个片段")

    if len(data) > max_listed:
        print(f" ... 还有 {len(data) - max_listed} 个其他音素")
    print(f"\n总片段数: {total_segments}")


def _print_first_segment_example(data):
    """Print the fields of the first segment of the first phoneme, if any."""
    if not data:
        return

    first_phoneme = next(iter(data))
    segments = data[first_phoneme]
    if _safe_len(segments) == 0:
        return

    try:
        first_segment = segments[0]
    except (IndexError, KeyError, TypeError):
        # The container is not positionally indexable (e.g. a dict or a set),
        # so there is no well-defined "first" segment to show.
        return

    print(f"\n=== 数据片段示例 (音素: {first_phoneme}) ===")
    if callable(getattr(first_segment, 'items', None)):
        for key, value in first_segment.items():
            print(f" {key}: {_describe_value(value)}")
    else:
        # Segment is not a mapping: describe the object itself instead of
        # failing on the missing ``items`` method.
        print(f" {type(first_segment).__name__}: {_describe_value(first_segment)}")


def safe_inspect_phoneme_dataset(file_path):
    """Print a summary of a pickled phoneme dataset without raising on bad files.

    The function reports the file size, unpickles the file, and then prints:
    the type of the loaded object; for a ``dict``, the number of keys, the
    segment count of the first 20 keys, the total segment count and the
    fields of the first segment of the first key; for any other object, its
    type and (when available) its length.

    Missing files, truncated files (``EOFError``), malformed pickles
    (``pickle.UnpicklingError``) and any other error raised while reading or
    analysing the data are reported on standard output instead of being
    propagated.

    Note:
        ``pickle.load`` materialises the whole object in memory, so the file
        size printed beforehand is the only hint about memory needs.

    Warning:
        Unpickling can execute arbitrary code stored in the file. Only
        inspect files that come from a trusted source.

    Args:
        file_path: Path of the pickle file to inspect.

    Returns:
        None. All results and error messages are written to standard output.
    """
    print(f"检查文件: {file_path}")

    if not os.path.exists(file_path):
        print("文件不存在!")
        return

    # Basic file information.
    file_size = os.path.getsize(file_path)
    print(f"文件大小: {file_size / (1024*1024*1024):.2f} GB")

    try:
        with open(file_path, 'rb') as f:
            print("开始读取pickle文件...")
            try:
                # SECURITY: pickle is unsafe on untrusted input (see docstring).
                data = pickle.load(f)
            except EOFError as e:
                print(f"文件可能损坏或不完整: {e}")
                return
            except pickle.UnpicklingError as e:
                print(f"Pickle解析错误: {e}")
                return

        # The file handle is already closed here; only the in-memory object
        # is needed for the analysis.
        print("文件读取成功!")
        print(f"数据类型: {type(data)}")

        if isinstance(data, dict):
            _print_dataset_summary(data)
            _print_first_segment_example(data)
        else:
            print(f"数据不是字典格式: {type(data)}")
            if hasattr(data, '__len__'):
                print(f"数据长度: {len(data)}")
    except Exception as e:
        # Top-level boundary of a diagnostic tool: report instead of crashing.
        print(f"读取文件时发生错误: {e}")
        print(f"错误类型: {type(e)}")
if __name__ == "__main__":
    import sys

    # Script entry point. The dataset path may be given as the first
    # command-line argument; without one, the original default is used
    # (it is resolved relative to the current working directory).
    file_path = (
        sys.argv[1]
        if len(sys.argv) > 1
        else "../phoneme_segmented_data/phoneme_dataset_20251007_194413.pkl"
    )
    safe_inspect_phoneme_dataset(file_path)