Files
b2txt25/data_analyse/safe_inspect.py
2025-10-12 09:11:32 +08:00

87 lines
3.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Safely inspect a phoneme dataset file.
Handles large files and possibly corrupted or truncated data.
"""
import pickle
import os
def _safe_len(obj):
    """Return ``len(obj)``, or 0 when *obj* is ``None`` or has no length."""
    if obj is None:
        return 0
    try:
        return len(obj)
    except TypeError:
        # Unsized objects (ints, 0-d arrays, ...) count as holding no segments.
        return 0


def _describe_field(key, value):
    """Format one ``key: value`` entry of a segment as a short summary line."""
    # Array-like values are summarised by shape, other sized values by length,
    # so large payloads are never dumped to the console.
    if hasattr(value, 'shape'):
        return f" {key}: shape {value.shape}"
    if hasattr(value, '__len__'):
        return f" {key}: length {len(value)}"
    return f" {key}: {value}"


def _print_segment_example(phoneme, segment):
    """Print a summary of one example segment belonging to *phoneme*."""
    print(f"\n=== 数据片段示例 (音素: {phoneme}) ===")
    items = getattr(segment, 'items', None)
    if callable(items):
        for key, value in items():
            print(_describe_field(key, value))
        return
    # Segment is not mapping-like: report its type instead of failing on
    # a missing ``.items()``.
    print(f" 片段不是字典格式: {type(segment)}")
    if hasattr(segment, 'shape'):
        print(f" shape {segment.shape}")
    elif hasattr(segment, '__len__'):
        print(f" length {len(segment)}")


def _print_dataset_stats(data, max_listed=20):
    """Print per-phoneme segment counts, the total, and one example segment.

    Args:
        data: Dict mapping each phoneme to its container of segments.
        max_listed: Number of phonemes listed individually before the
            remainder is collapsed into a single summary line.
    """
    print("\n=== 数据集统计 ===")
    print(f"音素类型数量: {len(data)}")

    total_segments = 0
    for index, (phoneme, segments) in enumerate(data.items()):
        # Use len() rather than truthiness: truth-testing a multi-element
        # array raises ValueError.
        segment_count = _safe_len(segments)
        total_segments += segment_count
        if index < max_listed:
            print(f" {phoneme}: {segment_count} 个片段")
    if len(data) > max_listed:
        print(f" ... 还有 {len(data) - max_listed} 个其他音素")
    print(f"\n总片段数: {total_segments}")

    if not data:
        return
    first_phoneme = next(iter(data))
    first_segments = data[first_phoneme]
    if _safe_len(first_segments) == 0:
        return
    try:
        first_segment = first_segments[0]
    except (IndexError, KeyError, TypeError):
        # Sized but not positionally indexable (e.g. a dict or set).
        print(f"\n无法按索引读取音素 {first_phoneme} 的第一个片段: {type(first_segments)}")
        return
    _print_segment_example(first_phoneme, first_segment)


def safe_inspect_phoneme_dataset(file_path):
    """Load a pickled phoneme dataset and print a summary without raising.

    Prints the file size, the type of the unpickled object and, when that
    object is a dict, per-phoneme segment counts (first 20 listed), the total
    segment count and a summary of the first segment of the first phoneme.
    Read and parse failures are reported on stdout instead of propagating.

    Note:
        ``pickle.load`` materialises the entire object in memory, so the
        machine needs enough RAM for the whole dataset.

    Args:
        file_path: Path to the ``.pkl`` file to inspect.

    Returns:
        None. All results are written to stdout.
    """
    print(f"检查文件: {file_path}")
    if not os.path.exists(file_path):
        print("文件不存在!")
        return

    # Basic file information.
    file_size = os.path.getsize(file_path)
    print(f"文件大小: {file_size / (1024*1024*1024):.2f} GB")

    try:
        with open(file_path, 'rb') as f:
            print("开始读取pickle文件...")
            # SECURITY: unpickling can execute arbitrary code; only use this
            # on dataset files from a trusted source.
            data = pickle.load(f)
        # The file handle is no longer needed once the object is loaded.
        print("文件读取成功!")
        print(f"数据类型: {type(data)}")
        if isinstance(data, dict):
            _print_dataset_stats(data)
        else:
            print(f"数据不是字典格式: {type(data)}")
            if hasattr(data, '__len__'):
                print(f"数据长度: {len(data)}")
    except EOFError as e:
        # Typical for a truncated or empty pickle file.
        print(f"文件可能损坏或不完整: {e}")
    except pickle.UnpicklingError as e:
        print(f"Pickle解析错误: {e}")
    except Exception as e:
        # Top-level boundary of an inspection tool: report and keep going.
        print(f"读取文件时发生错误: {e}")
        print(f"错误类型: {type(e)}")
if __name__ == "__main__":
    # Local import: only this script entry point needs sys.
    import sys

    # An optional first command-line argument overrides the default dataset
    # path; the default is relative to the current working directory.
    file_path = (
        sys.argv[1]
        if len(sys.argv) > 1
        else "../phoneme_segmented_data/phoneme_dataset_20251007_194413.pkl"
    )
    safe_inspect_phoneme_dataset(file_path)