87 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			87 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #!/usr/bin/env python3
 | ||
| """
 | ||
Safely inspect a phoneme dataset file.
Handles large files and possible corruption.
 | ||
| """
 | ||
| 
 | ||
| import pickle
 | ||
| import os
 | ||
| 
 | ||
def safe_inspect_phoneme_dataset(file_path, max_display=20):
    """Print a best-effort summary of a pickled phoneme dataset.

    The file is unpickled in full (``pickle.load`` materialises the whole
    object in memory), then described on stdout. When the loaded object is
    a dict, each key is treated as a phoneme and each value as a container
    of segments: per-phoneme counts, the total count and a sample of the
    first segment are printed. Any other object only has its type (and
    length, when it has one) printed.

    Args:
        file_path: Path of the pickle file to inspect.
        max_display: Maximum number of per-phoneme count lines to print
            before the remainder is collapsed into a single "... more"
            line. Defaults to 20.

    Returns:
        None. All results are written to stdout.

    No exception derived from ``Exception`` escapes the load/report phase:
    ``EOFError`` and ``pickle.UnpicklingError`` get dedicated messages and
    everything else is reported generically.
    """
    print(f"检查文件: {file_path}")

    if not os.path.exists(file_path):
        print("文件不存在!")
        return

    # Basic file information.
    file_size = os.path.getsize(file_path)
    print(f"文件大小: {file_size / (1024*1024*1024):.2f} GB")

    try:
        with open(file_path, 'rb') as f:
            print("开始读取pickle文件...")
            # SECURITY: pickle.load can execute arbitrary code embedded in
            # the file. Only inspect files that come from a trusted source.
            data = pickle.load(f)
        # The handle is released as soon as the object is loaded; the
        # report below works purely on the in-memory object.
        print("文件读取成功!")
        print(f"数据类型: {type(data)}")

        if isinstance(data, dict):
            _report_phoneme_dict(data, max_display)
        else:
            print(f"数据不是字典格式: {type(data)}")
            if hasattr(data, '__len__'):
                print(f"数据长度: {len(data)}")
    except EOFError as e:
        # Typically a truncated or empty file.
        print(f"文件可能损坏或不完整: {e}")
    except pickle.UnpicklingError as e:
        print(f"Pickle解析错误: {e}")
    except Exception as e:
        # Deliberate catch-all: this is a diagnostic tool and should report
        # the problem instead of crashing with a traceback.
        print(f"读取文件时发生错误: {e}")
        print(f"错误类型: {type(e)}")


def _count_segments(segments):
    """Return the number of segments in a container, or 0 if it has none."""
    if segments is None:
        return 0
    try:
        # len() instead of truthiness: array-like containers may refuse to
        # be evaluated as a boolean.
        return len(segments)
    except TypeError:
        # Unsized value (e.g. a scalar): there is no segment list to count.
        return 0


def _describe_value(value):
    """Return a short description of a value: its shape, length or repr."""
    if hasattr(value, 'shape'):
        return f"shape {value.shape}"
    if hasattr(value, '__len__'):
        return f"length {len(value)}"
    return f"{value}"


def _report_phoneme_dict(data, max_display):
    """Print per-phoneme segment counts, the total and a sample segment."""
    print("\n=== 数据集统计 ===")
    print(f"音素类型数量: {len(data)}")

    total_segments = 0
    for index, (phoneme, segments) in enumerate(data.items()):
        segment_count = _count_segments(segments)
        total_segments += segment_count

        # Only list the first max_display phonemes to keep the output short.
        if index < max_display:
            print(f"  {phoneme}: {segment_count} 个片段")

    hidden = len(data) - max_display
    if hidden > 0:
        print(f"  ... 还有 {hidden} 个其他音素")

    print(f"\n总片段数: {total_segments}")

    _print_first_segment(data)


def _print_first_segment(data):
    """Print a field-by-field sample of the first phoneme's first segment."""
    if not data:
        return

    first_phoneme = next(iter(data))
    segments = data[first_phoneme]
    if _count_segments(segments) == 0:
        return

    first_segment = segments[0]
    print(f"\n=== 数据片段示例 (音素: {first_phoneme}) ===")
    if hasattr(first_segment, 'items'):
        for key, value in first_segment.items():
            print(f"  {key}: {_describe_value(value)}")
    else:
        # Segment is not mapping-like: describe it as a whole instead of
        # failing on the missing .items().
        print(f"  {type(first_segment).__name__}: {_describe_value(first_segment)}")
 | ||
| 
 | ||
if __name__ == "__main__":
    import sys

    # Dataset inspected when no path is given on the command line; it is
    # resolved relative to the current working directory.
    default_path = "../phoneme_segmented_data/phoneme_dataset_20251007_194413.pkl"

    # An optional first command-line argument overrides the default path.
    file_path = sys.argv[1] if len(sys.argv) > 1 else default_path
    safe_inspect_phoneme_dataset(file_path)
