#!/usr/bin/env python3
"""Safely inspect a phoneme dataset pickle file.

Prints the file size, per-phoneme segment counts and one sample segment.
Missing, truncated or otherwise unreadable files are reported as messages
on stdout instead of raising.
"""

import pickle
import os

# Per-phoneme counts are printed only for this many entries so that the
# output stays readable for datasets with many phoneme types.
_MAX_PHONEMES_SHOWN = 20


def _segment_count(segments):
    """Return the number of segments in a dataset entry, or 0 if unsized.

    ``None`` and objects without ``__len__`` count as 0. ``len`` is used
    rather than truthiness so that array-like containers (whose truth value
    may be ambiguous) do not raise.
    """
    if segments is None:
        return 0
    try:
        return len(segments)
    except TypeError:
        return 0


def _describe_value(value):
    """Return a short description of a value: shape, length, or the value."""
    if hasattr(value, 'shape'):
        return f"shape {value.shape}"
    if hasattr(value, '__len__'):
        return f"length {len(value)}"
    return f"{value}"


def _print_dataset_stats(data):
    """Print the phoneme count, per-phoneme segment counts and the total."""
    print("\n=== 数据集统计 ===")
    print(f"音素类型数量: {len(data)}")

    # The whole dataset is already in memory at this point (pickle.load
    # materialises everything); this loop only counts, it copies nothing.
    total_segments = 0
    for index, (phoneme, segments) in enumerate(data.items(), start=1):
        segment_count = _segment_count(segments)
        total_segments += segment_count
        if index <= _MAX_PHONEMES_SHOWN:
            print(f" {phoneme}: {segment_count} 个片段")

    if len(data) > _MAX_PHONEMES_SHOWN:
        print(f" ... 还有 {len(data) - _MAX_PHONEMES_SHOWN} 个其他音素")

    print(f"\n总片段数: {total_segments}")


def _print_segment_example(data):
    """Print the first segment of the first phoneme, if there is one."""
    if not data:
        return

    first_phoneme = next(iter(data))
    segments = data[first_phoneme]
    if _segment_count(segments) == 0:
        return

    first_segment = segments[0]
    print(f"\n=== 数据片段示例 (音素: {first_phoneme}) ===")
    if isinstance(first_segment, dict):
        for key, value in first_segment.items():
            print(f" {key}: {_describe_value(value)}")
    else:
        # A segment that is not a dict has no fields to list; describe the
        # object itself instead of failing on a missing .items().
        print(f" {type(first_segment)}: {_describe_value(first_segment)}")


def safe_inspect_phoneme_dataset(file_path):
    """Print a summary of the pickled phoneme dataset at ``file_path``.

    The file is expected to hold a dict mapping each phoneme to a sequence
    of segments; any other top-level object is reported with its type and,
    if available, its length.

    Args:
        file_path: Path of the pickle file to inspect.

    Returns:
        None. All results and all error reports are written to stdout;
        ordinary exceptions (anything derived from ``Exception``) are
        caught and printed rather than propagated.
    """
    print(f"检查文件: {file_path}")

    if not os.path.exists(file_path):
        print("文件不存在!")
        return

    # Basic file information.
    file_size = os.path.getsize(file_path)
    print(f"文件大小: {file_size / (1024*1024*1024):.2f} GB")

    try:
        with open(file_path, 'rb') as f:
            print("开始读取pickle文件...")
            # NOTE(security): unpickling can execute arbitrary code. Only
            # run this on dataset files from a trusted source.
            data = pickle.load(f)
            print("文件读取成功!")
            print(f"数据类型: {type(data)}")

            if isinstance(data, dict):
                _print_dataset_stats(data)
                _print_segment_example(data)
            else:
                print(f"数据不是字典格式: {type(data)}")
                if hasattr(data, '__len__'):
                    print(f"数据长度: {len(data)}")
    except EOFError as e:
        # Raised by pickle when the stream ends early (truncated file).
        print(f"文件可能损坏或不完整: {e}")
    except pickle.UnpicklingError as e:
        print(f"Pickle解析错误: {e}")
    except Exception as e:
        # Best-effort tool: report anything else rather than crash.
        print(f"读取文件时发生错误: {e}")
        print(f"错误类型: {type(e)}")


if __name__ == "__main__":
    file_path = "../phoneme_segmented_data/phoneme_dataset_20251007_194413.pkl"
    safe_inspect_phoneme_dataset(file_path)