Files
b2txt25/data_analyse/pure_python_check.py

76 lines
2.4 KiB
Python
Raw Normal View History

2025-10-12 09:11:32 +08:00
#!/usr/bin/env python3
import pickle
from pathlib import Path
def check_dataset(dataset_path="phoneme_segmented_data/ctc_results_20251009_000024.pkl"):
    """Pure Python check of the dataset without dependencies.

    Reads a pickle file written as successive batches of trial dicts and
    verifies that every phoneme segment in each trial's ``alignment_info``
    is temporally ordered (end_time >= start_time).

    Args:
        dataset_path: Location of the pickled CTC results file. Defaults to
            the original hard-coded path so existing callers are unchanged.

    Returns:
        True when the file exists, loads cleanly, and no segment has
        end_time < start_time; False otherwise.
    """
    dataset_file = Path(dataset_path)
    if not dataset_file.exists():
        print(f"❌ File not found: {dataset_file}")
        return False
    print(f"📁 Dataset: {dataset_file}")
    print(f"📏 Size: {dataset_file.stat().st_size / (1024*1024):.1f} MB")

    # The file is saved as multiple consecutive pickles (one per batch),
    # so keep calling pickle.load() until EOFError instead of loading once.
    # The try only covers loading: an analysis bug below can no longer be
    # mislabeled as a load failure.
    all_trials = []
    try:
        with open(dataset_file, 'rb') as f:
            while True:
                try:
                    batch = pickle.load(f)
                    all_trials.extend(batch)
                except EOFError:
                    break
    except Exception as e:
        print(f"❌ Error loading dataset: {e}")
        return False
    print(f"✅ Loaded {len(all_trials)} total trials")

    # Analyze alignment_info for temporal ordering.
    total_segments = 0
    temporal_errors = 0
    sample_outputs = []
    for i, trial in enumerate(all_trials):
        if 'alignment_info' not in trial:
            continue
        # Each entry is (phoneme, start_time, end_time, confidence);
        # confidence is not needed for the ordering check.
        for phoneme, start_time, end_time, _confidence in trial['alignment_info']:
            total_segments += 1
            # Check temporal ordering
            if end_time < start_time:
                temporal_errors += 1
                if len(sample_outputs) < 5:  # Collect first 5 errors as examples
                    sample_outputs.append(f"Trial {i}: '{phoneme}' {start_time}->{end_time}")

    print("📊 Analysis Results:")
    print(f" Total phoneme segments: {total_segments}")
    print(f" Temporal ordering errors: {temporal_errors}")
    if temporal_errors > 0:
        error_rate = (temporal_errors / total_segments) * 100
        print(f" Error rate: {error_rate:.3f}%")
        print(" Sample errors:")
        for error in sample_outputs:
            print(f" {error}")
    return temporal_errors == 0
if __name__ == "__main__":
    # Banner depends solely on the boolean verdict from check_dataset().
    if check_dataset():
        print("\n🎉 VERIFICATION SUCCESS!")
        print("All phoneme segments have proper temporal ordering (end_time >= start_time)")
    else:
        print("\n❌ VERIFICATION FAILED!")
        print("Some temporal ordering issues detected")