76 lines
2.4 KiB
Python
76 lines
2.4 KiB
Python
#!/usr/bin/env python3
|
|
|
|
import pickle
|
|
from pathlib import Path
|
|
|
|
def _load_trials(dataset_file):
    """Return all trials from a file of back-to-back pickled batches.

    Each ``pickle.load`` call is expected to yield one iterable batch of
    trials; batches are concatenated in file order until end-of-file.
    """
    trials = []
    # NOTE(security): unpickling can execute arbitrary code. Only run this
    # on dataset files that come from a trusted source.
    with open(dataset_file, 'rb') as f:
        while True:
            try:
                batch = pickle.load(f)
            except EOFError:
                # pickle.load raises EOFError once no further object remains.
                break
            trials.extend(batch)
    return trials


def _scan_temporal_order(trials, max_samples=5):
    """Count phoneme segments and those whose end time precedes the start.

    Trials without an ``'alignment_info'`` entry are skipped. Each entry of
    ``alignment_info`` must unpack into
    ``(phoneme, start_time, end_time, confidence)``.

    Returns:
        ``(total_segments, temporal_errors, samples)`` where ``samples``
        holds descriptions of at most ``max_samples`` offending segments.
    """
    total_segments = 0
    temporal_errors = 0
    samples = []

    for index, trial in enumerate(trials):
        if 'alignment_info' not in trial:
            continue

        for phoneme, start_time, end_time, _confidence in trial['alignment_info']:
            total_segments += 1

            if end_time < start_time:
                temporal_errors += 1
                # Keep only the first few offenders as examples.
                if len(samples) < max_samples:
                    samples.append(f"Trial {index}: '{phoneme}' {start_time}->{end_time}")

    return total_segments, temporal_errors, samples


def check_dataset(dataset_path="phoneme_segmented_data/ctc_results_20251009_000024.pkl"):
    """Check that every phoneme segment in a pickled dataset ends at or after its start.

    Uses only the standard library. Progress and results are printed to
    stdout.

    Args:
        dataset_path: Path (``str`` or ``Path``) of the pickle file to check.
            Defaults to the CTC results file this script was written for.

    Returns:
        ``True`` when the file was read and no segment has
        ``end_time < start_time``; ``False`` when the file is missing,
        cannot be loaded or analyzed, or at least one such segment exists.
    """
    dataset_file = Path(dataset_path)

    if not dataset_file.exists():
        print(f"❌ File not found: {dataset_file}")
        return False

    print(f"📁 Dataset: {dataset_file}")
    print(f"📏 Size: {dataset_file.stat().st_size / (1024*1024):.1f} MB")

    # Unpickling can fail in many ways (corrupt data, missing classes, I/O),
    # so this boundary deliberately catches Exception and reports it.
    try:
        all_trials = _load_trials(dataset_file)
    except Exception as e:
        print(f"❌ Error loading dataset: {e}")
        return False

    print(f"✅ Loaded {len(all_trials)} total trials")

    # Report malformed trial data separately so it is not mistaken for a
    # load failure.
    try:
        total_segments, temporal_errors, sample_outputs = _scan_temporal_order(all_trials)
    except Exception as e:
        print(f"❌ Error analyzing dataset: {e}")
        return False

    print("📊 Analysis Results:")
    print(f" Total phoneme segments: {total_segments}")
    print(f" Temporal ordering errors: {temporal_errors}")

    if temporal_errors > 0:
        # total_segments >= temporal_errors > 0 here, so the division is safe.
        error_rate = (temporal_errors / total_segments) * 100
        print(f" Error rate: {error_rate:.3f}%")
        print(" Sample errors:")
        for error in sample_outputs:
            print(f" {error}")

    return temporal_errors == 0
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the check, print a summary, and report the
    # outcome through the process exit status.
    success = check_dataset()

    if success:
        print("\n🎉 VERIFICATION SUCCESS!")
        print("All phoneme segments have proper temporal ordering (end_time >= start_time)")
    else:
        print("\n❌ VERIFICATION FAILED!")
        # check_dataset() also returns False when the file is missing or
        # cannot be read, so do not claim that ordering errors were found.
        print("Dataset could not be verified or temporal ordering issues were detected (see messages above)")

    # A non-zero exit status lets shells and CI pipelines detect the failure.
    raise SystemExit(0 if success else 1)