#!/usr/bin/env python3
"""
Display the contents and structure of the phoneme dataset pkl files
"""

import pickle
import numpy as np
from pathlib import Path
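

# Assumed pkl layout, inferred from the access patterns in this script
# (an assumption about the producer, not a guarantee): a phoneme dataset
# maps each phoneme label to a list of segment dicts whose inspected keys
# include 'start_time', 'end_time', 'confidence', 'session', and 'trial_num'.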


def show_phoneme_dataset_structure():
    """Show the structure of phoneme dataset pkl files"""

    data_dir = Path("phoneme_segmented_data")
    if not data_dir.exists():
        print("No phoneme_segmented_data directory found")
        return

    # Find all dataset files
    dataset_files = list(data_dir.glob("phoneme_dataset_*.pkl"))
    ctc_files = list(data_dir.glob("ctc_results_*.pkl"))

    print("=== PKL Files in phoneme_segmented_data/ ===")
    print(f"Found {len(dataset_files)} phoneme dataset files:")
    for f in sorted(dataset_files):
        size_mb = f.stat().st_size / (1024 * 1024)
        print(f" {f.name} ({size_mb:.1f} MB)")

    print(f"\nFound {len(ctc_files)} CTC results files:")
    for f in sorted(ctc_files):
        size_mb = f.stat().st_size / (1024 * 1024)
        print(f" {f.name} ({size_mb:.1f} MB)")

    if not dataset_files:
        print("No dataset files to examine")
        return

    # Load the latest dataset file
    latest_dataset = max(dataset_files, key=lambda x: x.stat().st_mtime)
    print(f"\n=== Examining Latest Dataset: {latest_dataset.name} ===")

    with open(latest_dataset, 'rb') as f:
        dataset = pickle.load(f)

    print(f"Dataset type: {type(dataset)}")
    print(f"Dataset size: {len(dataset)} phoneme types")

    # Show phoneme types and their counts
    print("\n=== Phoneme Types and Counts ===")
    total_segments = 0
    for phoneme, segments in sorted(dataset.items()):
        count = len(segments)
        total_segments += count
        print(f"'{phoneme}': {count:5d} segments")

    print(f"\nTotal segments across all phonemes: {total_segments:,}")

    # Show the structure of one sample segment, taken from the first
    # common phoneme present in the dataset
    print("\n=== Sample Segment Structure ===")
    sample_phonemes = [' | ', 'AA', 'IH', 'T', 'S']  # Common phonemes

    for phoneme in sample_phonemes:
        if phoneme in dataset and dataset[phoneme]:
            segment = dataset[phoneme][0]  # First segment
            print(f"\nSample segment for '{phoneme}':")
            print(f" Type: {type(segment)}")
            if isinstance(segment, dict):
                for key, value in segment.items():
                    if key == 'confidence':
                        print(f" {key}: {value:.6f}")
                    elif isinstance(value, np.integer):
                        print(f" {key}: {int(value)}")
                    else:
                        print(f" {key}: {value}")
            break  # One sample is enough to show the structure

    # Check for time alignment issues
    print("\n=== Time Alignment Check ===")
    error_count = 0
    total_checked = 0

    for phoneme, segments in dataset.items():
        for segment in segments[:10]:  # Check first 10 of each phoneme
            if isinstance(segment, dict) and 'start_time' in segment and 'end_time' in segment:
                start_time = int(segment['start_time'])
                end_time = int(segment['end_time'])
                total_checked += 1

                if end_time < start_time:
                    error_count += 1
                    if error_count <= 5:  # Show first 5 errors
                        print(f" ❌ Error: '{phoneme}' segment has start={start_time}, end={end_time}")

    if total_checked > 0:
        error_rate = (error_count / total_checked) * 100
        print(f"\nChecked {total_checked} segments:")
        print(f" ✅ Valid segments: {total_checked - error_count}")
        print(f" ❌ Invalid segments: {error_count}")
        print(f" Error rate: {error_rate:.1f}%")

    # Show session and trial distribution
    print("\n=== Session and Trial Distribution ===")
    sessions = set()
    trials = set()

    for phoneme, segments in dataset.items():
        for segment in segments[:100]:  # Sample first 100 of each phoneme
            if isinstance(segment, dict):
                if 'session' in segment:
                    sessions.add(segment['session'])
                if 'trial_num' in segment:
                    trials.add(f"{segment.get('session', 'unknown')}_trial_{segment['trial_num']}")

    print(f"Sessions represented: {len(sessions)}")
    if len(sessions) <= 10:
        for session in sorted(sessions):
            print(f" {session}")
    else:
        # Sort first, then slice, so the five shown are the first five in
        # sorted order rather than five arbitrary elements of the set
        for session in sorted(sessions)[:5]:
            print(f" {session}")
        print(f" ... and {len(sessions) - 5} more")

    print(f"Unique trials sampled: {len(trials)}")
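

# CTC results pkl layout assumed from the usage below: a sequence of
# per-utterance dicts carrying at least 'ctc_score', 'predicted_phonemes',
# and 'alignment_info'; only those keys receive special formatting.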


def show_ctc_results_structure():
    """Show the structure of CTC results pkl files"""

    data_dir = Path("phoneme_segmented_data")
    ctc_files = list(data_dir.glob("ctc_results_*.pkl"))

    if not ctc_files:
        print("\n=== No CTC Results Files Found ===")
        return

    # Load the latest CTC results file
    latest_ctc = max(ctc_files, key=lambda x: x.stat().st_mtime)
    print(f"\n=== Examining Latest CTC Results: {latest_ctc.name} ===")

    with open(latest_ctc, 'rb') as f:
        ctc_data = pickle.load(f)

    print(f"CTC data type: {type(ctc_data)}")
    print(f"CTC data length: {len(ctc_data)}")

    # Show sample CTC result
    if ctc_data:
        sample = ctc_data[0]
        print("\nSample CTC result:")
        print(f" Type: {type(sample)}")
        if isinstance(sample, dict):
            for key, value in sample.items():
                if key == 'ctc_score':
                    print(f" {key}: {value:.3f}")
                elif key == 'predicted_phonemes' and isinstance(value, list):
                    phonemes_str = ' '.join(value[:10])  # First 10 phonemes
                    if len(value) > 10:
                        phonemes_str += f" ... ({len(value)} total)"
                    print(f" {key}: [{phonemes_str}]")
                elif key == 'alignment_info' and isinstance(value, list):
                    print(f" {key}: {len(value)} alignment segments")
                else:
                    print(f" {key}: {str(value)[:100]}{'...' if len(str(value)) > 100 else ''}")


if __name__ == "__main__":
    show_phoneme_dataset_structure()
    show_ctc_results_structure()
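
# Usage sketch: run from the directory that contains phoneme_segmented_data/,
# e.g. `python show_phoneme_data.py` (the file name is illustrative; use
# whatever name this script is saved under).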