#!/usr/bin/env python3
"""
Create a phoneme classification dataset from segmented data
(simplified version: no ground-truth validation is performed).
"""

import pickle
import numpy as np
import torch
from pathlib import Path
from collections import defaultdict
import os
import sys

# Add parent directory to path for imports
sys.path.append(str(Path(__file__).parent.parent))


def load_neural_data_for_trial(session, trial_metadata):
    """Load neural features for a specific trial"""
    try:
        # Simplified approach - directly use h5py without the helper function
        import h5py
        import pandas as pd

        # Try to load the session data
        data_dir = Path(__file__).parent.parent / "data" / "hdf5_data_final"
        train_file = data_dir / session / "data_train.hdf5"

        if not train_file.exists():
            return None

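        # NOTE (assumed layout): the code below expects a single
        # 'neural_features' dataset indexed by trial, with each entry being a
        # (time, features) array; adjust this block if the HDF5 files are
        # organized differently.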
        # Load HDF5 file directly
        with h5py.File(train_file, 'r') as f:
            if 'neural_features' in f:
                neural_features = f['neural_features'][:]

                # Find the matching trial by trial_num
                trial_num = trial_metadata.get('trial_num', -1)
                if 0 <= trial_num < len(neural_features):
                    return neural_features[trial_num]

    except Exception as e:
        print(f"Warning: Could not load neural data for {session}, trial {trial_metadata.get('trial_num', 'unknown')}: {e}")

    return None


def create_phoneme_classification_dataset():
    """Create a phoneme classification dataset from segmented data without validation"""

    # Load the latest phoneme dataset
    data_dir = Path("phoneme_segmented_data")
    dataset_files = list(data_dir.glob("phoneme_dataset_*.pkl"))

    if not dataset_files:
        print("No phoneme dataset files found!")
        return

    latest_dataset = max(dataset_files, key=lambda x: x.stat().st_mtime)
    print(f"Loading dataset: {latest_dataset.name}")

    with open(latest_dataset, 'rb') as f:
        phoneme_data = pickle.load(f)

    print(f"Loaded {len(phoneme_data)} phoneme types")

    # Create classification dataset
    classification_data = {
        'features': [],       # Neural features for each segment
        'labels': [],         # Phoneme labels
        'phoneme_to_id': {},  # Phoneme to numeric ID mapping
        'id_to_phoneme': {},  # Numeric ID to phoneme mapping
        'metadata': []        # Additional metadata for each sample
    }

    # Create phoneme to ID mapping
    unique_phonemes = sorted(phoneme_data.keys())
    for i, phoneme in enumerate(unique_phonemes):
        classification_data['phoneme_to_id'][phoneme] = i
        classification_data['id_to_phoneme'][i] = phoneme

    print(f"\nPhoneme mapping created for {len(unique_phonemes)} phonemes:")
    for i, phoneme in enumerate(unique_phonemes[:10]):  # Show first 10
        print(f" {i:2d}: '{phoneme}'")
    if len(unique_phonemes) > 10:
        print(f" ... and {len(unique_phonemes) - 10} more")

    # Processing statistics
    processing_stats = {
        'total_segments': 0,
        'successful_extractions': 0,
        'failed_extractions': 0,
        'sessions_processed': set(),
        'trials_processed': set()
    }

    print(f"\nExtracting neural features for each phoneme segment...")

    for phoneme, segments in phoneme_data.items():
        phoneme_id = classification_data['phoneme_to_id'][phoneme]
        print(f"Processing '{phoneme}' ({len(segments)} segments)...")

        for segment_idx, segment in enumerate(segments):
            processing_stats['total_segments'] += 1

            # Get trial information
            session = segment['session']
            trial_num = segment.get('trial_num', -1)

            processing_stats['sessions_processed'].add(session)
            processing_stats['trials_processed'].add((session, trial_num))

            # Try to load neural data for this trial
            neural_features = load_neural_data_for_trial(session, segment)

            if neural_features is not None:
                # Extract the specific time segment
                start_time = int(segment['start_time'])
                end_time = int(segment['end_time'])

                # Ensure valid time range
                if 0 <= start_time <= end_time < len(neural_features):
                    # Extract neural features for this time segment
                    segment_features = neural_features[start_time:end_time + 1]  # Include end_time

                    # Convert to numpy array and handle different cases
                    if isinstance(segment_features, torch.Tensor):
                        segment_features = segment_features.numpy()
                    elif isinstance(segment_features, list):
                        segment_features = np.array(segment_features)

                    # For classification, we need a fixed-size feature vector
                    # Option 1: Use mean across time steps
                    if len(segment_features.shape) == 2:  # (time, features)
                        feature_vector = np.mean(segment_features, axis=0)
                    elif len(segment_features.shape) == 1:  # Already 1D
                        feature_vector = segment_features
                    else:
                        print(f"Unexpected feature shape: {segment_features.shape}")
                        processing_stats['failed_extractions'] += 1
                        continue

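                    # Mean pooling gives every segment the same feature length
                    # regardless of duration; the temporal ordering within the
                    # segment is intentionally discarded.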
                    # Add to dataset
                    classification_data['features'].append(feature_vector)
                    classification_data['labels'].append(phoneme_id)
                    classification_data['metadata'].append({
                        'phoneme': phoneme,
                        'session': session,
                        'trial_num': trial_num,
                        'trial_idx': segment.get('trial_idx', -1),
                        'start_time': start_time,
                        'end_time': end_time,
                        'duration': end_time - start_time + 1,
                        'confidence': segment.get('confidence', 0.0),
                        'corpus': segment.get('corpus', 'unknown'),
                        'block_num': segment.get('block_num', -1)
                    })

                    processing_stats['successful_extractions'] += 1
                else:
                    processing_stats['failed_extractions'] += 1
            else:
                processing_stats['failed_extractions'] += 1

            # Progress update
            if processing_stats['total_segments'] % 5000 == 0:
                print(f" Processed {processing_stats['total_segments']} segments, extracted {processing_stats['successful_extractions']} features")

    print(f"\nDataset creation completed!")
    print(f"Total segments processed: {processing_stats['total_segments']}")
    print(f"Successful feature extractions: {processing_stats['successful_extractions']}")
    print(f"Failed extractions: {processing_stats['failed_extractions']}")
    print(f"Success rate: {processing_stats['successful_extractions'] / processing_stats['total_segments'] * 100:.1f}%")
    print(f"Sessions processed: {len(processing_stats['sessions_processed'])}")
    print(f"Unique trials processed: {len(processing_stats['trials_processed'])}")

    if processing_stats['successful_extractions'] == 0:
        print("No features were extracted. Check neural data availability.")
        return

    # Convert to numpy arrays
    classification_data['features'] = np.array(classification_data['features'])
    classification_data['labels'] = np.array(classification_data['labels'])

    print(f"\nFinal dataset shape:")
    print(f"Features: {classification_data['features'].shape}")
    print(f"Labels: {classification_data['labels'].shape}")

    # Show class distribution
    print(f"\nClass distribution:")
    unique_labels, counts = np.unique(classification_data['labels'], return_counts=True)
    for label_id, count in zip(unique_labels, counts):
        phoneme = classification_data['id_to_phoneme'][label_id]
        print(f" {label_id:2d} ('{phoneme}'): {count:4d} samples")

    # Save the classification dataset
    timestamp = latest_dataset.name.split('_')[-1].replace('.pkl', '')
    output_file = f"phoneme_classification_dataset_simple_{timestamp}.pkl"
    output_path = data_dir / output_file

    # Add processing stats to the dataset
    classification_data['processing_stats'] = processing_stats

    with open(output_path, 'wb') as f:
        pickle.dump(classification_data, f)

    print(f"\nClassification dataset saved to: {output_file}")

    # Create a simple train/test split
    create_train_test_split(classification_data, data_dir, timestamp)

    return classification_data


def create_train_test_split(data, data_dir, timestamp):
    """Create train/test split for the classification dataset"""

    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    print(f"\nCreating train/test split...")

    X = data['features']
    y = data['labels']
    metadata = data['metadata']

    # Split by session to avoid data leakage
    sessions = [meta['session'] for meta in metadata]
    unique_sessions = sorted(set(sessions))  # sorted for a reproducible split

    print(f"Available sessions: {len(unique_sessions)}")

    if len(unique_sessions) >= 4:
        # Use session-based split
        train_sessions = unique_sessions[:int(len(unique_sessions) * 0.8)]
        test_sessions = unique_sessions[int(len(unique_sessions) * 0.8):]

        train_indices = [i for i, meta in enumerate(metadata) if meta['session'] in train_sessions]
        test_indices = [i for i, meta in enumerate(metadata) if meta['session'] in test_sessions]

        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]

        print(f"Session-based split:")
        print(f" Train sessions: {len(train_sessions)}")
        print(f" Test sessions: {len(test_sessions)}")
    else:
        # Use random split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        print(f"Random split (stratified):")

    print(f" Train samples: {len(X_train)}")
    print(f" Test samples: {len(X_test)}")

    # Standardize features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

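    # The scaler is fit on the training data only (no test-set leakage) and is
    # saved with the split so that held-out or new data can be transformed the
    # same way at evaluation time.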
    # Save split data
    split_data = {
        'X_train': X_train_scaled,
        'X_test': X_test_scaled,
        'y_train': y_train,
        'y_test': y_test,
        'scaler': scaler,
        'phoneme_to_id': data['phoneme_to_id'],
        'id_to_phoneme': data['id_to_phoneme']
    }

    split_file = f"phoneme_classification_split_simple_{timestamp}.pkl"
    split_path = data_dir / split_file

    with open(split_path, 'wb') as f:
        pickle.dump(split_data, f)

    print(f"Train/test split saved to: {split_file}")


if __name__ == "__main__":
    try:
        classification_data = create_phoneme_classification_dataset()
    except Exception as e:
        print(f"Error creating classification dataset: {e}")
        import traceback
        traceback.print_exc()
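
# ---------------------------------------------------------------------------
# Example usage of the saved split (a minimal sketch, not executed here).
# It assumes scikit-learn is available and simply picks one of the files the
# glob below matches; the exact timestamp suffix depends on the source
# phoneme dataset.
#
#   import glob
#   import pickle
#   from sklearn.linear_model import LogisticRegression
#
#   split_files = sorted(glob.glob(
#       "phoneme_segmented_data/phoneme_classification_split_simple_*.pkl"))
#   with open(split_files[-1], "rb") as f:
#       split = pickle.load(f)
#
#   # Features are already standardized by the saved StandardScaler.
#   clf = LogisticRegression(max_iter=1000)
#   clf.fit(split["X_train"], split["y_train"])
#   print("Test accuracy:", clf.score(split["X_test"], split["y_test"]))
# ---------------------------------------------------------------------------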