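'''
Data loading and preprocessing utilities: partition handling, training-csv
collection, feature normalization, windowing, and data assembly for the
stress task.
'''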
import os
from glob import glob

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from config import PARTITION_FILES, PATH_TO_FEATURES, PATH_TO_LABELS


def get_data_partition(partition_file):
'''
Reads mappings from subject ids to their partition and vice versa
:param partition_file: path to the partition file (csv with two columns: id, partition)
:return: dicts subject2partition, partition2subject
'''
subject2partition, partition2subject = {}, {}
    if not os.path.exists(partition_file):
        raise FileNotFoundError(f'Partition file not found: {os.path.abspath(partition_file)}')
    df = pd.read_csv(partition_file)
for row in df.values:
subject, partition = str(row[0]), row[-1]
subject2partition[subject] = partition
if partition not in partition2subject:
partition2subject[partition] = []
if subject not in partition2subject[partition]:
partition2subject[partition].append(subject)
return subject2partition, partition2subject
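
# Usage sketch (a hypothetical call; assumes the partition csv has rows like "12,train"):
#   subject2partition, partition2subject = get_data_partition(PARTITION_FILES['stress'])
#   subject2partition['12']      # -> 'train'
#   partition2subject['train']   # -> ['12', ...]
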
def get_all_training_csvs(task, feature):
'''
    Loads a list of all feature csv paths used for training a given task
    :param task: task name, e.g. 'humor', 'stress' or 'reaction'
    :param feature: name of the feature folder (e.g. 'egemaps')
    :return: list of csv paths
'''
_, partition_to_subject = get_data_partition(PARTITION_FILES[task])
feature_dir = os.path.join(PATH_TO_FEATURES[task], feature)
csvs = []
for subject in tqdm(partition_to_subject['train']):
        if task == 'stress':
            csvs.append(os.path.join(feature_dir, f'{subject}.csv'))
        elif task == 'reaction':
            # reaction ids are stored with surrounding characters (e.g. quotes); strip them
            subject = subject[1:-1]
            csvs.append(os.path.join(feature_dir, f'{subject}.csv'))
        elif task == 'humor':
            # humor features live in one directory per subject, one csv per segment
            csvs.extend(sorted(glob(os.path.join(feature_dir, subject, "*.csv"))))
return csvs
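
# Usage sketch (a hypothetical call; 'egemaps' is just an example folder name):
#   train_csvs = get_all_training_csvs('stress', 'egemaps')
#   # -> one csv path per training subject under PATH_TO_FEATURES['stress']/egemaps
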
def fit_normalizer(task, feature, feature_idx=2):
'''
Fits a sklearn StandardScaler based on training data
    :param task: task name, e.g. 'stress'
    :param feature: name of the feature folder (e.g. 'egemaps')
:param feature_idx: index in the feature csv where the features start
(typically 2, features starting after segment_id, timestamp)
:return: fitted sklearn.preprocessing.StandardScaler
'''
training_csvs = get_all_training_csvs(task, feature)
if task == 'reaction':
print('Concatenating csvs')
df = pd.concat([pd.read_csv(training_csv) for training_csv in tqdm(training_csvs)])
else:
df = pd.concat([pd.read_csv(training_csv) for training_csv in training_csvs])
    values = df.iloc[:, feature_idx:].values
    print('Scaling values')
    normalizer = StandardScaler().fit(values)
return normalizer
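
# Usage sketch (a hypothetical call; fit once on the training csvs, then reuse):
#   normalizer = fit_normalizer('stress', 'egemaps')
#   scaled = normalizer.transform(df.iloc[:, 2:].values)
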
def segment_stress(sample, win_len, hop_len):
    '''
    Segments a sample into overlapping windows of win_len rows, advancing by hop_len rows;
    the last window may be truncated at the sample end
    '''
    segmented_sample = []
    assert hop_len <= win_len and win_len >= 10, 'hop_len must not exceed win_len and win_len must be >= 10'
for s_idx in range(0, len(sample), hop_len):
e_idx = min(s_idx + win_len, len(sample))
segment = sample.iloc[s_idx:e_idx]
segmented_sample.append(segment)
if e_idx == len(sample):
break
return segmented_sample
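
# Worked example: for a 120-row sample with win_len=50 and hop_len=25 the windows
# cover rows [0:50], [25:75], [50:100] and [75:120]; the final window is truncated
# at the sample end and iteration stops once e_idx reaches len(sample).
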
def normalize_data(data, idx_list, column_name='feature'):
    '''
    Z-normalizes the column ranges in idx_list with mean/std of the train partition; NaNs become 0
    :param idx_list: list of (start_idx, end_idx) column ranges to normalize
    '''
    # np.vstack replaces the deprecated np.row_stack alias (identical behavior)
    train_data = np.vstack(data['train'][column_name])
    train_mean = np.nanmean(train_data, axis=0)
    train_std = np.nanstd(train_data, axis=0)
for partition in data.keys():
for i in range(len(data[partition][column_name])):
for s_idx, e_idx in idx_list:
data[partition][column_name][i][:, s_idx:e_idx] = \
(data[partition][column_name][i][:, s_idx:e_idx] - train_mean[s_idx:e_idx]) / (
train_std[s_idx:e_idx] + 1e-6)
data[partition][column_name][i][:, s_idx:e_idx] = np.where(
np.isnan(data[partition][column_name][i][:, s_idx:e_idx]), 0.0,
data[partition][column_name][i][:, s_idx:e_idx])
return data
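
# Usage sketch (the column range is an assumption; in practice idx_list is the
# list of cumulative feature offsets built in load_data_for_stress below):
#   data = normalize_data(data, idx_list=[(0, 88)])
#   # z-normalizes columns 0..87 of every sample with train-partition
#   # statistics and replaces remaining NaNs with 0
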
def load_data_for_stress(task, paths, feature_set, emo_dim, normalize=True, norm_opts=None):
    '''
    Builds the train/devel/test data dicts for the stress task
    :param paths: dict with keys 'features', 'labels' and 'partition'
    :param feature_set: list of feature folder names whose csvs are concatenated column-wise
    :param emo_dim: emotion dimension; both valence and physio-arousal labels are loaded regardless
    :param norm_opts: list of 'y'/'n' flags, one per feature; required if normalize is True
    :return: dict partition -> {'feature', 'label_valence', 'label_arousal', 'meta', 'feature_dims'}
    '''
    feature_path = paths['features']
    label_path = paths['labels']
    print('Constructing data from scratch ...')
    data = {'train': {'feature': [], 'label_valence': [], 'label_arousal': [], 'meta': []},
            'devel': {'feature': [], 'label_valence': [], 'label_arousal': [], 'meta': []},
            'test': {'feature': [], 'label_valence': [], 'label_arousal': [], 'meta': []}}
vid2partition, partition2vid = get_data_partition(paths['partition'])
feature_dims = [0] * len(feature_set)
feature_idx = 2
for partition, vids in partition2vid.items():
for vid in vids:
sample_data = []
for i, feature in enumerate(feature_set):
feature_file = os.path.join(feature_path, feature, vid + '.csv')
assert os.path.exists(
feature_file), f'Error: no available "{feature}" feature file for video "{vid}": "{feature_file}".'
df = pd.read_csv(feature_file)
feature_dims[i] = df.shape[1] - feature_idx
                if i == 0:
                    # the first feature csv keeps its meta columns (segment id, timestamp)
                    feature_data = df
                    segment_ids_per_step = df.iloc[:, 1]
                else:
                    feature_data = df.iloc[:, feature_idx:]
sample_data.append(feature_data)
data[partition]['feature_dims'] = feature_dims
label_file_valence = os.path.join(label_path, "valence", vid + '.csv')
label_file_arousal = os.path.join(label_path, "physio-arousal", vid + '.csv')
            assert os.path.exists(
                label_file_valence), f'Error: no available valence label file for video "{vid}": "{label_file_valence}".'
            assert os.path.exists(
                label_file_arousal), f'Error: no available physio-arousal label file for video "{vid}": "{label_file_arousal}".'
            df_valence = pd.read_csv(label_file_valence)
            df_arousal = pd.read_csv(label_file_arousal)
label_data_valence = pd.DataFrame(data=df_valence['value'].values, columns=["valence"])
label_data_arousal = pd.DataFrame(data=df_arousal['value'].values, columns=["arousal"])
sample_data.append(label_data_valence)
sample_data.append(label_data_arousal)
sample_data = pd.concat(sample_data, axis=1)
if partition != 'test':
sample_data = sample_data.dropna()
            # no windowing at load time: each video is kept as a single segment
            samples = [sample_data]
            for segment in samples:
                n_emo_dims = 2
                if len(segment.iloc[:, feature_idx:-n_emo_dims].values) > 0:
meta = np.column_stack((np.array([int(vid)] * len(segment)),
segment.iloc[:, :feature_idx].values))
data[partition]['meta'].append(meta)
data[partition]['label_valence'].append(segment.iloc[:, -2:-1].values)
data[partition]['label_arousal'].append(segment.iloc[:, -1:].values)
data[partition]['feature'].append(segment.iloc[:, feature_idx:-n_emo_dims].values)
if normalize:
idx_list = []
assert norm_opts is not None and len(norm_opts) == len(feature_set)
        norm_opts = [norm_opt == 'y' for norm_opt in norm_opts]
        print(f'Feature dims: {feature_dims} ({feature_set})')
        # cumulative column offsets: feature i spans columns [feature_dims[i], feature_dims[i+1])
        feature_dims = [0] + np.cumsum(feature_dims).tolist()
norm_feature_set = []
for i, (s_idx, e_idx) in enumerate(zip(feature_dims[0:-1], feature_dims[1:])):
norm_opt, feature = norm_opts[i], feature_set[i]
if norm_opt:
norm_feature_set.append(feature)
idx_list.append([s_idx, e_idx])
print(f'Normalized features: {norm_feature_set}')
data = normalize_data(data, idx_list)
return data
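

if __name__ == '__main__':
    # Minimal smoke test; a sketch only. 'egemaps' and the single 'y' norm flag
    # are assumptions, and PATH_TO_LABELS is assumed to be a dict keyed by task
    # name like PATH_TO_FEATURES.
    paths = {'features': PATH_TO_FEATURES['stress'],
             'labels': PATH_TO_LABELS['stress'],
             'partition': PARTITION_FILES['stress']}
    data = load_data_for_stress('stress', paths, feature_set=['egemaps'],
                                emo_dim='valence', normalize=True, norm_opts=['y'])
    print({partition: len(d['feature']) for partition, d in data.items()})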