from __future__ import print_function
from __future__ import absolute_import
from builtins import range
import numpy as np
import pandas as pd
import multiprocessing as mp
from munch import Munch
import os, random
from six import string_types
import keras
from .image_utils import *
from .generic import *
from .tensor_ops import *
# GENERATORS
class DataGeneratorDisk(keras.utils.Sequence):
    """
    Generates data for training Keras models.

    - inherits from keras.utils.Sequence
    - reads images from disk and applies `process_fn` to each
    - `process_fn` must ensure that all processed images have the same size
    - `__getitem__()` returns an ND-array containing `batch_size` images

    ARGUMENTS

    * ids (pandas.DataFrame): table containing image names and output variables
    * data_path (string): path of the image folder
    * batch_size (int): number of images to read at a time
    * shuffle (bool): randomize the reading order
    * ids_fn (function): returns an updated `ids` table, replacing `self.ids` at the end of each epoch
    * process_fn (function): function applied to each image as it is read
    * read_fn (function): function used to read data from a file (returns a numpy.ndarray);
                          if None, image_utils.read_image() is used (default)
    * deterministic (None, int): random seed for the shuffling order
    * inputs (list of strings): column names from `ids` containing image names
    * inputs_df (list of strings): column names from `ids`; values are taken from the DataFrame itself
    * outputs (list of strings): column names from `ids` containing output variables
    * verbose (bool): logging verbosity
    * fixed_batches (bool): return only full batches, dropping the last incomplete batch if needed
    * process_args (dict): maps each name in `inputs` to the `ids` column(s) whose row values
                           are passed as extra arguments to `process_fn`
    * group_names (list of strings): read only from the specified sub-paths (groups), or from any
                                     if `group_names` is None; when group_names is a list of lists,
                                     i.e. [[group_names_1], [group_names_2]], one group list is
                                     randomly sampled from the meta-groups for each batch
    * random_group (bool): read inputs from a random group for every image
    """
    def __init__(self, ids, data_path, **args):
        params_defa = Munch(ids           = ids,   data_path     = data_path,
                            batch_size    = 32,    shuffle       = True,
                            input_shape   = None,  ids_fn        = None,
                            process_fn    = None,  read_fn       = None,
                            deterministic = None,  inputs        = [],
                            inputs_df     = None,  outputs       = [],
                            verbose       = False, fixed_batches = False,
                            random_group  = False, group_names   = None,
                            process_args  = {},    group_by      = None)
        check_keys_exist(args, params_defa)
        params = updated_dict(params_defa, **args)  # update only existing keys
        # deterministic=True selects a fixed seed (42); False means no seed
        params.deterministic = {True: 42, False: None}.get(params.deterministic,
                                                           params.deterministic)
        params.process_args = params.process_args or {}
        params.group_names = params.group_names or ['']
        self.__dict__.update(**params)  # set each param as self.<param>

        if self.verbose > 1:
            print('Initialized DataGeneratorDisk')
        self.on_epoch_end()  # initialize batch indexes

    def __len__(self):
        """Get the number of batches per epoch"""
        if not self.group_by:
            round_op = np.floor if self.fixed_batches else np.ceil
            return int(round_op(len(self.ids) * 1. / self.batch_size))
        else:
            return int(self.ids_index.batch_index.max() + 1)

    def __getitem__(self, index):
        """Generate a batch of data"""
        if self.verbose:
            show_progress(index, len(self), prefix='Generating batches')
        ids_batch = self.ids[self.ids_index.batch_index == index]

        # reshuffle to remove the ordering introduced by the batch index
        if self.shuffle:
            ids_batch = ids_batch.reset_index(drop=True).\
                        sample(frac=1, random_state=self.deterministic)
        return self._data_generation(ids_batch)

    def on_epoch_end(self):
        """Updates batch selection after each epoch"""
        if self.ids_fn is not None:
            self.ids = self.ids_fn()

        if self.group_by:
            group_dict = dict(group_by=self.ids[self.group_by])
        else:
            group_dict = None
        self.ids_index = pd.DataFrame(group_dict,
                                      index=self.ids.index.copy())
        self.ids_index['batch_index'] = -1

        # assign a batch index to every row; rows sharing the same
        # `group_by` value are kept within the same batch
        index = 0
        selectable = self.ids_index.batch_index == -1
        while selectable.sum():
            ids_sel = self.ids_index[selectable]
            if self.group_by:
                group_by_value = ids_sel.group_by.sample(1,
                                 random_state=self.deterministic).values[0]
                ids_sel = ids_sel[ids_sel.group_by == group_by_value]
            batch_size_max = min(self.batch_size, len(ids_sel))
            if self.shuffle:
                ids_batch = ids_sel.sample(batch_size_max,
                                           random_state=self.deterministic)
            else:
                ids_batch = ids_sel.iloc[:batch_size_max]
            self.ids_index.loc[ids_batch.index, 'batch_index'] = index
            index += 1
            selectable = self.ids_index.batch_index == -1

    def _access_field(self, ids_batch, accessor):
        """Return the values of an `ids` column, stacking per-row arrays if needed"""
        if isinstance(ids_batch[accessor].values[0][0], np.ndarray):
            return np.stack(ids_batch[accessor].values.squeeze(), axis=0)
        else:
            return ids_batch[accessor].values

    def _read_data(self, ids_batch, accessor):
        """Read data from the DataFrame itself, via a column list or a callable"""
        X = []
        if accessor:
            assert isinstance(accessor, list) or callable(accessor),\
                   'Generator inputs/outputs must be of type list, or callable'
            if callable(accessor):
                X = accessor(ids_batch)
            elif isinstance(accessor, list):
                if all(isinstance(a, list) for a in accessor):
                    X = [self._access_field(ids_batch, a) for a in accessor]
                else:
                    assert all(not isinstance(a, list) for a in accessor)
                    X = [self._access_field(ids_batch, accessor)]
            else:
                raise Exception('Wrong generator input/output specifications')
        return X

    def _data_generation(self, ids_batch):
        """Generates an image-stack + outputs containing batch_size samples"""
        params = self
        np.random.seed(params.deterministic)

        y = self._read_data(ids_batch, params.outputs)
        X_list = self._read_data(ids_batch, params.inputs_df)

        assert isinstance(params.inputs, list),\
               'Generator inputs/outputs must be of type list'

        # `group_names` are randomly sampled from meta-groups,
        # i.e. when group_names = [[group_names_1], [group_names_2]]
        group_names = params.group_names
        if isinstance(group_names[0], list):
            idx = np.random.randint(0, len(group_names))
            group_names = group_names[idx]

        if isinstance(params.data_path, string_types):
            # get the data for each input and add it to X_list
            for group_name in group_names:
                if params.random_group:
                    group_path = os.path.join(params.data_path, group_name) or '.'
                    subdir_names = [f for f in os.listdir(group_path)
                                    if os.path.isdir(os.path.join(group_path, f))]
                    subgroup_name = random.choice(subdir_names)
                else:
                    subgroup_name = ''

                for input_name in params.inputs:
                    data = []
                    # read the data from disk into a list
                    for row in ids_batch.itertuples():
                        input_data = os.path.join(group_name, subgroup_name,
                                                  getattr(row, input_name))
                        if params.read_fn is None:
                            file_path = os.path.join(params.data_path, input_data)
                            file_data = read_image(file_path)
                        else:
                            file_data = params.read_fn(input_data, params)
                        data.append(file_data)

                    # column name(s) of the extra arguments to `process_fn`
                    args_name = params.process_args.get(input_name, None)

                    # if needed, process each image, and add to X_list (inputs list)
                    if params.process_fn not in [None, False]:
                        data_list = []
                        for i, row in enumerate(ids_batch.itertuples()):
                            args = []
                            if args_name is not None:
                                args = [getattr(row, name) for name in force_list(args_name)]
                            data_i = params.process_fn(data[i], *args)
                            data_list.append(force_list(data_i))

                        # transpose the list: sublists become batches
                        data_list = zip(*data_list)

                        # stack each sublist of arrays into a batch array
                        data_arrays = []
                        for batch_list in data_list:
                            batch_arr = np.float32(np.stack(batch_list))
                            data_arrays.append(batch_arr)
                        X_list.extend(data_arrays)
                    else:
                        data_array = np.float32(np.stack(data))
                        X_list.append(data_array)

        np.random.seed(None)
        return (X_list, y)
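
# Usage sketch for DataGeneratorDisk (hypothetical): the DataFrame columns,
# folder name, and processing function below are illustrative assumptions,
# not part of this module.
#
#   ids = pd.DataFrame({'image_name': ['im1.jpg', 'im2.jpg'],
#                       'score':      [0.5, 0.7]})
#   gen = DataGeneratorDisk(ids, 'images/',
#                           batch_size = 2,
#                           inputs     = ['image_name'],
#                           outputs    = ['score'],
#                           process_fn = lambda im: im / 255.)
#   X, y = gen[0]  # X: list with one float32 image stack, y: list with one score array
#
# To pass per-row arguments to `process_fn`, e.g. a (hypothetical) `size` column,
# map the input column to the argument column via process_args:
#   process_args = {'image_name': 'size'}  # process_fn(image, row.size) is called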

class DataGeneratorHDF5(DataGeneratorDisk):
    """
    Generates data for training Keras models.

    - similar to `DataGeneratorDisk`, but reads data instances (e.g. images,
      features) from an HDF5 file
    - inherits from `DataGeneratorDisk`, itself a child of keras.utils.Sequence
    - applies `process_fn` to each data instance
    - `process_fn` must ensure a fixed size for all processed data instances
    - `__getitem__()` returns an ND-array containing `batch_size` data instances

    ARGUMENTS

    * ids (pandas.DataFrame): table containing data instance names and output variables
    * data_path (string): path of the HDF5 file
    * batch_size (int): number of instances to read at a time
    * shuffle (bool): randomize the reading order
    * ids_fn (function): returns an updated `ids` table, replacing `self.ids` at the end of each epoch
    * process_fn (function): function applied to each data instance as it is read
    * deterministic (None, int): random seed for the shuffling order
    * inputs (list of strings): column names from `ids` containing data instance names, read from `data_path`
    * inputs_df (list of strings): column names from `ids`; values are taken from the DataFrame itself
    * outputs (list of strings): column names from `ids`; values are taken from the DataFrame itself
    * verbose (bool): logging verbosity
    * fixed_batches (bool): return only full batches, dropping the last incomplete batch if needed
    * process_args (dict): maps each name in `inputs` to the `ids` column whose row values
                           are passed as extra arguments to `process_fn`
    * group_names (list of strings): read only from the specified groups, or from any
                                     if `group_names` is None; when group_names is a list of lists,
                                     i.e. [[group_names_1], [group_names_2]], one group list is
                                     randomly sampled from the meta-groups for each batch
    * random_group (bool): read inputs from a random group for every data instance
    * memory_mapped (bool): open the HDF5 file memory-mapped (passed to H5Helper)
    """
    def __init__(self, ids, data_path, **args):
        params_defa = Munch(ids         = ids,   data_path     = data_path, deterministic = False,
                            batch_size  = 32,    shuffle       = True,      inputs        = [],
                            inputs_df   = None,  outputs       = [],        memory_mapped = False,
                            verbose     = False, fixed_batches = False,     random_group  = False,
                            process_fn  = None,  process_args  = None,      group_names   = None,
                            input_shape = None,  group_by      = None,      ids_fn        = None)
        check_keys_exist(args, params_defa)
        params = updated_dict(params_defa, **args)  # update only existing keys
        params.process_args = params.process_args or {}
        params.group_names = params.group_names or [None]
        # deterministic=True selects a fixed seed (42); False means no seed
        params.deterministic = {True: 42, False: None}.get(params.deterministic,
                                                           params.deterministic)
        self.__dict__.update(**params)  # set each param as self.<param>

        if self.verbose > 1:
            print('Initialized DataGeneratorHDF5')
        self.on_epoch_end()  # initialize batch indexes

    def _data_generation(self, ids_batch):
        """Generates data containing batch_size samples"""
        params = self
        np.random.seed(params.deterministic)

        y = self._read_data(ids_batch, params.outputs)
        X_list = self._read_data(ids_batch, params.inputs_df)

        assert isinstance(params.inputs, list),\
               'Generator inputs/outputs must be of type list'

        if isinstance(params.data_path, string_types):
            with H5Helper(params.data_path, file_mode='r',
                          memory_mapped=params.memory_mapped) as h:
                # `group_names` are randomly sampled from meta-groups,
                # i.e. when group_names = [[group_names_1], [group_names_2]]
                group_names = params.group_names
                if isinstance(group_names[0], list):
                    idx = np.random.randint(0, len(group_names))
                    group_names = group_names[idx]

                # get the data for each input and add it to X_list
                for group_name in group_names:
                    for input_name in params.inputs:
                        # read the instances named in the `input_name` column
                        names = list(ids_batch.loc[:, input_name])
                        if params.random_group:
                            data = h.read_data_random_group(names)
                        elif group_name is None:
                            data = h.read_data(names)
                        else:
                            data = h.read_data(names, group_names=[group_name])[0]
                        if data.dtype != np.float32:
                            data = data.astype(np.float32)

                        # column name of the extra argument to `process_fn`
                        args_name = params.process_args.get(input_name, None)

                        # if needed, process each instance, and add to X_list
                        if params.process_fn not in [None, False]:
                            data_new = None
                            for i, row in enumerate(ids_batch.itertuples()):
                                arg = [] if args_name is None else [getattr(row, args_name)]
                                data_i = params.process_fn(data[i, ...], *arg)
                                # allocate the batch array once the processed shape is known
                                if data_new is None:
                                    data_new = np.zeros((len(data),) + data_i.shape,
                                                        dtype=np.float32)
                                data_new[i, ...] = data_i
                            X_list.append(data_new)
                        else:
                            X_list.append(data)

        np.random.seed(None)
        return (X_list, y)
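
# Usage sketch for DataGeneratorHDF5 (hypothetical): the HDF5 file name and
# DataFrame columns below are illustrative assumptions, not part of this module.
# Each name in the `feature_name` column is assumed to identify a data instance
# inside the HDF5 file, as resolved by H5Helper.read_data().
#
#   ids = pd.DataFrame({'feature_name': ['feat1', 'feat2'],
#                       'label':        [0, 1]})
#   gen = DataGeneratorHDF5(ids, 'features.h5',
#                           batch_size = 2,
#                           inputs     = ['feature_name'],
#                           outputs    = ['label'])
#   X, y = gen[0]  # X: list with one float32 array of stacked instances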

class GeneratorStack(keras.utils.Sequence):
    """
    An aggregating generator that concatenates the inputs and outputs
    produced by multiple generators; all generators are assumed to yield
    the same number of batches per epoch.
    """
    def __init__(self, generator_list):
        self.gens = generator_list

    def __len__(self):
        """Number of batches per epoch (taken from the first generator)"""
        return len(self.gens[0])

    def __getitem__(self, index):
        """Generate one batch of data"""
        X, y = [], []
        for g in self.gens:
            X_, y_ = g[index]
            X.extend(X_)
            y.extend(y_)
        return (X, y)
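
# Usage sketch for GeneratorStack (hypothetical): `gen_images` and `gen_feats`
# stand for any two generators defined above, built on the same `ids` table.
# Note that with shuffle=True each generator draws its batches independently,
# so row alignment between them is not guaranteed unless a common
# `deterministic` seed is set (or shuffle=False is used).
#
#   gen_images = DataGeneratorDisk(ids, 'images/', inputs=['image_name'],
#                                  outputs=['score'], shuffle=False)
#   gen_feats  = DataGeneratorHDF5(ids, 'features.h5', inputs=['feature_name'],
#                                  outputs=[], shuffle=False)
#   gen = GeneratorStack([gen_images, gen_feats])
#   X, y = gen[0]  # X concatenates the input lists of both generators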