-
Notifications
You must be signed in to change notification settings - Fork 42
/
Copy pathtrain.py
executable file
·695 lines (662 loc) · 44.5 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
#!/usr/bin/env python
"""
Train a new model for event or relation detection.
"""
import sys, os
from Utils.InteractionXML.DeleteElements import getEmptyCorpus
import Utils.InteractionXML.Catenate as Catenate
import Utils.Stream as Stream
import Utils.Settings as Settings
import Utils.Parameters as Parameters
from Utils.Connection.Connection import getConnection
import Utils.STFormat.Compare
import Utils.InteractionXML.Subset
import shutil
import atexit
import types
import tempfile
from Core.Model import Model
from Detectors.StepSelector import StepSelector
from Detectors.Preprocessor import Preprocessor
from Detectors.StructureAnalyzer import StructureAnalyzer
from Detectors.EventDetector import EventDetector
from Evaluators import EvaluateInteractionXML
import copy
def train(output, task=None, detector=None, inputFiles=None, models=None, parse=None,
          processUnmerging=None, processModifiers=None,
          bioNLPSTParams=None, preprocessorParams=None, exampleStyles=None,
          classifierParams=None, doFullGrid=False, deleteOutput=False, copyFrom=None,
          log="log.txt", step=None, omitSteps=None, debug=False, connection=None, subset=None,
          folds=None, corpusDir=None, corpusPreprocessing=None, evaluator=None):
    """
    Train a new model for event or relation detection.
    @param output: A directory where output files will appear.
    @param task: If defined, overridable default settings are used for many of the training parameters. Must be one of the supported TEES tasks.
    @param detector: a Detector object, or a string defining one to be imported
    @param inputFiles: A dictionary of file names, with keys "train", "devel" and, "test"
    @param models: A dictionary of file names defining the place for the new models, with keys "devel" and, "test"
    @param parse: The parse element name in the training interaction XML
    @param processUnmerging: Use the unmerging step of EventDetector. True, False or None for task default.
    @param processModifiers: Use the modifier detection step of EventDetector. True, False or None for task default.
    @param bioNLPSTParams: Parameters controlling BioNLP ST format output.
    @param preprocessorParams: Parameters controlling the preprocessor. Not used for training, but saved to the model for use when classifying.
    @param exampleStyles: A parameter set for controlling example builders.
    @param classifierParams: A parameter set for controlling classifiers.
    @param doFullGrid: Whether all parameters, as opposed to just recall adjustment, are tested in the EventDetector grid search.
    @param deleteOutput: Remove an existing output directory
    @param copyFrom: Copy an existing output directory for use as a template
    @param log: An optional alternative name for the log file. None is for no logging.
    @param step: A step=substep pair, where the steps are "TRAIN", "DEVEL", "EMPTY" and "TEST"
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param connection: A parameter set defining a local or remote connection for training the classifier
    @param subset: A parameter set for making subsets of input files
    @param folds: A dictionary (keys "train", "devel", "test") of fold id lists for carving sets out of the training file
    @param corpusDir: An alternative directory from which task corpora are loaded (defaults to Settings.CORPUS_DIR)
    @param corpusPreprocessing: If defined, a preprocessor step string applied to the merged corpus before training
    @param evaluator: An optional evaluator class (or importable class name) used instead of the detector's default
    """
    # Insert default arguments where needed
    inputFiles = setDictDefaults(inputFiles, {"train":None, "devel":None, "test":None})
    models = setDictDefaults(models, {"devel":"model-devel", "test":"model-test"})
    exampleStyles = setDictDefaults(exampleStyles, {"examples":None, "trigger":None, "edge":None, "unmerging":None, "modifiers":None})
    classifierParams = setDictDefaults(classifierParams, {"examples":None, "trigger":None, "recall":None, "edge":None, "unmerging":None, "modifiers":None})
    subset = setDictDefaults(Parameters.get(subset), {"train":None, "devel":None, "test":None, "seed":0, "all":None})
    folds = setDictDefaults(folds, {"train":None, "devel":None, "test":None})
    processUnmerging = getDefinedBool(processUnmerging)
    processModifiers = getDefinedBool(processModifiers)
    # Initialize working directory
    workdir(output, deleteOutput, copyFrom, log)
    # Get task specific parameters
    useKerasDetector = False
    if detector != None and "keras" in detector.lower():
        print >> sys.stderr, "Using a Keras Detector"
        useKerasDetector = True
        if detector.lower() == "keras":
            # Plain "keras" means: auto-detect the detector type, but use its Keras variant
            detector = None
    detector, bioNLPSTParams, preprocessorParams, folds = getTaskSettings(task, detector,
        bioNLPSTParams, preprocessorParams, inputFiles, exampleStyles, classifierParams, folds, corpusDir=corpusDir, useKerasDetector=useKerasDetector)
    # Learn training settings from input files
    detector = learnSettings(inputFiles, detector, classifierParams, task, exampleStyles, useKerasDetector=useKerasDetector)
    # Get corpus subsets
    getFolds(inputFiles, folds)
    getSubsets(inputFiles, subset)
    if task != None:
        task = task.replace("-FULL", "")
        if "." in task:
            # Modifier detection is only defined for subtask 3
            _, subTask = getSubTask(task)
            if subTask != 3:
                processModifiers = False
    # Preprocess the corpus if required
    if corpusPreprocessing != None:
        preprocessor = Preprocessor(steps=corpusPreprocessing)
        assert preprocessor.steps[0].name == "MERGE_SETS"
        assert preprocessor.steps[-1].name == "DIVIDE_SETS"
        preprocessedCorpusDir = os.path.join(output, "corpus")
        preprocessor.process(inputFiles, os.path.join(preprocessedCorpusDir, task))
        # The preprocessor divides the merged corpus back into per-set files
        for setName in inputFiles.keys():
            if inputFiles[setName] != None:
                inputFiles[setName] = os.path.join(preprocessedCorpusDir, task + "-" + setName + ".xml")
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["TRAIN", "DEVEL", "EMPTY", "TEST"])
    # Initialize the detector
    detector, detectorName = getDetector(detector, evaluator=evaluator)
    evaluator, evaluatorName = importClass(evaluator, "evaluator")
    detector = detector() # initialize object
    if evaluator != None:
        print >> sys.stderr, "Using evaluator", evaluator.__name__
        detector.evaluator = evaluator
    detector.debug = debug
    detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams)
    #detector.useBioNLPSTFormat = useBioNLPSTFormat # classify-output and grid evaluation in ST-format
    #detector.stWriteScores = True # write confidence scores into additional st-format files
    connection = getConnection(connection)
    detector.setConnection(connection)
    connection.debug = debug
    if deleteOutput:
        connection.clearWorkDir()

    # Train
    if selector.check("TRAIN"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------------ Train Detector ------------------"
        print >> sys.stderr, "----------------------------------------------------"
        if not isinstance(detector, EventDetector):
            # Single-stage detectors take one example style and parameter set
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["examples"], classifierParams["examples"], parse, None, task,
                           fromStep=detectorSteps["TRAIN"], workDir="training", testData=inputFiles["test"])
        else:
            # The EventDetector pipeline takes per-stage styles and parameters
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["trigger"], exampleStyles["edge"], exampleStyles["unmerging"], exampleStyles["modifiers"],
                           classifierParams["trigger"], classifierParams["edge"], classifierParams["unmerging"], classifierParams["modifiers"],
                           classifierParams["recall"], processUnmerging, processModifiers,
                           doFullGrid, task, parse, None,
                           fromStep=detectorSteps["TRAIN"], workDir="training", testData=inputFiles["test"])
        # Save the detector type
        for model in [models["devel"], models["test"]]:
            if model != None and os.path.exists(model):
                model = Model(model, "a")
                model.addStr("detector", detectorName)
                if evaluatorName != None:
                    # BUGFIX: this was addStr("detector", evaluatorName), which overwrote
                    # the detector name saved above; store the evaluator under its own key.
                    model.addStr("evaluator", evaluatorName)
                if preprocessorParams != None:
                    preprocessor = Preprocessor()
                    model.addStr("preprocessorParams", Parameters.toString(preprocessor.getParameters(preprocessorParams)))
                model.save()
                model.close()
    if selector.check("DEVEL"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Check devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        detector.classify(inputFiles["devel"], models["devel"], "classification-devel/devel", goldData=inputFiles["devel"], fromStep=detectorSteps["DEVEL"], workDir="classification-devel")
    if selector.check("EMPTY"):
        # By passing an emptied devel set through the prediction system, we can check that we get the same predictions
        # as in the DEVEL step, ensuring the model does not use leaked information.
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Empty devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        removalScope = "non-given"
        if "names" in str(exampleStyles["examples"]) or "names" in str(exampleStyles["trigger"]):
            removalScope = "all"
        elif "Edge" in detector.__class__.__name__:
            removalScope = "interactions"
        detector.classify(getEmptyCorpus(inputFiles["devel"], scope=removalScope), models["devel"], "classification-empty/devel-empty", fromStep=detectorSteps["EMPTY"], workDir="classification-empty")
        print >> sys.stderr, "*** Evaluate empty devel classification ***"
        if os.path.exists("classification-empty/devel-empty-pred.xml.gz"):
            EvaluateInteractionXML.run(detector.evaluator, "classification-empty/devel-empty-pred.xml.gz", inputFiles["devel"], parse)
        else:
            print >> sys.stderr, "No output file for evaluation"
    if selector.check("TEST"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------- Test set classification --------------"
        print >> sys.stderr, "----------------------------------------------------"
        if inputFiles["test"] == None or not os.path.exists(inputFiles["test"]):
            print >> sys.stderr, "Skipping, test file", inputFiles["test"], "does not exist"
        else:
            #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
            detector.classify(inputFiles["test"], models["test"] if models["test"] != None else models["devel"], "classification-test/test", fromStep=detectorSteps["TEST"], workDir="classification-test")
            if detector.bioNLPSTParams["convert"]:
                extension = ".zip" if (detector.bioNLPSTParams["convert"] == "zip") else ".tar.gz"
                Utils.STFormat.Compare.compare("classification-test/test-events" + extension, "classification-devel/devel-events" + extension, "a2")
    # Stop logging
    if log != None:
        Stream.closeLog(log)
def setDictDefaults(dictionary, defaults):
    """Normalize a parameter dictionary in place and fill in missing defaults.

    String values "None" are converted to real None. If no dictionary is
    given, a copy of the defaults is returned instead.
    """
    if dictionary is None:
        return dict(defaults)
    # Convert the string "None" (e.g. from command line options) into None
    for key, value in list(dictionary.items()):
        if value == "None":
            dictionary[key] = None
    # Add defaults only for keys the caller did not define at all
    for key, value in defaults.items():
        dictionary.setdefault(key, value)
    return dictionary
def getSteps(step, omitSteps, mainSteps):
    """Resolve step/omitSteps parameter strings into a StepSelector plus substep maps.

    @param step: a step=substep definition of where processing starts (one start point only)
    @param omitSteps: step=substep definitions of what to skip (True skips the whole main step)
    @param mainSteps: the ordered list of valid main step names
    @return: (StepSelector, fromSubStep dict, omitSubSteps dict)
    """
    # Determine substep to start from, for the main step from which processing starts
    step = Parameters.get(step, mainSteps)
    fromMainStep = None
    fromSubStep = {} # The substep to start from, for the main step to start from
    for name in step.keys():
        value = step[name]
        fromSubStep[name] = value # the sub step to start from
        if value == None:
            continue
        assert fromMainStep == None # processing can start from one place only
        fromMainStep = name
        if value == True:
            fromSubStep[name] = None
        else:
            assert type(value) in types.StringTypes # no list allowed, processing can start from one place only
    # Determine steps to omit
    omitSubSteps = {} # Skip these substeps. If the value is True, skip the entire main step.
    omitMainSteps = []
    omitSteps = Parameters.get(omitSteps, mainSteps)
    for name, value in omitSteps.items():
        omitSubSteps[name] = value
        if value == True:
            omitMainSteps.append(name)
            omitSubSteps[name] = None
    # Initialize main step selector
    if fromMainStep != None:
        if fromSubStep[fromMainStep] != None:
            print >> sys.stderr, "Starting process from step", fromMainStep + ", substep", fromSubStep[fromMainStep]
        else:
            print >> sys.stderr, "Starting process from step", fromMainStep
    selector = StepSelector(mainSteps, fromStep=fromMainStep, omitSteps=omitMainSteps)
    return selector, fromSubStep, omitSubSteps
def importClass(cls, category=""):
    """
    Resolve *cls* into a class object.

    @param cls: None, an already-imported class object, or a string: either a
        dotted module path ending in the class name (e.g. "Detectors.EventDetector")
        or a full import statement starting with "from".
    @param category: A label used only for log messages (e.g. "detector", "evaluator").
    @return: (classObject, className) tuple; (None, None) if cls is None.
        For string input, className is the original string.
    """
    if cls == None:
        return None, None
    elif type(cls) in types.StringTypes:
        className = cls
        print >> sys.stderr, "Importing", category, cls
        # NOTE(review): the string is executed with exec/eval, so 'cls' must
        # come from trusted configuration only, never from untrusted input.
        if cls.startswith("from"):
            # The string is a complete import statement, e.g. "from X.Y import Z"
            exec cls
            cls = eval(cls.split(".")[-1])
        else:
            # The string is a dotted path; the last component is the class name
            exec "from " + cls + " import " + cls.split(".")[-1]
            cls = eval(cls.split(".")[-1])
    else: # assume it is a class
        className = cls.__name__
        print >> sys.stderr, "Using", category, className
    return cls, className
def getDetector(detector, model=None, evaluator=None):
    """Resolve the detector class, reading its name from a saved model if not given.

    @param detector: a detector class/name, or None to load the name from *model*
    @param model: path of an existing model (required when detector is None)
    @param evaluator: accepted for interface compatibility; not used here
    @return: (detectorClass, detectorName) as produced by importClass
    """
    if detector is None:
        # No detector defined: read the stored detector name from the model
        assert model != None
        savedModel = Model(model, "r")
        detector = savedModel.getStr("detector")
        savedModel.close()
    return importClass(detector, "detector")
def getSubsets(inputFiles, subset, outdir="training"):
    """Replace input corpora with random subsets where subset fractions are defined.

    Modifies inputFiles in place. A per-dataset fraction overrides the "all"
    fraction; datasets with no fraction (or no input file) are left untouched.
    """
    for dataset in ("devel", "train", "test"):
        if inputFiles[dataset] in (None, "None"):
            continue
        # Prefer the dataset-specific fraction, fall back to the shared one
        fraction = subset[dataset] if subset[dataset] is not None else subset["all"]
        if fraction is None:
            continue
        if outdir is None:
            outdir = tempfile.mkdtemp()
        outFileName = os.path.join(outdir, "subset_" + str(fraction) + "_" + str(subset["seed"]) + "_" + os.path.basename(inputFiles[dataset]))
        # Reuse an already-generated subset file if one exists
        if not os.path.exists(outFileName):
            Utils.InteractionXML.Subset.getSubset(inputFiles[dataset], outFileName, float(fraction), subset["seed"])
        inputFiles[dataset] = outFileName
def getFolds(inputFiles, folds, outdir="training"):
    """Carve devel/train/test corpora out of the training file by fold attributes.

    Modifies inputFiles in place. Does nothing unless both a train and a devel
    fold set are defined. The string "None" disables a dataset entirely.
    """
    if folds["train"] is None or folds["devel"] is None:
        return
    #assert inputFiles["devel"] in [None, "None"]
    #assert inputFiles["test"] in [None, "None"]
    origTrainFile = inputFiles["train"]
    for dataset in ("devel", "train", "test"):
        currentFold = folds[dataset]
        if currentFold == "None":
            # Explicitly disabled: this dataset gets no file at all
            inputFiles[dataset] = None
            continue
        if currentFold is None:
            continue
        # A single fold id may be given as a plain string
        if type(currentFold) in types.StringTypes:
            currentFold = [currentFold]
        idString = "_".join(currentFold).replace("train", "t")
        if outdir is None:
            outdir = tempfile.mkdtemp()
        outFileName = os.path.join(outdir, dataset + "-" + idString + ".xml")
        # Reuse an already-extracted fold file if one exists
        if not os.path.exists(outFileName):
            Utils.InteractionXML.Subset.getSubset(origTrainFile, outFileName, attributes={"set":currentFold})
        inputFiles[dataset] = outFileName
def workdir(path, deleteIfExists=True, copyFrom=None, log="log.txt"):
    """
    Prepare and enter the output working directory.

    Creates (or optionally clears / copies from a template) the directory,
    changes into it, registers an atexit hook to restore the original
    working directory, and opens the log file inside it.

    @param path: the output directory
    @param deleteIfExists: remove an existing directory first (forced True when copyFrom is set)
    @param copyFrom: an existing directory to copy as a template
    @param log: log file name (relative paths go under the workdir); None disables logging
    @return: path
    """
    # When using a template, always remove existing work directory
    if copyFrom != None:
        deleteIfExists = True
    # Remove existing work directory, if requested to do so
    if os.path.exists(path) and deleteIfExists:
        print >> sys.stderr, "Output directory exists, removing", path
        shutil.rmtree(path)
    # Create work directory if needed
    if not os.path.exists(path):
        if copyFrom == None:
            print >> sys.stderr, "Making output directory", path
            os.makedirs(path)
        else:
            # BUGFIX: referenced the undefined global 'options.copyFrom'
            # (NameError at runtime); use the copyFrom parameter instead.
            print >> sys.stderr, "Copying template from", copyFrom, "to", path
            shutil.copytree(copyFrom, path)
    else:
        print >> sys.stderr, "Using existing output directory", path
    # Remember current directory and switch to workdir
    atexit.register(os.chdir, os.getcwd())
    os.chdir(path)
    # Open log (if a relative path, it goes under workdir)
    if log != None:
        Stream.openLog(log)
    else:
        print >> sys.stderr, "No logging"
    return path
def learnSettings(inputFiles, detector, classifierParameters, task, exampleStyles, useKerasDetector=False):
    """
    Determine the detector class and fill in default training parameters.

    If no detector is given, the train/devel corpora are analyzed with
    StructureAnalyzer (results cached in training/structure.txt) to choose
    between entity, edge and event detection. Default classifier parameter
    grids are then added for the chosen detector, and for Keras detectors
    the task-specific example styles are composed.

    @param inputFiles: dictionary of corpus paths with keys "train", "devel", "test"
    @param detector: a detector class name string, or None to auto-detect
    @param classifierParameters: classifier parameter dictionary, updated in place
    @param task: the TEES task identifier (may include a ".subtask" suffix)
    @param exampleStyles: example style dictionary, updated in place
    @param useKerasDetector: if True, the Keras variant of the detector is used
    @return: the (possibly auto-detected) detector class name string
    """
    if detector == None:
        print >> sys.stderr, "*** Analyzing input files to determine training settings ***"
        structureAnalyzer = StructureAnalyzer()
        if not os.path.exists("training/structure.txt"):
            # Only train and devel sets are analyzed; missing files are filtered out
            datasets = sorted(filter(None, [inputFiles["train"], inputFiles["devel"]]))
            print >> sys.stderr, "input files:", datasets
            structureAnalyzer.analyze(datasets)
            print >> sys.stderr, structureAnalyzer.toString()
            structureAnalyzer.save(None, "training/structure.txt")
        else:
            # Reuse the cached analysis from a previous run
            print >> sys.stderr, "Using existing analysis from training/structure.txt"
            structureAnalyzer.load(None, "training/structure.txt")
    # Choose detector
    if detector == None:
        # Pick the detector type from the targets found in the corpus structure
        if "ENTITY" in structureAnalyzer.targets and "INTERACTION" in structureAnalyzer.targets:
            detector = "Detectors.EventDetector"
        elif "ENTITY" in structureAnalyzer.targets:
            detector = "Detectors.EntityDetector"
        elif "INTERACTION" in structureAnalyzer.targets:
            detector = "Detectors.EdgeDetector"
        else:
            assert False, structureAnalyzer.targets
    if useKerasDetector and not "Keras" in detector:
        # Map e.g. "Detectors.EventDetector" -> "Detectors.KerasEventDetector"
        detector = detector.replace("Detectors.", "Detectors.Keras")
    print >> sys.stderr, "Using detector '" + str(detector) + "'"
    # Set default parameters
    cp = classifierParameters
    if detector == "Detectors.EventDetector":
        # Add common classifier parameters
        if cp["examples"] != None:
            # Shared "examples" parameters are prepended to every stage
            cp["unmerging"] = Parameters.cat(cp["examples"], cp["unmerging"])
            cp["modifiers"] = Parameters.cat(cp["examples"], cp["modifiers"])
            cp["edge"] = Parameters.cat(cp["examples"], cp["edge"])
            cp["trigger"] = Parameters.cat(cp["examples"], cp["trigger"])
        # Default grid-search ranges for each EventDetector stage
        cp["unmerging"] = Parameters.cat("c=1,10,100,500,1000,1500,2500,5000,10000,20000,50000,80000,100000", cp["unmerging"], "Classifier parameters for unmerging")
        cp["modifiers"] = Parameters.cat("c=5000,10000,20000,50000,100000", cp["modifiers"], "Classifier parameters for modifiers")
        cp["edge"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", cp["edge"], "Classifier parameters for edges")
        cp["trigger"] = Parameters.cat("c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000", cp["trigger"], "Classifier parameters for triggers")
        cp["recall"] = Parameters.cat("0.5,0.6,0.65,0.7,0.85,1.0,1.1,1.2", cp["recall"], "Recall adjustment parameters")
    elif detector == "Detectors.EntityDetector":
        cp["examples"] = Parameters.cat("c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000", cp["examples"], "Classifier parameters for entities")
    elif detector == "Detectors.EdgeDetector":
        cp["examples"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", cp["examples"], "Classifier parameters for edges")
    elif detector == "Detectors.UnmergingDetector":
        cp["examples"] = Parameters.cat("c=1,10,100,500,1000,1500,2500,5000,10000,20000,50000,80000,100000", cp["examples"], "Classifier parameters for unmerging")
    #######################################################################
    # Keras example styles
    #######################################################################
    if useKerasDetector:
        task, subTask = getSubTask(task)
        msg = "Keras example style"
        #overrideStyles = {x:(Parameters.get(exampleStyles[x]) if (exampleStyles[x] != None and "override" in exampleStyles[x]) else {"override":True}) for x in exampleStyles}
        # User-provided "override"/"override_all" styles replace (rather than
        # extend) the task defaults; they are collected here and applied last.
        overrideStyles = {"all":{}}
        for key in exampleStyles:
            overrideStyles[key] = {}
            params = Parameters.get(exampleStyles[key])
            if "override" in params:
                exampleStyles[key] = None
                overrideStyles[key] = params
                overrideStyles[key].pop("override")
            elif "override_all" in params:
                exampleStyles[key] = None
                overrideStyles["all"] = params
                overrideStyles["all"].pop("override_all")
            #exampleStyles[key] = exampleStyles[key] if (exampleStyles[key] != None and not "override" in exampleStyles[key]) else None
        print >> sys.stderr, "Override styles:", overrideStyles
        if "EventDetector" in detector:
            if task == "EPI11":
                exampleStyles["trigger"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20:epi_merge_negated", exampleStyles["trigger"])
            else:
                exampleStyles["trigger"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20", exampleStyles["trigger"])
            if task in ["GE09", "GE11", "GE13"] and subTask == 1:
                exampleStyles["edge"] = Parameters.cat("keras:genia_task1:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20", exampleStyles["edge"])
            else:
                exampleStyles["edge"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20", exampleStyles["edge"])
            exampleStyles["unmerging"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20", exampleStyles["unmerging"])
            exampleStyles["modifiers"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=4:el=41:mods=20", exampleStyles["modifiers"])
        elif "EntityDetector" in detector:
            if task == "DDI13T91":
                exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20:names:build_for_nameless", exampleStyles["examples"])
            else:
                exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20", exampleStyles["examples"])
        elif "EdgeDetector" in detector:
            if "DDI" in task:
                exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=0:do=0.2:dense=800:ol=50:mods=20", exampleStyles["examples"])
            elif task == "CP17":
                exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=0:do=0.2:ol=50:skip_labels=CPR\:0,CPR\:1,CPR\:2,CPR\:7,CPR\:8,CPR\:10:mods=20", exampleStyles["examples"])
            else:
                exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20", exampleStyles["examples"])
        print >> sys.stderr, "Keras initial example styles:", exampleStyles
        # Apply the collected override styles on top of the defaults
        for key in exampleStyles:
            if exampleStyles[key] != None:
                exampleStyles[key] = Parameters.get(exampleStyles[key])
                exampleStyles[key].update(overrideStyles[key])
                exampleStyles[key].update(overrideStyles["all"])
                exampleStyles[key] = Parameters.toString(exampleStyles[key])
                print >> sys.stderr, "Keras final example style for " + key + ": ", exampleStyles[key]
    return detector
def getSubTask(task):
    """Split a task id like "GE11.1" into its base name and integer subtask.

    @return: (baseTask, subTask) where subTask is None if the id has no dot
    """
    if "." not in task:
        return task, None
    baseTask, subTaskString = task.split(".")
    return baseTask, int(subTaskString)
def getTaskSettings(task, detector, bioNLPSTParams, preprocessorParams,
inputFiles, exampleStyles, classifierParameters, folds, corpusDir=None, useKerasDetector=False):
if task != None:
print >> sys.stderr, "*** Defining training settings for task", task, "***"
fullTaskId = task
task, subTask = getSubTask(task)
if corpusDir == None:
corpusDir = Settings.CORPUS_DIR
print >> sys.stderr, "Loading corpus", task, "from", corpusDir
for dataset in ["devel", "train", "test"]:
if inputFiles[dataset] == None:
if task.startswith("DDI13") and task != "DDI13":
if dataset in ["devel", "train"]:
inputFiles[dataset] = os.path.join(corpusDir, "DDI13-train.xml")
elif dataset == "test":
if task.endswith("T91"):
inputFiles[dataset] = os.path.join(corpusDir, "DDI13-test-task9.1.xml")
elif task.endswith("T92") or task.endswith("FULL"):
inputFiles[dataset] = os.path.join(corpusDir, "DDI13-test-task9.2.xml")
elif task == "ID11" and dataset == "train":
inputFiles[dataset] = Catenate.catenate([os.path.join(corpusDir, "ID11-train.xml"), os.path.join(corpusDir, "GE11-devel.xml"),
os.path.join(corpusDir, "GE11-train.xml")], "training/ID11-train-and-GE11-devel-and-train.xml.gz", fast=True)
else:
inputFiles[dataset] = os.path.join(corpusDir, task.replace("-FULL", "") + "-"+dataset+".xml")
if inputFiles[dataset] == "skip":
inputFiles[dataset] = None
if inputFiles[dataset] != None and not os.path.exists(inputFiles[dataset]):
fullPath = os.path.join(Settings.CORPUS_DIR, inputFiles[dataset])
if os.path.exists(fullPath):
inputFiles[dataset] = fullPath
else:
inputFiles[dataset] = None
print >> sys.stderr, "Input file", inputFiles[dataset], "for set '" + dataset + "' does not exist, skipping."
assert inputFiles["train"] != None # at least training set must exist
# Example generation parameters
if detector == None:
if task == "CO11":
detector = "Detectors.CODetector"
elif task in ["BI11-FULL", "DDI11-FULL", "DDI13-FULL", "BB_EVENT_16-FULL"]:
detector = "Detectors.EventDetector"
elif task.startswith("DDI13"):
if task.endswith("T91"):
detector = "Detectors.EntityDetector"
elif task.endswith("T92") or task == "DDI13":
detector = "Detectors.EdgeDetector"
#######################################################################
# BioNLP Shared Task and preprocessing parameters
#######################################################################
if task == "BI11-FULL":
bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:scores", "BioNLP Shared Task / " + fullTaskId, ["default"]) # the shared task evaluator is not designed for predicted entities
elif task == "REL11":
bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:evaluate:scores:a2Tag=rel", "BioNLP Shared Task / " + fullTaskId, ["default"])
elif task in ("BB_EVENT_16", "BB_EVENT_16-FULL", "BB_EVENT_NER_16", "SDB16"):
bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert=zip", "BioNLP Shared Task / " + fullTaskId, ["default"])
elif task not in ["DDI11", "DDI11-FULL", "DDI13T91", "DDI13T92", "DDI13-FULL", "DDI13", "CP17", "SEMEVAL10T8"]:
bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:evaluate:scores", "BioNLP Shared Task / " + fullTaskId, ["default"])
else:
bioNLPSTParams = "skip"
#######################################################################
# Preprocessing parameters
#######################################################################
if task in ["BI11", "BI11-FULL", "BB11", "DDI11", "DDI11-FULL", "DDI13T91", "DDI13T92", "DDI13-FULL", "DDI13"]:
Parameters.cat("intermediateFiles:omitSteps=NER,DIVIDE-SETS", preprocessorParams, "Preprocessor /" + fullTaskId, ["default"])
else: # parse only sentences where BANNER found an entity
Parameters.cat("intermediateFiles:omitSteps=DIVIDE-SETS:PARSE.requireEntities", preprocessorParams, "Preprocessor /" + fullTaskId, ["default"])
#######################################################################
# Example style parameters
#######################################################################
if not useKerasDetector:
# Example style parameters for single-stage tasks #####################
msg = "Single-stage example style / " + fullTaskId
if task == "REN11":
exampleStyles["examples"] = Parameters.cat("undirected:bacteria_renaming:maskTypeAsProtein=Gene", exampleStyles["examples"], msg)
elif task == "DDI11":
exampleStyles["examples"] = Parameters.cat("drugbank_features:ddi_mtmx:filter_shortest_path=conj_and", exampleStyles["examples"], msg)
elif task.startswith("DDI13"):
if task.endswith("T91"):
exampleStyles["examples"] = Parameters.cat("names:build_for_nameless:ddi13_features:drugbank_features", exampleStyles["examples"], msg)
elif task.endswith("T92") or task == "DDI13":
exampleStyles["examples"] = Parameters.cat("keep_neg:drugbank_features:filter_shortest_path=conj_and", exampleStyles["examples"], msg)
elif task == "BI11":
exampleStyles["examples"] = Parameters.cat("bi_features", exampleStyles["examples"], msg)
elif task == "BB_EVENT_16":
exampleStyles["examples"] = Parameters.cat("keep_neg", exampleStyles["examples"], msg) #exampleStyles["examples"] = Parameters.cat("linear_features:keep_neg", exampleStyles["examples"], msg)
elif task == "SDB16":
exampleStyles["examples"] = Parameters.cat("sdb_merge:sdb_features", exampleStyles["examples"], msg)
# Edge style ##########################################################
msg = "Edge example style / " + fullTaskId
if task in ["GE09", "GE11", "GE13"] and subTask == 1:
exampleStyles["edge"] = Parameters.cat("genia_features:genia_task1", exampleStyles["edge"], msg)
elif task in ["GE09", "GE11", "GE13"]:
exampleStyles["edge"] = Parameters.cat("genia_features", exampleStyles["edge"], msg)
elif task == "REL11":
exampleStyles["edge"] = Parameters.cat("rel_features", exampleStyles["edge"], msg)
elif task == "DDI11-FULL":
exampleStyles["edge"] = Parameters.cat("drugbank_features:filter_shortest_path=conj_and", exampleStyles["edge"], msg)
elif task == "DDI13-FULL":
exampleStyles["edge"] = Parameters.cat("keep_neg:drugbank_features:filter_shortest_path=conj_and", exampleStyles["edge"], msg)
elif task == "CO11":
exampleStyles["edge"] = Parameters.cat("co_features", exampleStyles["edge"], msg)
elif task == "BI11-FULL":
exampleStyles["edge"] = Parameters.cat("bi_features", exampleStyles["edge"], msg)
# Trigger style #######################################################
msg = "Trigger example style / " + fullTaskId
if task in ["GE09", "GE11", "GE13"] and subTask == 1:
exampleStyles["trigger"] = Parameters.cat("genia_task1", exampleStyles["trigger"], msg)
elif task in ["EPI11", "PC13"]:
exampleStyles["trigger"] = Parameters.cat("epi_merge_negated", exampleStyles["trigger"], msg)
elif task == "BB11": # "bb_features:build_for_nameless:wordnet"
exampleStyles["trigger"] = Parameters.cat("bb_features", exampleStyles["trigger"], msg)
elif task == "BB13T3": # "bb_features:build_for_nameless:wordnet"
exampleStyles["trigger"] = Parameters.cat("bb_features", exampleStyles["trigger"], msg)
elif task == "REL11":
exampleStyles["trigger"] = Parameters.cat("rel_features", exampleStyles["trigger"], msg)
elif task in ["BI11-FULL", "DDI11-FULL"]:
exampleStyles["trigger"] = "names:build_for_nameless"
elif task == "DDI13-FULL":
exampleStyles["trigger"] = "names:build_for_nameless:ddi13_features:drugbank_features"
elif task == "BB_EVENT_16-FULL":
exampleStyles["trigger"] = Parameters.cat("bb_spans:bb_features:ontobiotope_features:build_for_nameless:all_tokens:only_types=Bacteria,Habitat,Geographical", exampleStyles["trigger"], msg)
elif task in "BB_EVENT_NER_16":
exampleStyles["trigger"] = Parameters.cat("bb_spans:bb_features:ontobiotope_features:build_for_nameless:all_tokens", exampleStyles["trigger"], msg)
#######################################################################
# Classifier parameters
#######################################################################
if task == "DDI11":
classifierParameters["examples"] = Parameters.cat("c=10,100,1000,2500,4000,5000,6000,7500,10000,20000,25000,50000:TEES.threshold", classifierParameters["examples"], "Classifier parameters for single-stage examples" + fullTaskId)
#elif task == "DDI13":
# classifierParameters["examples"] = Parameters.cat("c=10,100,1000,2500,4000,5000,6000,7500,10000,20000,25000,50000:TEES.threshold", classifierParameters["examples"], "Classifier parameters for single-stage examples" + fullTaskId)
elif task == "CO11":
classifierParameters["edge"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", classifierParameters["edge"], "Classifier parameters for edges / " + fullTaskId)
classifierParameters["trigger"] = Parameters.cat("c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000", classifierParameters["trigger"], "Classifier parameters for triggers / " + fullTaskId)
classifierParameters["recall"] = Parameters.cat("0.8,0.9,0.95,1.0", classifierParameters["recall"], "Recall adjust / " + fullTaskId)
elif task == "BB_EVENT_16":
classifierParameters["examples"] = Parameters.cat("c=10,20,30,40,50,60,70,80,100,110,115,120,125,130,140,150,200,500,1000,2000,3000,4000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", classifierParameters["examples"], "Classifier parameters for edges / " + fullTaskId)
elif task in ("BB_EVENT_16-FULL", "BB_EVENT_NER_16"):
classifierParameters["edge"] = Parameters.cat("c=10,20,50,80,100,110,115,120,125,130,140,150,200,500,1000", classifierParameters["edge"], "Classifier parameters for edges / " + fullTaskId)
elif task == "SDB16":
classifierParameters["examples"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000,80000,100000,150000", classifierParameters["examples"], "Classifier parameters for single-stage examples / " + fullTaskId)
# Training fold parameters ############################################
if task.startswith("DDI13") and task != "DDI13":
#folds["devel"]=["train1", "train2", "train3", "train4"]
#folds["train"]=["train5", "train6", "train7", "train8", "train9"]
folds["devel"]=["train1", "train2", "train3"]
folds["train"]=["train4", "train5", "train6", "train7", "train8", "train9"]
return detector, bioNLPSTParams, preprocessorParams, folds
def getDefinedBool(string):
    """Convert an optparse-supplied value into a tri-state boolean.

    Accepts a value that is already a bool (returned unchanged), None
    (meaning the option is undefined, returned as None), or one of the
    strings "True"/"False" (converted to the corresponding bool).

    Raises AssertionError for any other value.
    """
    # NOTE: membership test compares by equality, so 1/0 also pass here
    # (bool is an int subclass); preserved for backward compatibility.
    if string in (True, False): # already defined
        return string
    assert string in (None, "True", "False") # undefined or needs to be converted to bool
    if string is None:
        return None
    return string == "True"
def getDefinedBoolOption(option, opt, value, parser):
    """optparse callback for tri-state boolean flags.

    A flag given without a value stores True; otherwise the value string
    is converted via getDefinedBool (None / True / False).
    """
    result = True if value is None else getDefinedBool(value)
    setattr(parser.values, option.dest, result)
if __name__=="__main__":
    # Command-line entry point: build the optparse option set, then delegate
    # everything to train() with the parsed values.
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"
    from optparse import OptionParser, OptionGroup
    optparser = OptionParser(description="Train a new event/relation extraction model")
    # main options
    group = OptionGroup(optparser, "Main Options", "")
    group.add_option("-t", "--task", default=None, dest="task", help="task number")
    group.add_option("-p", "--parse", default="McCC", dest="parse", help="Parse XML element name")
    group.add_option("-c", "--connection", default=None, dest="connection", help="")
    optparser.add_option_group(group)
    # input
    # Explicit corpus files override the task-derived defaults resolved inside train().
    group = OptionGroup(optparser, "Input Files", "If these are undefined, a task (-t) specific corpus file will be used")
    group.add_option("--trainFile", default=None, dest="trainFile", help="")
    group.add_option("--develFile", default=None, dest="develFile", help="")
    group.add_option("--testFile", default=None, dest="testFile", help="")
    group.add_option("--corpusDir", default=None, dest="corpusDir", help="Overrides the Settings.CORPUS_DIR value")
    group.add_option("--corpusPreprocess", default=None, dest="corpusPreprocess", help="Preprocessor steps for the corpus")
    optparser.add_option_group(group)
    # output
    group = OptionGroup(optparser, "Output Files", "Files created from training the detector")
    group.add_option("-o", "--output", default=None, dest="output", help="Output directory for intermediate files")
    group.add_option("--develModel", default="model-devel", dest="develModel", help="Model trained on 'trainFile', with parameters optimized on 'develFile'")
    group.add_option("--testModel", default="model-test", dest="testModel", help="Model trained on 'trainFile'+'develFile', with parameters from 'develModel'")
    optparser.add_option_group(group)
    # Example builders
    group = OptionGroup(optparser, "Detector to train", "")
    group.add_option("--detector", default=None, dest="detector", help="the detector class to use")
    group.add_option("--evaluator", default=None, dest="evaluator", help="change the detector's default evaluator")
    #group.add_option("--singleStage", default=False, action="store_true", dest="singleStage", help="'detector' is a single stage detector")
    #group.add_option("--noBioNLPSTFormat", default=False, action="store_true", dest="noBioNLPSTFormat", help="Do not output BioNLP Shared Task format version (a1, a2, txt)")
    group.add_option("--bioNLPSTParams", default=None, dest="bioNLPSTParams", help="")
    group.add_option("--preprocessorParams", default=None, dest="preprocessorParams", help="")
    optparser.add_option_group(group)
    # Example builder parameters
    # Two parallel groups: 'event' for the multi-stage event detector,
    # 'single' for the single-stage detector; both are attached below.
    event = OptionGroup(optparser, "Event Detector Options (used when not using '--singleStage')", "")
    single = OptionGroup(optparser, "Single Stage Detector Options (used when using '--singleStage')", "")
    single.add_option("--exampleStyle", default=None, dest="exampleStyle", help="Single-stage detector example style")
    # -u and -m are tri-state (None/True/False) via the getDefinedBoolOption callback.
    event.add_option("-u", "--unmerging", default=None, action="callback", callback=getDefinedBoolOption, dest="unmerging", type="str", help="SVM unmerging")
    event.add_option("-m", "--modifiers", default=None, action="callback", callback=getDefinedBoolOption, dest="modifiers", type="str", help="Train model for modifier detection")
    event.add_option("--triggerStyle", default=None, dest="triggerStyle", help="Event detector trigger example style")
    event.add_option("--edgeStyle", default=None, dest="edgeStyle", help="Event detector edge example style")
    event.add_option("--unmergingStyle", default=None, dest="unmergingStyle", help="Event detector unmerging example style")
    event.add_option("--modifierStyle", default=None, dest="modifierStyle", help="Event detector modifier example style")
    # Classifier parameters
    single.add_option("-e", "--exampleParams", default=None, dest="exampleParams", help="Single-stage detector parameters (or general multi-stage parameters)")
    event.add_option("-r", "--triggerParams", default=None, dest="triggerParams", help="Trigger detector c-parameter values")
    event.add_option("-a", "--recallAdjustParams", default=None, dest="recallAdjustParams", help="Recall adjuster parameter values")
    event.add_option("-d", "--edgeParams", default=None, dest="edgeParams", help="Edge detector c-parameter values")
    event.add_option("-n", "--unmergingParams", default=None, dest="unmergingParams", help="Unmerging c-parameter values")
    event.add_option("-f", "--modifierParams", default=None, dest="modifierParams", help="Modifier c-parameter values")
    event.add_option("--fullGrid", default=False, action="store_true", dest="fullGrid", help="Full grid search for parameters")
    optparser.add_option_group(single)
    optparser.add_option_group(event)
    # Debugging and process control
    debug = OptionGroup(optparser, "Debug and Process Control Options", "")
    debug.add_option("--step", default=None, dest="step", help="Step to start processing from, with optional substep (STEP=SUBSTEP). Step values are TRAIN, DEVEL, EMPTY and TEST.")
    debug.add_option("--omitSteps", default=None, dest="omitSteps", help="")
    debug.add_option("--copyFrom", default=None, dest="copyFrom", help="Copy this directory as template")
    debug.add_option("--log", default="log.txt", dest="log", help="Log file name")
    debug.add_option("--noLog", default=False, action="store_true", dest="noLog", help="Do not keep a log file")
    debug.add_option("--clearAll", default=False, action="store_true", dest="clearAll", help="Delete all files")
    debug.add_option("--debug", default=False, action="store_true", dest="debug", help="More verbose output")
    # NOTE(review): the next two are added to the 'event' group although they sit in
    # the debug section, so they show under "Event Detector Options" in --help —
    # possibly intended to be debug.add_option; confirm before changing.
    event.add_option("--subset", default=None, dest="subset", help="")
    event.add_option("--folds", default=None, dest="folds", help="")
    optparser.add_option_group(debug)
    (options, args) = optparser.parse_args()
    # The literal string "None" disables the test model entirely.
    if options.testModel == "None":
        options.testModel = None
    assert options.output != None
    # --noLog overrides any --log filename.
    if options.noLog: options.log = None
    train(options.output, options.task, options.detector,
          inputFiles={"devel":options.develFile, "train":options.trainFile, "test":options.testFile},
          models={"devel":options.develModel, "test":options.testModel}, parse=options.parse,
          processUnmerging=options.unmerging, processModifiers=options.modifiers,
          bioNLPSTParams=options.bioNLPSTParams, preprocessorParams=options.preprocessorParams,
          exampleStyles={"examples":options.exampleStyle, "trigger":options.triggerStyle, "edge":options.edgeStyle, "unmerging":options.unmergingStyle, "modifiers":options.modifierStyle},
          classifierParams={"examples":options.exampleParams, "trigger":options.triggerParams, "recall":options.recallAdjustParams, "edge":options.edgeParams, "unmerging":options.unmergingParams, "modifiers":options.modifierParams},
          doFullGrid=options.fullGrid, deleteOutput=options.clearAll, copyFrom=options.copyFrom,
          log=options.log, step=options.step, omitSteps=options.omitSteps, debug=options.debug,
          connection=options.connection, subset=options.subset, folds=options.folds, corpusDir=options.corpusDir, corpusPreprocessing=options.corpusPreprocess,
          evaluator=options.evaluator)