/
mitie.py
881 lines (695 loc) · 37.4 KB
/
mitie.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
import ctypes
import os
import platform
def _last_modified_time(filename):
    """Return the modification time of filename, or 0 if it is not an existing regular file.

    The timestamp is whatever os.path.getmtime() reports.  Directories and
    missing paths both yield 0.
    """
    return os.path.getmtime(filename) if os.path.isfile(filename) else 0
# Detect the interpreter generation by probing for the Python 2 only builtin
# xrange.  On Python 3 the name is aliased to range so the rest of the module
# can use xrange unconditionally.
try:
    xrange
except NameError:  # Py3
    PY3 = True
    xrange = range
else:
    PY3 = False
# Locate and load the MITIE shared library.  Several candidate locations are
# examined and the candidate with the newest modification time is loaded.
# Candidates that do not exist get a timestamp of 0, so if none exist the first
# entry of the list is the one handed to ctypes.
parent = os.path.dirname(os.path.realpath(__file__))
if os.name == 'nt':
    # On windows only look next to mitie.py and in the win32/win64 subfolder
    # matching the bitness of the running python.  The ".dll" suffix is only
    # used for the timestamp lookup; ctypes is given the name without it.
    arch = platform.architecture()
    _arch_candidate = '/win32/mitie' if arch[0] == "32bit" else '/win64/mitie'
    files = [parent + '/mitie', parent + _arch_candidate]
    _timestamp_suffix = ".dll"
else:
    # On UNIX like platforms MITIE might be in any number of places.
    files = [
        parent + '/libmitie.so',
        'libmitie.so',
        'libmitie.dylib',
        parent + '/libmitie.dylib',
        '/usr/local/lib/libmitie.so',
        '/usr/local/lib/libmitie.dylib',
        parent + "/build/libmitie.dylib",
    ]
    _timestamp_suffix = ""
times = [(_last_modified_time(f + _timestamp_suffix), f) for f in files]
most_recent = max(times, key=lambda entry: entry[0])[1]
_f = ctypes.CDLL(most_recent)
# Declare the C signature (restype / argtypes) of every MITIE function used by the
# loading, tokenizing-independent and extraction code below, so that ctypes converts
# arguments and return values correctly.  A trailing comma after a single type makes
# argtypes a one-element tuple.  Functions declared with restype c_void_p hand back
# None when the C side returns NULL, which is what the "is None" checks rely on.
# mitie_free returns nothing (restype None) and takes the pointer to release.
_f.mitie_free.restype = None
_f.mitie_free.argtypes = ctypes.c_void_p,
# Tag name lookup: (extractor pointer, tag index) -> C string.
_f.mitie_get_named_entity_tagstr.restype = ctypes.c_char_p
_f.mitie_get_named_entity_tagstr.argtypes = ctypes.c_void_p, ctypes.c_ulong
_f.mitie_get_num_possible_ner_tags.restype = ctypes.c_ulong
_f.mitie_get_num_possible_ner_tags.argtypes = ctypes.c_void_p,
# Entity extraction: (extractor pointer, token array[, feature extractor pointer]) -> detections pointer.
_f.mitie_extract_entities.restype = ctypes.c_void_p
_f.mitie_extract_entities.argtypes = ctypes.c_void_p, ctypes.c_void_p
_f.mitie_extract_entities_with_extractor.restype = ctypes.c_void_p
_f.mitie_extract_entities_with_extractor.argtypes = ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p
# Model loading.  The callers below treat a 0 result from the check function as "pure model".
_f.mitie_check_ner_pure_model.restype = ctypes.c_int
_f.mitie_check_ner_pure_model.argtypes = ctypes.c_char_p,
_f.mitie_load_named_entity_extractor.restype = ctypes.c_void_p
_f.mitie_load_named_entity_extractor.argtypes = ctypes.c_char_p,
_f.mitie_load_named_entity_extractor_pure_model.restype = ctypes.c_void_p
_f.mitie_load_named_entity_extractor_pure_model.argtypes = ctypes.c_char_p, ctypes.c_char_p
_f.mitie_load_named_entity_extractor_pure_model_without_feature_extractor.restype = ctypes.c_void_p
_f.mitie_load_named_entity_extractor_pure_model_without_feature_extractor.argtypes = ctypes.c_char_p,
# Declared as c_void_p (not c_char_p) so the raw pointer stays available for mitie_free.
_f.mitie_load_entire_file.restype = ctypes.c_void_p
_f.mitie_load_entire_file.argtypes = ctypes.c_char_p,
# Accessors for one detection: (detections pointer, detection index).
_f.mitie_ner_get_detection_position.restype = ctypes.c_ulong
_f.mitie_ner_get_detection_position.argtypes = ctypes.c_void_p, ctypes.c_ulong
_f.mitie_ner_get_detection_length.restype = ctypes.c_ulong
_f.mitie_ner_get_detection_length.argtypes = ctypes.c_void_p, ctypes.c_ulong
_f.mitie_ner_get_detection_tag.restype = ctypes.c_ulong
_f.mitie_ner_get_detection_tag.argtypes = ctypes.c_void_p, ctypes.c_ulong
_f.mitie_ner_get_detection_score.restype = ctypes.c_double
_f.mitie_ner_get_detection_score.argtypes = ctypes.c_void_p, ctypes.c_ulong
_f.mitie_ner_get_num_detections.restype = ctypes.c_ulong
_f.mitie_ner_get_num_detections.argtypes = ctypes.c_void_p,
# Overlap test on two spans given as (start, length, start, length); callers compare the result to 1.
_f.mitie_entities_overlap.restype = ctypes.c_int
_f.mitie_entities_overlap.argtypes = ctypes.c_ulong, ctypes.c_ulong, ctypes.c_ulong, ctypes.c_ulong
# Saving: (filename, extractor pointer); callers treat a non-zero result as failure.
_f.mitie_save_named_entity_extractor.restype = ctypes.c_int
_f.mitie_save_named_entity_extractor.argtypes = ctypes.c_char_p, ctypes.c_void_p
_f.mitie_save_named_entity_extractor_pure_model.restype = ctypes.c_int
_f.mitie_save_named_entity_extractor_pure_model.argtypes = ctypes.c_char_p, ctypes.c_void_p
# Relation extraction: (extractor pointer, token array, arg1 start, arg1 length, arg2 start, arg2 length).
_f.mitie_extract_binary_relation.restype = ctypes.c_void_p
_f.mitie_extract_binary_relation.argtypes = (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_ulong,
ctypes.c_ulong, ctypes.c_ulong, ctypes.c_ulong)
def to_bytes(string):
    """Encode the string in utf-8.  If the string is already encoded (bytes in Python 3
    or str in Python 2), return the string unmodified.

    Objects without an encode() method (bytes in Python 3, None, ...) are passed
    through untouched.  A string that cannot be encoded is also returned as-is
    rather than raising.
    """
    if hasattr(string, 'encode'):
        try:
            return string.encode('utf-8')
        except UnicodeError:
            # Best effort: e.g. a Python 2 byte string holding non-ASCII data or text
            # with unencodable code points.  Only encoding failures are tolerated here;
            # unrelated exceptions such as KeyboardInterrupt are no longer swallowed.
            pass
    return string
def to_default_str_type(string):
    """Return string as the interpreter's default str type.

    On Python 3 anything with a decode() method is decoded from utf-8; on
    Python 2 anything with an encode() method is encoded to utf-8.  Every
    other object is returned unchanged.
    """
    if PY3 and hasattr(string, 'decode'):
        return string.decode('utf-8')
    if not PY3 and hasattr(string, 'encode'):
        return string.encode('utf-8')
    return string
def _get_windowed_range(tokens, arg1, arg2):
    """Return an xrange covering both arg1 and arg2 plus up to 5 extra tokens on
    each side, clipped so that it never leaves the bounds of the list tokens."""
    winsize = 5
    lowest = min(min(arg1), min(arg2))
    one_past_highest = max(max(arg1), max(arg2)) + 1
    # Clamp the widened window to [0, len(tokens)].
    window_begin = max(lowest - winsize, 0)
    window_end = min(one_past_highest + winsize, len(tokens))
    return xrange(window_begin, window_end)
def python_to_mitie_str_array(tokens, r=None):
    """Build a NULL terminated ctypes array of C strings (MITIE's char** input type)
    from a Python list of strings.

    The array is a ctypes object, so its memory is managed by Python and does
    not need to be freed by the user.

    r selects which indices of tokens to convert and defaults to
    xrange(len(tokens)), i.e. all of them.  An element of tokens may also be a
    tuple, in which case its first item is used as the token text.
    """
    if r is None:
        r = xrange(len(tokens))
    count = len(r)
    ctokens = (ctypes.c_char_p * (count + 1))()
    for slot, index in enumerate(r):
        token = tokens[index]
        text = token[0] if isinstance(token, tuple) else token
        ctokens[slot] = to_bytes(text)
    # Explicit NULL terminator in the extra trailing slot.
    ctokens[count] = None
    return ctokens
def _range_is_valid(l, range):
    """Return True when every element of range is a valid index into the list l.

    range must be non-empty; min() and max() are applied to it directly.
    """
    if min(range) < 0:
        return False
    return max(range) < len(l)
def load_entire_file(filename):
    """Read filename through the MITIE library and return its contents.

    The buffer handed back by the library is copied with ctypes.string_at()
    (which stops at the first NUL byte) and then released with mitie_free.
    Raises Exception if the library reports that the file could not be loaded.
    """
    encoded_name = to_bytes(filename)
    buffer_ptr = _f.mitie_load_entire_file(encoded_name)
    if buffer_ptr is None:
        raise Exception("Unable to load file " + to_default_str_type(encoded_name))
    contents = ctypes.string_at(buffer_ptr)
    _f.mitie_free(buffer_ptr)
    return contents
def tokenize(string):
    """Split string into tokens and return them as a list.

    The tokens are returned exactly as ctypes hands them back from the C
    library (byte strings on Python 3).
    Raises Exception if the library returns NULL.
    """
    mitie_tokenize = _f.mitie_tokenize
    mitie_tokenize.restype = ctypes.POINTER(ctypes.c_char_p)
    mitie_tokenize.argtypes = ctypes.c_char_p,
    tok = mitie_tokenize(to_bytes(string))
    # With a POINTER(...) restype a NULL result is a falsy pointer instance and is
    # never None, so the failure test has to use truthiness.
    if not tok:
        raise Exception("Unable to tokenize string.")
    try:
        res = []
        i = 0
        # The returned array is terminated by a NULL entry, which reads back as None.
        while tok[i] is not None:
            res.append(tok[i])
            i += 1
        return res
    finally:
        _f.mitie_free(tok)
def tokenize_with_offsets(string):
    """Split string into tokens and return them as a list.  Also, each element of the list
    contains a tuple of the token text and the byte offset which indicates the position of the
    first character in the token within the input string.

    Raises Exception if the library returns NULL.
    """
    mitie_tokenize = _f.mitie_tokenize_with_offsets
    mitie_tokenize.restype = ctypes.POINTER(ctypes.c_char_p)
    mitie_tokenize.argtypes = ctypes.c_char_p, ctypes.POINTER(ctypes.POINTER(ctypes.c_ulong))
    # Out-parameter: the library stores a pointer to its offsets array here.
    token_offsets = ctypes.POINTER(ctypes.c_ulong)()
    tok = mitie_tokenize(to_bytes(string), ctypes.byref(token_offsets))
    # With a POINTER(...) restype a NULL result is a falsy pointer instance and is
    # never None, so the failure test has to use truthiness.
    if not tok:
        raise Exception("Unable to tokenize string.")
    try:
        res = []
        i = 0
        # The token array is terminated by a NULL entry, which reads back as None.
        while tok[i] is not None:
            res.append((tok[i], token_offsets[i]))
            i += 1
        return res
    finally:
        _f.mitie_free(tok)
        _f.mitie_free(token_offsets)
class named_entity_extractor:
    """Python handle for a MITIE named entity extractor.

    The instance owns the underlying C object and releases it with mitie_free
    when the Python object is destroyed.
    """

    def __init__(self, filename, fe_filename=None):
        """Load an extractor from disk or wrap an already created C object.

        filename: path of a saved model.  The training tools may instead pass a
            ctypes.c_void_p that points to an extractor object ("private"
            constructor).
        fe_filename: path of the feature extractor file.  Only used when
            filename turns out to be a pure model, i.e. one saved without its
            feature extractor.

        Raises Exception if the model cannot be loaded or the pointer is NULL.
        """
        # mitie_free is kept on the instance; presumably so that __del__ does not
        # depend on the module global _f still being around at teardown.
        self.__mitie_free = _f.mitie_free
        # Defined up front so __del__ is safe even when loading raises below.
        self.__obj = None
        if isinstance(filename, ctypes.c_void_p):
            # If we get here then it means we are using the "private" constructor used by
            # the training tools to create a named_entity_extractor.  In this case,
            # filename is a pointer to a ner object.
            if not filename:
                raise Exception("Unable to create named_entity_extractor from a NULL pointer.")
            self.__obj = filename
            return
        filename = to_bytes(filename)
        if _f.mitie_check_ner_pure_model(filename) == 0:
            if fe_filename is None:
                self.__obj = _f.mitie_load_named_entity_extractor_pure_model_without_feature_extractor(filename)
            else:
                # The C function is declared with c_char_p arguments, so the feature
                # extractor path needs the same bytes conversion as filename.
                fe_filename = to_bytes(fe_filename)
                self.__obj = _f.mitie_load_named_entity_extractor_pure_model(filename, fe_filename)
        else:
            self.__obj = _f.mitie_load_named_entity_extractor(filename)
        if self.__obj is None:
            raise Exception("Unable to load named entity extractor from " + to_default_str_type(filename))

    def __del__(self):
        # Nothing to release when construction failed before an object was obtained.
        if self.__obj is not None:
            self.__mitie_free(self.__obj)

    @property
    def _obj(self):
        """The raw pointer to the underlying C object."""
        return self.__obj

    def get_possible_ner_tags(self):
        """Return the list of tag names this extractor can produce."""
        num = _f.mitie_get_num_possible_ner_tags(self.__obj)
        return [to_default_str_type(_f.mitie_get_named_entity_tagstr(self.__obj, i)) for i in xrange(num)]

    def save_to_disk(self, filename, pure_model=False):
        """Save this object to disk.  You recall it from disk with the following Python
        code:
            ner = named_entity_extractor(filename)
        If you saved with pure_model==True, the saved file will NOT include a serialised feature extractor object.
        This makes the file much smaller, but when you want to read from disk you also have to pass
        the name of the feature extractor file you used when training the model, e.g.:
            ner = named_entity_extractor(filename,fe_filename)

        Raises Exception if the library reports a failure.
        """
        filename = to_bytes(filename)
        if pure_model:
            status = _f.mitie_save_named_entity_extractor_pure_model(filename, self.__obj)
        else:
            status = _f.mitie_save_named_entity_extractor(filename, self.__obj)
        if status != 0:
            raise Exception("Unable to save named_entity_extractor to the file " + to_default_str_type(filename))

    def extract_entities(self, tokens, feature_extractor=None):
        """Run the extractor over tokens.

        Returns a list of (range, tag, score) tuples, one per detection, where
        range is an xrange over the token indices of the entity, tag is the tag
        name and score is the detection score.  If feature_extractor is a
        total_word_feature_extractor it is handed to the library as well.

        Raises Exception if the library fails to produce detections.
        """
        tags = self.get_possible_ner_tags()
        ctokens = python_to_mitie_str_array(tokens)
        # Now extract the entities and return the results
        if feature_extractor is not None and isinstance(feature_extractor, total_word_feature_extractor):
            dets = _f.mitie_extract_entities_with_extractor(self.__obj, ctokens, feature_extractor._obj)
        else:
            dets = _f.mitie_extract_entities(self.__obj, ctokens)
        if dets is None:
            raise Exception("Unable to create entity detections.")
        try:
            results = []
            for i in xrange(_f.mitie_ner_get_num_detections(dets)):
                start = _f.mitie_ner_get_detection_position(dets, i)
                length = _f.mitie_ner_get_detection_length(dets, i)
                # tags already holds default-type strings, no further conversion needed.
                tag = tags[_f.mitie_ner_get_detection_tag(dets, i)]
                score = _f.mitie_ner_get_detection_score(dets, i)
                results.append((xrange(start, start + length), tag, score))
            return results
        finally:
            # Release the detections even if reading them back failed.
            _f.mitie_free(dets)

    def extract_binary_relation(self, tokens, arg1, arg2):
        """
        requires
            - arg1 and arg2 are range objects and they don't go outside the
              range xrange(len(tokens)).
            - arg1 and arg2 do not overlap
        ensures
            - returns a processed binary relation that describes the relation
              given by the two relation argument positions arg1 and arg2.  You
              can pass the returned object to a binary_relation_detector to see
              if it is an instance of a known relation type.
        raises
            - Exception if a range is empty or leaves tokens, if the ranges
              overlap, or if the library cannot create the relation."""
        if len(arg1) == 0 or len(arg2) == 0 or not _range_is_valid(tokens, arg1) or not _range_is_valid(tokens, arg2):
            raise Exception("One of the ranges given to this function was invalid.")
        arg1_start = min(arg1)
        arg1_length = len(arg1)
        arg2_start = min(arg2)
        arg2_length = len(arg2)
        if _f.mitie_entities_overlap(arg1_start, arg1_length, arg2_start, arg2_length) == 1:
            # str() is required: the ranges cannot be concatenated to a string directly.
            raise Exception("Error, extract_binary_relation() called with overlapping entities: "
                            + str(arg1) + ", " + str(arg2))
        # we are going to crop out a window of tokens around the entities
        r = _get_windowed_range(tokens, arg1, arg2)
        # Re-base the entity positions onto the cropped window.
        arg1_start -= min(r)
        arg2_start -= min(r)
        ctokens = python_to_mitie_str_array(tokens, r)
        rel = _f.mitie_extract_binary_relation(self.__obj, ctokens, arg1_start, arg1_length, arg2_start, arg2_length)
        if rel is None:
            raise Exception("Unable to create binary relation.")
        return binary_relation(rel)
####################################################################################################
# C signatures of the binary relation detector functions used by the
# binary_relation_detector class below.
_f.mitie_load_binary_relation_detector.restype = ctypes.c_void_p
_f.mitie_load_binary_relation_detector.argtypes = ctypes.c_char_p,
_f.mitie_binary_relation_detector_name_string.restype = ctypes.c_char_p
_f.mitie_binary_relation_detector_name_string.argtypes = ctypes.c_void_p,
# (detector pointer, relation pointer, out: score); the caller treats non-zero as failure.
_f.mitie_classify_binary_relation.restype = ctypes.c_int
_f.mitie_classify_binary_relation.argtypes = ctypes.c_void_p, ctypes.c_void_p, ctypes.POINTER(ctypes.c_double)
# (filename, detector pointer); the caller treats non-zero as failure.
_f.mitie_save_binary_relation_detector.restype = ctypes.c_int
_f.mitie_save_binary_relation_detector.argtypes = ctypes.c_char_p, ctypes.c_void_p
class binary_relation:
    """Owns a relation object created by the MITIE library.

    The wrapped pointer is released with mitie_free when the Python object is
    destroyed.
    """

    def __init__(self, obj):
        self.__mitie_free = _f.mitie_free
        self.__obj = obj

    def __del__(self):
        self.__mitie_free(self.__obj)

    @property
    def _obj(self):
        """The raw pointer to the underlying C object."""
        return self.__obj
class binary_relation_detector:
    """Python handle for a MITIE binary relation detector.

    The wrapped C object is released with mitie_free when the Python object is
    destroyed.
    """

    def __init__(self, filename):
        """Load a detector from filename, or wrap the ctypes.c_void_p passed in its place."""
        self.__mitie_free = _f.mitie_free
        if not isinstance(filename, ctypes.c_void_p):
            filename = to_bytes(filename)
            self.__obj = _f.mitie_load_binary_relation_detector(filename)
        else:
            # "Private" constructor used by the training tools: filename is
            # already a pointer to a detector object.
            self.__obj = filename
        if self.__obj is None:
            raise Exception("Unable to load binary relation detector from " + to_default_str_type(filename))

    def __del__(self):
        self.__mitie_free(self.__obj)

    def save_to_disk(self, filename):
        """Write this detector to filename.  It can be read back with

            ner = binary_relation_detector(filename)

        Raises Exception if the library reports a failure."""
        filename = to_bytes(filename)
        status = _f.mitie_save_binary_relation_detector(filename, self.__obj)
        if status != 0:
            raise Exception("Unable to save binary_relation_detector to the file " + to_default_str_type(filename))

    def __str__(self):
        name = to_default_str_type(_f.mitie_binary_relation_detector_name_string(self.__obj))
        return "binary_relation_detector: " + name

    def __repr__(self):
        name = to_default_str_type(_f.mitie_binary_relation_detector_name_string(self.__obj))
        return "<binary_relation_detector: " + name + ">"

    @property
    def name_string(self):
        """The detector's name string as reported by the library."""
        raw_name = _f.mitie_binary_relation_detector_name_string(self.__obj)
        return to_default_str_type(raw_name)

    def __call__(self, relation):
        """Score a relation object produced by
        named_entity_extractor.extract_binary_relation().  A returned score > 0
        means the detector considers the input a true instance of the relation
        type it detects.

        Raises Exception if the library cannot classify the relation."""
        score = ctypes.c_double()
        status = _f.mitie_classify_binary_relation(self.__obj, relation._obj, ctypes.byref(score))
        if status != 0:
            raise Exception("Unable to classify binary relation. "
                            "The detector is incompatible with the NER object used for extraction.")
        return score.value
##############################################################################
#### TRAINING API ###
##############################################################################
# C signatures of the NER training functions used by ner_training_instance and
# ner_trainer below.  Functions returning c_int are treated by their callers as
# status codes (non-zero means failure), except mitie_overlaps_any_entity whose
# result is compared to 1.
# (instance pointer, entity start, entity length, label)
_f.mitie_add_ner_training_entity.restype = ctypes.c_int
_f.mitie_add_ner_training_entity.argtypes = ctypes.c_void_p, ctypes.c_ulong, ctypes.c_ulong, ctypes.c_char_p
# (trainer pointer, instance pointer)
_f.mitie_add_ner_training_instance.restype = ctypes.c_int
_f.mitie_add_ner_training_instance.argtypes = ctypes.c_void_p, ctypes.c_void_p
_f.mitie_create_ner_trainer.restype = ctypes.c_void_p
_f.mitie_create_ner_trainer.argtypes = ctypes.c_char_p,
# Takes the NULL terminated token array built by python_to_mitie_str_array().
_f.mitie_create_ner_training_instance.restype = ctypes.c_void_p
_f.mitie_create_ner_training_instance.argtypes = ctypes.c_void_p,
_f.mitie_ner_trainer_get_beta.restype = ctypes.c_double
_f.mitie_ner_trainer_get_beta.argtypes = ctypes.c_void_p,
_f.mitie_ner_trainer_get_num_threads.restype = ctypes.c_ulong
_f.mitie_ner_trainer_get_num_threads.argtypes = ctypes.c_void_p,
_f.mitie_ner_trainer_set_beta.restype = None
_f.mitie_ner_trainer_set_beta.argtypes = ctypes.c_void_p, ctypes.c_double
_f.mitie_ner_trainer_set_num_threads.restype = None
_f.mitie_ner_trainer_set_num_threads.argtypes = ctypes.c_void_p, ctypes.c_ulong
_f.mitie_ner_trainer_size.restype = ctypes.c_ulong
_f.mitie_ner_trainer_size.argtypes = ctypes.c_void_p,
_f.mitie_ner_training_instance_num_entities.restype = ctypes.c_ulong
_f.mitie_ner_training_instance_num_entities.argtypes = ctypes.c_void_p,
_f.mitie_ner_training_instance_num_tokens.restype = ctypes.c_ulong
_f.mitie_ner_training_instance_num_tokens.argtypes = ctypes.c_void_p,
# (instance pointer, range start, range length)
_f.mitie_overlaps_any_entity.restype = ctypes.c_int
_f.mitie_overlaps_any_entity.argtypes = ctypes.c_void_p, ctypes.c_ulong, ctypes.c_ulong
# Returns a pointer to the trained extractor; ctypes yields None for NULL.
_f.mitie_train_named_entity_extractor.restype = ctypes.c_void_p
_f.mitie_train_named_entity_extractor.argtypes = ctypes.c_void_p,
class ner_training_instance:
    """One tokenized training sample for the NER trainer together with its
    labeled entities.

    The wrapped C object is released with mitie_free when the Python object is
    destroyed.
    """

    def __init__(self, tokens):
        """Create a training instance from a list of tokens.

        Raises Exception if the library cannot create the instance."""
        self.__mitie_free = _f.mitie_free
        # Defined up front so __del__ is safe even when creation raises below.
        self.__obj = None
        self.__obj = _f.mitie_create_ner_training_instance(python_to_mitie_str_array(tokens))
        if self.__obj is None:
            raise Exception("Unable to create ner_training_instance. Probably ran out of RAM.")

    def __del__(self):
        # Nothing to release when construction failed before an object was obtained.
        if self.__obj is not None:
            self.__mitie_free(self.__obj)

    @property
    def _obj(self):
        """The raw pointer to the underlying C object."""
        return self.__obj

    @property
    def num_tokens(self):
        """Number of tokens in this instance, as reported by the library."""
        return _f.mitie_ner_training_instance_num_tokens(self.__obj)

    @property
    def num_entities(self):
        """Number of entities in this instance, as reported by the library."""
        return _f.mitie_ner_training_instance_num_entities(self.__obj)

    def overlaps_any_entity(self, range):
        """Takes a xrange and reports if the range overlaps any entities already in this object.

        Raises Exception if range is empty, starts below 0 or reaches past the
        last token."""
        # A negative start must be rejected here: the C function takes unsigned
        # arguments, so it cannot be handed over meaningfully.
        if len(range) == 0 or min(range) < 0 or max(range) >= self.num_tokens:
            raise Exception("Invalid range given to ner_training_instance.overlaps_any_entity()")
        return _f.mitie_overlaps_any_entity(self.__obj, min(range), len(range)) == 1

    def add_entity(self, range, label):
        """Mark the tokens selected by range as an entity with the given label.

        Raises Exception if range is empty or outside the tokens, if it overlaps
        an entity added earlier, or if the library fails to add the entity."""
        if len(range) == 0 or max(range) >= self.num_tokens or min(range) < 0:
            raise Exception("Invalid range given to ner_training_instance.add_entity()")
        if self.overlaps_any_entity(range):
            raise Exception("Invalid range given to ner_training_instance.add_entity(). "
                            "It overlaps an entity given to a previous call to add_entity().")
        label = to_bytes(label)
        if _f.mitie_add_ner_training_entity(self.__obj, min(range), len(range), label) != 0:
            raise Exception("Unable to add entity to training instance. Probably ran out of RAM.")
class ner_trainer(object):
    """Collects ner_training_instance objects and trains a named_entity_extractor
    from them.

    The wrapped C object is released with mitie_free when the Python object is
    destroyed.
    """

    def __init__(self, filename):
        """Create a trainer from the file named filename.

        Raises Exception if the library cannot create the trainer."""
        self.__mitie_free = _f.mitie_free
        # Defined up front so __del__ is safe even when creation raises below.
        self.__obj = None
        filename = to_bytes(filename)
        self.__obj = _f.mitie_create_ner_trainer(filename)
        if self.__obj is None:
            raise Exception("Unable to create ner_trainer based on " + to_default_str_type(filename))

    def __del__(self):
        # Nothing to release when construction failed before an object was obtained.
        if self.__obj is not None:
            self.__mitie_free(self.__obj)

    @property
    def size(self):
        """Size of the trainer as reported by the library; train() requires it to be non-zero."""
        return _f.mitie_ner_trainer_size(self.__obj)

    def add(self, instance):
        """Add a ner_training_instance.  Raises Exception if the library reports a failure."""
        if _f.mitie_add_ner_training_instance(self.__obj, instance._obj) != 0:
            raise Exception("Unable to add training instance to ner_trainer. Probably ran out of RAM.")

    @property
    def beta(self):
        """The trainer's beta parameter.  Must not be set to a negative value."""
        return _f.mitie_ner_trainer_get_beta(self.__obj)

    @beta.setter
    def beta(self, value):
        if value < 0:
            raise Exception("Invalid beta value given. beta can't be negative.")
        _f.mitie_ner_trainer_set_beta(self.__obj, value)

    @property
    def num_threads(self):
        """The trainer's thread count setting."""
        return _f.mitie_ner_trainer_get_num_threads(self.__obj)

    @num_threads.setter
    def num_threads(self, value):
        _f.mitie_ner_trainer_set_num_threads(self.__obj, value)

    def train(self):
        """Train and return a named_entity_extractor.

        Raises Exception if the trainer is empty or the library returns NULL."""
        if self.size == 0:
            raise Exception("You can't call train() on an empty trainer.")
        # Test the raw result: ctypes reports a NULL c_void_p return as None, while a
        # c_void_p instance wrapped around it would never compare identical to None.
        trained = _f.mitie_train_named_entity_extractor(self.__obj)
        if trained is None:
            raise Exception("Unable to create named_entity_extractor. Probably ran out of RAM")
        # Make the type be a c_void_p so the named_entity_extractor constructor will know what to do.
        return named_entity_extractor(ctypes.c_void_p(trained))
##############################################################################
# C signatures of the binary relation training functions used by
# binary_relation_detector_trainer below.
# (relation name, named entity extractor pointer)
_f.mitie_create_binary_relation_trainer.restype = ctypes.c_void_p
_f.mitie_create_binary_relation_trainer.argtypes = ctypes.c_char_p, ctypes.c_void_p
_f.mitie_binary_relation_trainer_num_positive_examples.restype = ctypes.c_ulong
_f.mitie_binary_relation_trainer_num_positive_examples.argtypes = ctypes.c_void_p,
_f.mitie_binary_relation_trainer_num_negative_examples.restype = ctypes.c_ulong
_f.mitie_binary_relation_trainer_num_negative_examples.argtypes = ctypes.c_void_p,
# (trainer pointer, token array, arg1 start, arg1 length, arg2 start, arg2 length);
# callers treat a non-zero result as failure.
_f.mitie_add_positive_binary_relation.restype = ctypes.c_int
_f.mitie_add_positive_binary_relation.argtypes = (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_ulong,
ctypes.c_ulong, ctypes.c_ulong, ctypes.c_ulong)
_f.mitie_add_negative_binary_relation.restype = ctypes.c_int
_f.mitie_add_negative_binary_relation.argtypes = (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_ulong,
ctypes.c_ulong, ctypes.c_ulong, ctypes.c_ulong)
_f.mitie_binary_relation_trainer_get_beta.restype = ctypes.c_double
_f.mitie_binary_relation_trainer_get_beta.argtypes = ctypes.c_void_p,
_f.mitie_binary_relation_trainer_get_num_threads.restype = ctypes.c_ulong
_f.mitie_binary_relation_trainer_get_num_threads.argtypes = ctypes.c_void_p,
_f.mitie_binary_relation_trainer_set_beta.restype = None
_f.mitie_binary_relation_trainer_set_beta.argtypes = ctypes.c_void_p, ctypes.c_double
_f.mitie_binary_relation_trainer_set_num_threads.restype = None
_f.mitie_binary_relation_trainer_set_num_threads.argtypes = ctypes.c_void_p, ctypes.c_ulong
# Returns a pointer to the trained detector; ctypes yields None for NULL.
_f.mitie_train_binary_relation_detector.restype = ctypes.c_void_p
_f.mitie_train_binary_relation_detector.argtypes = ctypes.c_void_p,
class binary_relation_detector_trainer(object):
    """Collects positive and negative relation examples and trains a
    binary_relation_detector from them.

    The wrapped C object is released with mitie_free when the Python object is
    destroyed.
    """

    def __init__(self, relation_name, ner):
        """Create a trainer for the relation called relation_name.

        ner is the named entity extractor whose _obj pointer is handed to the
        library.  Raises Exception if the library cannot create the trainer."""
        self.__mitie_free = _f.mitie_free
        # Defined up front so __del__ is safe even when creation raises below.
        self.__obj = None
        relation_name = to_bytes(relation_name)
        self.__obj = _f.mitie_create_binary_relation_trainer(relation_name, ner._obj)
        if self.__obj is None:
            raise Exception("Unable to create binary_relation_detector_trainer")

    def __del__(self):
        # Nothing to release when construction failed before an object was obtained.
        if self.__obj is not None:
            self.__mitie_free(self.__obj)

    @property
    def num_positive_examples(self):
        """Number of positive examples added so far, as reported by the library."""
        return _f.mitie_binary_relation_trainer_num_positive_examples(self.__obj)

    @property
    def num_negative_examples(self):
        """Number of negative examples added so far, as reported by the library."""
        return _f.mitie_binary_relation_trainer_num_negative_examples(self.__obj)

    @staticmethod
    def _prepare_relation(caller, tokens, arg1, arg2):
        """Validate arg1/arg2 and crop tokens to a window around them.

        Returns (ctokens, arg1_start, arg1_length, arg2_start, arg2_length) with
        the start positions re-based onto the cropped window.  caller is the
        public method name used in the overlap error message."""
        if len(arg1) == 0 or len(arg2) == 0 or not _range_is_valid(tokens, arg1) or not _range_is_valid(tokens, arg2):
            raise Exception("One of the ranges given to this function was invalid.")
        arg1_start = min(arg1)
        arg1_length = len(arg1)
        arg2_start = min(arg2)
        arg2_length = len(arg2)
        if _f.mitie_entities_overlap(arg1_start, arg1_length, arg2_start, arg2_length) == 1:
            # %s formatting is required: the ranges cannot be concatenated to a string directly.
            raise Exception("Error, %s() called with overlapping entities: %s, %s"
                            % (caller, str(arg1), str(arg2)))
        r = _get_windowed_range(tokens, arg1, arg2)
        offset = min(r)
        ctokens = python_to_mitie_str_array(tokens, r)
        return ctokens, arg1_start - offset, arg1_length, arg2_start - offset, arg2_length

    def add_positive_binary_relation(self, tokens, arg1, arg2):
        """Add a positive example: arg1 and arg2 are ranges over tokens that stand
        in the relation being trained.

        Raises Exception if a range is empty or leaves tokens, if the ranges
        overlap, or if the library fails to add the example."""
        ctokens, arg1_start, arg1_length, arg2_start, arg2_length = self._prepare_relation(
            "add_positive_binary_relation", tokens, arg1, arg2)
        if _f.mitie_add_positive_binary_relation(self.__obj, ctokens, arg1_start, arg1_length,
                                                 arg2_start, arg2_length) != 0:
            raise Exception("Unable to add positive binary relation to "
                            "binary_relation_detector_trainer.")

    def add_negative_binary_relation(self, tokens, arg1, arg2):
        """Add a negative example: arg1 and arg2 are ranges over tokens that do
        not stand in the relation being trained.

        Raises Exception if a range is empty or leaves tokens, if the ranges
        overlap, or if the library fails to add the example."""
        ctokens, arg1_start, arg1_length, arg2_start, arg2_length = self._prepare_relation(
            "add_negative_binary_relation", tokens, arg1, arg2)
        if _f.mitie_add_negative_binary_relation(self.__obj, ctokens, arg1_start, arg1_length,
                                                 arg2_start, arg2_length) != 0:
            raise Exception("Unable to add negative binary relation to binary_relation_detector_trainer.")

    @property
    def beta(self):
        """The trainer's beta parameter.  Must not be set to a negative value."""
        return _f.mitie_binary_relation_trainer_get_beta(self.__obj)

    @beta.setter
    def beta(self, value):
        if value < 0:
            raise Exception("Invalid beta value given. beta can't be negative.")
        _f.mitie_binary_relation_trainer_set_beta(self.__obj, value)

    @property
    def num_threads(self):
        """The trainer's thread count setting."""
        return _f.mitie_binary_relation_trainer_get_num_threads(self.__obj)

    @num_threads.setter
    def num_threads(self, value):
        _f.mitie_binary_relation_trainer_set_num_threads(self.__obj, value)

    def train(self):
        """Train and return a binary_relation_detector.

        Raises Exception if positive or negative examples are missing or the
        library returns NULL."""
        if self.num_positive_examples == 0 or self.num_negative_examples == 0:
            raise Exception("You must give both positive and negative training examples before you call train().")
        # Test the raw result: ctypes reports a NULL c_void_p return as None, while a
        # c_void_p instance wrapped around it would never compare identical to None.
        trained = _f.mitie_train_binary_relation_detector(self.__obj)
        if trained is None:
            raise Exception("Unable to create binary_relation_detector. Probably ran out of RAM")
        # Make the type be a c_void_p so the binary_relation_detector constructor will know what to do.
        return binary_relation_detector(ctypes.c_void_p(trained))
##############################################################################
# C signatures of the text categorizer functions used by text_categorizer and
# text_categorizer_trainer below.
_f.mitie_create_text_categorizer_trainer.restype = ctypes.c_void_p
_f.mitie_create_text_categorizer_trainer.argtypes = ctypes.c_char_p,
# (trainer pointer, token array, label)
_f.mitie_add_text_categorizer_labeled_text.restype = ctypes.c_int
_f.mitie_add_text_categorizer_labeled_text.argtypes = ctypes.c_void_p, ctypes.c_void_p, ctypes.c_char_p
# Model loading.  text_categorizer treats a 0 result from the check function as "pure model".
_f.mitie_check_text_categorizer_pure_model.restype = ctypes.c_int
_f.mitie_check_text_categorizer_pure_model.argtypes = ctypes.c_char_p,
_f.mitie_load_text_categorizer.restype = ctypes.c_void_p
_f.mitie_load_text_categorizer.argtypes = ctypes.c_char_p,
_f.mitie_load_text_categorizer_pure_model.restype = ctypes.c_void_p
_f.mitie_load_text_categorizer_pure_model.argtypes = ctypes.c_char_p, ctypes.c_char_p
_f.mitie_load_text_categorizer_pure_model_without_feature_extractor.restype = ctypes.c_void_p
_f.mitie_load_text_categorizer_pure_model_without_feature_extractor.argtypes = ctypes.c_char_p,
# Saving takes (filename, categorizer pointer), exactly like the named entity
# extractor save functions.  Both arguments must be declared: an undeclared
# pointer argument is converted by ctypes as a C int, which does not preserve a
# 64 bit address.
_f.mitie_save_text_categorizer.restype = ctypes.c_int
_f.mitie_save_text_categorizer.argtypes = ctypes.c_char_p, ctypes.c_void_p
_f.mitie_save_text_categorizer_pure_model.restype = ctypes.c_int
_f.mitie_save_text_categorizer_pure_model.argtypes = ctypes.c_char_p, ctypes.c_void_p
_f.mitie_text_categorizer_trainer_get_beta.restype = ctypes.c_double
_f.mitie_text_categorizer_trainer_get_beta.argtypes = ctypes.c_void_p,
_f.mitie_text_categorizer_trainer_get_num_threads.restype = ctypes.c_ulong
_f.mitie_text_categorizer_trainer_get_num_threads.argtypes = ctypes.c_void_p,
_f.mitie_text_categorizer_trainer_set_beta.restype = None
_f.mitie_text_categorizer_trainer_set_beta.argtypes = ctypes.c_void_p, ctypes.c_double
_f.mitie_text_categorizer_trainer_set_num_threads.restype = None
_f.mitie_text_categorizer_trainer_set_num_threads.argtypes = ctypes.c_void_p, ctypes.c_ulong
_f.mitie_text_categorizer_trainer_size.restype = ctypes.c_ulong
_f.mitie_text_categorizer_trainer_size.argtypes = ctypes.c_void_p,
_f.mitie_train_text_categorizer.restype = ctypes.c_void_p
_f.mitie_train_text_categorizer.argtypes = ctypes.c_void_p,
# (categorizer pointer, token array, out: label, out: score[, feature extractor pointer]);
# the caller treats a non-zero result as failure.
_f.mitie_categorize_text.restype = ctypes.c_ulong
_f.mitie_categorize_text.argtypes = (ctypes.c_void_p, ctypes.c_void_p,
                                     ctypes.POINTER(ctypes.POINTER(ctypes.c_char_p)),
                                     ctypes.POINTER(ctypes.c_double))
_f.mitie_categorize_text_with_extractor.restype = ctypes.c_ulong
_f.mitie_categorize_text_with_extractor.argtypes = (ctypes.c_void_p, ctypes.c_void_p,
                                                    ctypes.POINTER(ctypes.POINTER(ctypes.c_char_p)),
                                                    ctypes.POINTER(ctypes.c_double), ctypes.c_void_p)
class text_categorizer:
    """Python handle for a MITIE text categorizer.

    The wrapped C object is released with mitie_free when the Python object is
    destroyed.
    """

    def __init__(self, filename, fe_filename=None):
        """Load a categorizer from filename, or wrap the ctypes.c_void_p passed in
        its place.  fe_filename names the feature extractor file and is only
        used when filename holds a pure model.

        Raises Exception if the model cannot be loaded."""
        self.__mitie_free = _f.mitie_free
        if isinstance(filename, ctypes.c_void_p):
            self.__obj = filename
        else:
            filename = to_bytes(filename)
            is_pure_model = _f.mitie_check_text_categorizer_pure_model(filename) == 0
            if not is_pure_model:
                self.__obj = _f.mitie_load_text_categorizer(filename)
            elif fe_filename is None:
                self.__obj = _f.mitie_load_text_categorizer_pure_model_without_feature_extractor(filename)
            else:
                fe_filename = to_bytes(fe_filename)
                self.__obj = _f.mitie_load_text_categorizer_pure_model(filename, fe_filename)
        if self.__obj is None:
            raise Exception("Unable to load text_categorizer detector from " + to_default_str_type(filename))

    def __del__(self):
        self.__mitie_free(self.__obj)

    def save_to_disk(self, filename, pure_model=False):
        """Write this categorizer to filename.  It can be read back with

            tcat = text_categorizer(filename)

        With pure_model==True the saved file does NOT contain a serialised
        feature extractor object.  The file is much smaller, but reading it back
        also requires the name of the feature extractor file that was used when
        training the model:

            tcat = text_categorizer(filename,fe_filename)

        Raises Exception if the library reports a failure.
        """
        filename = to_bytes(filename)
        save = _f.mitie_save_text_categorizer_pure_model if pure_model else _f.mitie_save_text_categorizer
        if save(filename, self.__obj) != 0:
            raise Exception("Unable to save text_categorizer to the file " + to_default_str_type(filename))

    def __call__(self, tokens, feature_extractor=None):
        """Categorise the text given as tokens (e.g. the output of tokenize()) and
        return a (predicted label, confidence score) pair.  If feature_extractor
        is a total_word_feature_extractor it is handed to the library as well.

        Raises Exception if the library cannot classify the text."""
        ctokens = python_to_mitie_str_array(tokens)
        # Out-parameters filled in by the library.
        label = ctypes.POINTER(ctypes.c_char_p)()
        score = ctypes.c_double()
        if feature_extractor is not None and isinstance(feature_extractor, total_word_feature_extractor):
            status = _f.mitie_categorize_text_with_extractor(self.__obj, ctokens, ctypes.byref(label),
                                                             ctypes.byref(score), feature_extractor._obj)
        else:
            status = _f.mitie_categorize_text(self.__obj, ctokens, ctypes.byref(label), ctypes.byref(score))
        if status != 0:
            raise Exception("Unable to classify text.")
        # Reinterpret the received pointer as a C string, copy the values out,
        # then hand the label buffer back to the library.
        label = ctypes.cast(label, ctypes.c_char_p)
        _label = label.value
        _score = score.value
        _f.mitie_free(label)
        return to_default_str_type(_label), _score
class text_categorizer_trainer(object):
    """Wrapper around a native MITIE text categorizer trainer.

    Labeled token sequences are added with add_labeled_text() and train()
    produces a text_categorizer from them.
    """

    # Class-level defaults so that __del__ is safe even when __init__ raised
    # before the instance attributes were assigned.
    __obj = None
    __mitie_free = None

    def __init__(self, filename):
        """Create a trainer from the file named by filename.

        Raises Exception if the native trainer can't be created.
        """
        # Assigned first so the instance is fully usable by __del__ even if
        # the native constructor call below raises.
        self.__mitie_free = _f.mitie_free
        filename = to_bytes(filename)
        self.__obj = _f.mitie_create_text_categorizer_trainer(filename)
        if self.__obj is None:
            raise Exception("Unable to create text_categorizer_trainer based on " + to_default_str_type(filename))

    def __del__(self):
        """Release the native trainer, if one was ever acquired."""
        if self.__obj is not None and self.__mitie_free is not None:
            self.__mitie_free(self.__obj)

    @property
    def size(self):
        """Size reported by the native trainer; train() refuses to run when it is 0."""
        return _f.mitie_text_categorizer_trainer_size(self.__obj)

    def add_labeled_text(self, tokens, label):
        """Add one training example: a sequence of tokens and its label.

        Raises Exception if the native call reports a failure.
        """
        label = to_bytes(label)
        ctokens = python_to_mitie_str_array(tokens)
        if _f.mitie_add_text_categorizer_labeled_text(self.__obj, ctokens, label) != 0:
            raise Exception("Unable to add labeled text to training instance. Probably ran out of RAM.")

    @property
    def beta(self):
        """The trainer's beta parameter, as reported by the native library."""
        return _f.mitie_text_categorizer_trainer_get_beta(self.__obj)

    @beta.setter
    def beta(self, value):
        # Negative values are rejected here, before reaching the native call.
        if value < 0:
            raise Exception("Invalid beta value given. beta can't be negative.")
        _f.mitie_text_categorizer_trainer_set_beta(self.__obj, value)

    @property
    def num_threads(self):
        """The trainer's num_threads setting, as reported by the native library."""
        return _f.mitie_text_categorizer_trainer_get_num_threads(self.__obj)

    @num_threads.setter
    def num_threads(self, value):
        _f.mitie_text_categorizer_trainer_set_num_threads(self.__obj, value)

    def train(self):
        """Train and return a text_categorizer.

        Raises Exception if the trainer is empty or the native training call
        does not return an object.
        """
        if self.size == 0:
            raise Exception("You can't call train() on an empty trainer.")
        # Test the raw return value: once wrapped in c_void_p the object is
        # never None, so a NULL result would otherwise go unnoticed.
        handle = _f.mitie_train_text_categorizer(self.__obj)
        if not handle:
            raise Exception("Unable to create text_categorizer. Probably ran out of RAM")
        return text_categorizer(ctypes.c_void_p(handle))
##############################################################################
# ctypes prototypes for the total_word_feature_extractor entry points.

# Loader: takes a file name, returns an opaque handle.
_f.mitie_load_total_word_feature_extractor.argtypes = (ctypes.c_char_p,)
_f.mitie_load_total_word_feature_extractor.restype = ctypes.c_void_p

# Accessors: each takes the opaque handle and returns an unsigned long.
_f.mitie_total_word_feature_extractor_fingerprint.argtypes = (ctypes.c_void_p,)
_f.mitie_total_word_feature_extractor_fingerprint.restype = ctypes.c_ulong

_f.mitie_total_word_feature_extractor_num_dimensions.argtypes = (ctypes.c_void_p,)
_f.mitie_total_word_feature_extractor_num_dimensions.restype = ctypes.c_ulong

_f.mitie_total_word_feature_extractor_num_words_in_dictionary.argtypes = (ctypes.c_void_p,)
_f.mitie_total_word_feature_extractor_num_words_in_dictionary.restype = ctypes.c_ulong
class total_word_feature_extractor(object):
    """Wrapper around a native MITIE total_word_feature_extractor handle."""

    # Class-level defaults so that __del__ and _obj are safe even when
    # __init__ raised before the instance attributes were assigned.
    __obj = None
    __mitie_free = None

    def __init__(self, filename):
        """Load a feature extractor from disk or adopt an existing native handle.

        filename -- path of the saved extractor, or a ctypes.c_void_p handle.

        Raises Exception if the handle is NULL or the file can't be loaded.
        """
        self.__mitie_free = _f.mitie_free

        # Check for a raw handle before any string conversion is attempted.
        if isinstance(filename, ctypes.c_void_p):
            # A c_void_p instance is never None itself, so a NULL handle has
            # to be detected through its value.
            if filename.value is None:
                raise Exception("Unable to create total_word_feature_extractor from a NULL handle.")
            self.__obj = filename
            return

        filename = to_bytes(filename)
        self.__obj = _f.mitie_load_total_word_feature_extractor(filename)
        if self.__obj is None:
            raise Exception("Unable to load total_word_feature_extractor detector from " + to_default_str_type(filename))

    def __del__(self):
        """Release the native object, if one was ever acquired."""
        if self.__obj is not None and self.__mitie_free is not None:
            self.__mitie_free(self.__obj)

    @property
    def _obj(self):
        """The underlying native handle (passed to native calls by text_categorizer)."""
        return self.__obj

    @property
    def fingerprint(self):
        """Fingerprint value reported by the native library for this extractor."""
        return _f.mitie_total_word_feature_extractor_fingerprint(self.__obj)

    @property
    def num_dimensions(self):
        """Length of the vectors returned by get_feature_vector()."""
        return _f.mitie_total_word_feature_extractor_num_dimensions(self.__obj)

    @property
    def num_words_in_dictionary(self):
        """Number of dictionary words reported by the native library."""
        return _f.mitie_total_word_feature_extractor_num_words_in_dictionary(self.__obj)

    def get_feature_vector(self, word):
        """Return the feature vector of word as a list of num_dimensions floats.

        Raises Exception if the native call reports a failure.
        """
        word = to_bytes(word)
        num_dimensions = self.num_dimensions
        result = (ctypes.c_float * num_dimensions)()
        # The prototype is (re)declared on every call because the pointer
        # type depends on this extractor's num_dimensions.
        _f.mitie_total_word_feature_extractor_get_feature_vector.restype = ctypes.c_int
        _f.mitie_total_word_feature_extractor_get_feature_vector.argtypes = (
            ctypes.c_void_p,
            ctypes.c_char_p,
            ctypes.POINTER(ctypes.c_float * num_dimensions),
        )
        if _f.mitie_total_word_feature_extractor_get_feature_vector(self.__obj, word, ctypes.byref(result)) != 0:
            raise Exception("Unable to get feature vector.")
        # Iterating a ctypes array yields its elements as Python floats.
        return list(result)

    def get_words_in_dictionary(self):
        """Return the list of words in the extractor's dictionary.

        The entries are returned exactly as ctypes yields them from the
        NULL-terminated native array (no to_default_str_type conversion).

        Raises Exception if the native call returns a NULL array.
        """
        _f.mitie_total_word_feature_extractor_get_words_in_dictionary.restype = ctypes.POINTER(ctypes.c_char_p)
        _f.mitie_total_word_feature_extractor_get_words_in_dictionary.argtypes = (ctypes.c_void_p,)
        words = _f.mitie_total_word_feature_extractor_get_words_in_dictionary(self.__obj)
        # With a POINTER restype a NULL result is a falsy pointer object,
        # never None, so test truthiness instead of identity.
        if not words:
            raise Exception("Unable to get words in dictionary.")
        res = []
        try:
            i = 0
            while words[i] is not None:
                res.append(words[i])
                i += 1
        finally:
            # Always hand the native array back, even if copying failed.
            _f.mitie_free(words)
        return res