sentence_search.py
import argparse
import json
import os
import random
import re
import sys

import tqdm

def convert_sentence(list_sentences, max_words=20):
    # Normalise each sentence in place. Items are dicts with keys
    # 'sentence' and 'paper_id'; a 'normalised' key is added when the
    # cleaned sentence is at most max_words words long.
    for dict_sentence in list_sentences:
        sentence = dict_sentence['sentence']
        sentence = sentence.replace('CITE', ' --citation-- ')
        sentence = sentence.lower()
        sentence = re.sub(r'[\n\t]', ' ', sentence)        # newlines/tabs -> spaces
        sentence = re.sub(r'[^a-z0-9\-_ ]', '', sentence)  # drop punctuation
        sentence = re.sub(r'\d+', '[N]', sentence)         # mask numbers
        sentence = re.sub(r'(--citation-- +)+', '[CITATION]', sentence)  # collapse citation runs
        sentence = re.sub(r' +', ' ', sentence)            # squeeze repeated spaces
        sentence = sentence.strip()
        if len(sentence.split(' ')) <= max_words:
            dict_sentence['normalised'] = sentence
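
# A sketch of what convert_sentence produces; the input sentence here is
# ours for illustration, not taken from the corpus:
#   {'sentence': 'We evaluated 3 models CITE.'}
#   -> gains 'normalised': 'we evaluated [N] models [CITATION]'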

def search_FE(FE, list_sentences, max_sentences=5):
    # Return up to max_sentences randomly sampled normalised sentences
    # that contain the formulaic expression FE ('*' acts as a wildcard).
    sentences = []
    FE = ' ' + FE + ' '          # pad so matches align on word boundaries
    FE = FE.replace('*', '.+?')  # '*' -> non-greedy regex wildcard
    for sentence in random.sample(list_sentences, len(list_sentences)):  # shuffled copy
        if 'normalised' not in sentence:
            continue
        if re.search(FE, ' ' + sentence['normalised'] + ' '):
            sentences.append(sentence)
            if len(sentences) == max_sentences:
                break
    return sentences
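
# A quick sketch of the '*' wildcard on hypothetical data:
#   search_FE('it is * that', [{'normalised': 'it is likely that x holds', ...}])
#   matches, because ' it is likely that x holds ' satisfies ' it is .+? that '.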

def PMC_select_journal(paper_dir, journal_list_path, journal_title):
    # Collect all sentences from the papers that journal_list maps to the
    # given journal title.
    sentences = []
    with open(journal_list_path) as f:
        journal_list = json.load(f)
    if journal_title in journal_list:
        for paper_path in journal_list[journal_title]:
            with open(os.path.join(paper_dir, paper_path + '.json')) as f:
                paper = json.load(f)
            sentences.extend([{'sentence': sentence, 'paper_id': paper_path}
                              for section in paper['data']
                              for sentence in paper['data'][section]])
    return sentences
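
# Shapes implied by the accesses above (inferred from the code, not from a
# documented schema):
#   journal_list: {journal_title: [paper_path, ...]}
#   <paper_path>.json: {'data': {section_name: [sentence, ...], ...}, ...}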

def load_FEs(path, discipline):
    # Flatten the FE inventory for one discipline: the JSON maps
    # discipline -> communicative function -> [FE, ...].
    with open(path) as f:
        j = json.load(f)
    return [fe for cf in j[discipline] for fe in j[discipline][cf]]
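
# For example, j['Onc'] would map communicative-function labels to lists of
# FE strings; the key names in that file are not shown here, so treat any
# concrete example as hypothetical.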

def make_pairs(FEs, sentences, max_sentences=5):
    # Pair every FE with up to max_sentences example sentences.
    results = {}
    for FE in tqdm.tqdm(FEs):
        results[FE] = search_FE(FE, sentences, max_sentences=max_sentences)
    return results

def PMC_main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--paper-dir')
    parser.add_argument('--journal-list')
    parser.add_argument('--title')
    parser.add_argument('--fe')
    parser.add_argument('--discipline')
    parser.add_argument('--output')
    args = parser.parse_args()
    sentences = PMC_select_journal(args.paper_dir, args.journal_list, args.title)
    convert_sentence(sentences)
    FEs = load_FEs(args.fe, args.discipline)
    results = make_pairs(FEs, sentences)
    with open(args.output, mode='w') as f:
        json.dump(results, f, ensure_ascii=False, indent='\t')
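
# Example invocation for PMC_main (paths and journal title are hypothetical):
#   python sentence_search.py --paper-dir papers/ --journal-list journals.json \
#       --title "PLoS ONE" --fe fe.json --discipline Onc --output out.json
# Note that PMC_main is currently disabled in favour of main; see the bottom
# of the file.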

def main(N, M=8):
    # Process every M-th line (offset N) of each <discipline>.<section> file,
    # so the corpus can be sharded across M parallel jobs.
    directory = sys.argv[1]
    # disciplines = ['CL', 'Chem', 'Onc', 'Psy']  # full set; currently restricted
    disciplines = ['Onc', 'Psy']
    sections = ['introduction', 'methods', 'results', 'discussion']
    for disc in disciplines:
        sentences = []
        for sec in sections:
            with open(os.path.join(directory, disc + '.' + sec)) as f:
                for n, l in enumerate(f):
                    if n % M != N:
                        continue
                    # Tab-separated fields; only the id and the sentence are used.
                    sent_id, c, conf, sentence = l.split('\t')
                    if 'PMC' in sent_id:
                        paper_id = re.search(r'PMC\d+', sent_id).group()
                    else:
                        paper_id = sent_id.split('_')[0]
                    # Unify citation markers before normalisation.
                    sentence = sentence.replace('cite-', 'CITE')
                    sentence = re.sub(r'CITE-p-\d+-\d+-\d+', 'CITE', sentence)
                    sentences.append({
                        'sentence': sentence.strip(),
                        'paper_id': paper_id
                    })
        convert_sentence(sentences)
        FEs = load_FEs(sys.argv[2], disc)
        results = make_pairs(FEs, sentences, 1)
        with open(os.path.join(sys.argv[3], disc + str(N) + '.json'), mode='w') as f:
            json.dump(results, f, ensure_ascii=False, indent='\t')
if __name__=='__main__':
main(int(sys.argv[4]))
# PMC_main()
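
# Example shard invocation for main (paths are hypothetical; N must be in
# the range 0..M-1, with M defaulting to 8):
#   python sentence_search.py <sentence_dir> <fe_json> <output_dir> <N>
# Each of the M parallel jobs handles the lines where line_number % M == N.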