-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
162 lines (121 loc) · 6.77 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
###############################################################################################
#
# Welcome to PubCrawl. This is the main file for the library.
#
###############################################################################################
import argparse
import os
import sys

from termcolor import colored

from helpers.cli_loader import load_bar
from datasources.arxiv import *
from datasources.pdfs import *
def build_parser():
    """Create the command-line parser for PubCrawl.

    Returns:
        argparse.ArgumentParser: parser exposing the options ``--source``
        (required), ``--file``, ``--category``, ``--local_PDFS``,
        ``--process``, ``--storage_size`` and ``--rows``.
    """
    parser = argparse.ArgumentParser(
        description="PubCrawl: A Python library for crawling and cleaning scientific publications."
    )
    parser.add_argument(
        "-s", "--source", type=str, required=True,
        help="Choose the datasource. Example: -s arxiv",
    )
    parser.add_argument(
        "-f", "--file", type=str,
        help="Load the arXiv dataset from a JSON file. Use the Kaggle arXiv Dataset JSON. Example: -f arxiv-metadata-oai-snapshot.json",
    )
    parser.add_argument(
        "-c", "--category", type=str,
        help='Filter the arXiv dataset by category. Delimit multiple categories AND, OR. Be sure to use quotes. Example: -c "cs.AI AND cs.CL"',
    )
    parser.add_argument(
        "-l", "--local_PDFS", type=str,
        help="Use local PDFs and provide a PDF directory instead of downloading them. Example: -l ./PDFs",
    )
    parser.add_argument(
        "-p", "--process", action="store_true",
        help="Process the arXiv dataset: PDF2TXT, Text Cleaning, ... Example: -p",
    )
    # The example used to read "-s 100", but -s is --source; the flag is -g.
    parser.add_argument(
        "-g", "--storage_size", type=int,
        help="Set the maximum storage size for the arXiv dataset download. Lower storage means longer processing. [GB] Example: -g 100",
    )
    parser.add_argument(
        "-r", "--rows", type=int,
        help="Set the number of rows to be processed. Example: -r 1000",
    )
    return parser


def _fail(message):
    """Print *message* in red and terminate the process with exit status 1.

    Raises:
        SystemExit: always, with code 1.
    """
    print(colored(message, "red"))
    # sys.exit (unlike os._exit) lets the interpreter flush stdio buffers and
    # run cleanup handlers, so the message above is not lost when stdout is
    # piped or redirected.
    sys.exit(1)


def _note(message):
    """Print an informational line in yellow."""
    print(colored(message, "yellow"))


def _headline(message):
    """Print a bold green headline."""
    print(colored(message, "green", attrs=["bold"]))


def _run_arxiv(args):
    """Validate the arguments for the ``arxiv`` source and run that pipeline.

    Steps, in order: filter the metadata JSON, optionally take local PDFs
    into account, then download (and optionally process) the PDFs.
    ``preprocess``, ``process_local_pdfs`` and ``download`` are not defined
    in this file; presumably they come from the ``datasources.arxiv``
    wildcard import -- TODO confirm.

    Args:
        args: parsed command-line namespace from ``build_parser``.

    Raises:
        SystemExit: with code 1 when an integrity check fails or when
            ``download`` returns a falsy value.
    """
    # Integrity checks
    if not args.file:
        _fail("Please provide a valid arXiv metadata JSON file.")
    if not os.path.isfile(args.file):
        _fail("The provided file does not exist.")
    if args.storage_size and not args.process:
        _fail("You have provided a storage limit but do not wish to process the PDFs. This will result in an error.")
    if args.local_PDFS and not os.path.isdir(args.local_PDFS):
        _fail("The provided directory does not exist.")

    # Print the chosen arguments
    print("")
    _note("✓ You have chosen the arXiv Download.")
    _note("✓ The file you have provided is valid.")
    if args.category:
        _note("✓ You have chosen the category / categories: {}".format(args.category))
    if args.process:
        _note("✓ You have chosen to process the data afterwards.")
    if args.storage_size:
        _note("✓ You have chosen a storage size of {} GB.".format(args.storage_size))
    print("")
    _headline("Loading the Metadata...")
    print("")

    # Load the arXiv JSON and filter for the chosen category
    with load_bar(colored("Filtering arXiv Metadata JSON by category...", "yellow")):
        arxiv_metadata_df = preprocess(
            arxiv_kaggle_file=args.file,
            arxiv_category=args.category,
            arxiv_rows=args.rows,
        )
    print("")
    _headline("Successfully loaded the arXiv metadata and saved it to arxiv_metadata.json.")
    print("")

    if args.local_PDFS:
        print("")
        _note("✓ You have chosen to use local PDFs.")
        _note("✓ The directory you have provided is valid.")
        print("")
        _headline("Loading the PDFs...")
        print("")
        # Load the local PDFs
        with load_bar(colored("Loading local PDFs...", "yellow")):
            arxiv_metadata_df = process_local_pdfs(
                arxiv_metadata_df, arxiv_local_PDF_dir=args.local_PDFS
            )
        print("")
        _headline("Successfully loaded the local PDFs.")
        print("")

    # Download the arXiv dataset (this step also runs when local PDFs were given)
    with load_bar(colored("Downloading arXiv PDFs from GCP", "yellow")):
        download_success = download(
            arxiv_metadata_df,
            arxiv_storage_size=args.storage_size,
            arxiv_process=args.process,
        )
    if not download_success:
        _fail("An error occurred during the download and processing phase.")

    print("")
    if args.process:
        _headline("Successfully processed all PDFs to arxiv_fulltext.json")
    else:
        _headline("Successfully downloaded all PDFs to ./tmp/arxiv_pdf")
    print("")

    _headline("Thank you for using PubCrawl. Good bye!")
    _headline("--------------------")
    print("")


def _run_pdfs(args):
    """Validate the arguments for the ``pdfs`` source and load the local PDFs.

    ``local_pdfs`` is not defined in this file; presumably it comes from the
    ``datasources.pdfs`` wildcard import -- TODO confirm.

    Args:
        args: parsed command-line namespace from ``build_parser``.

    Raises:
        SystemExit: with code 1 when no directory was given, the directory
            does not exist, it holds no ``.pdf`` file, or ``-p`` is missing.
    """
    # Integrity checks
    if not args.local_PDFS:
        # Without this guard the run continued with pdf_dir=None while
        # claiming that the directory had been validated.
        _fail("Please provide a directory containing PDFs with the flag -l.")
    if not os.path.isdir(args.local_PDFS):
        _fail("The provided directory does not exist.")
    # Check if there are pdf files in the directory
    if not any(name.endswith(".pdf") for name in os.listdir(args.local_PDFS)):
        _fail("The provided directory does not contain any PDFs.")
    if not args.process:
        _fail("You have not chosen to process the PDFs. This will result in an error. Please add the flag -p.")

    # Print the chosen arguments
    print("")
    _note("✓ You have chosen to process local PDFs.")
    _note("✓ The directory you have provided is valid.")
    _note("✓ The directory contains valid PDF files.")
    print("")

    # Load the local PDFs; the return value is not used by this script.
    with load_bar(colored("Loading local PDFs...", "yellow")):
        local_pdfs(pdf_dir=args.local_PDFS)
    print("")
    _headline("Successfully loaded the local PDFs.")
    print("")


def main(argv=None):
    """Run the PubCrawl command-line interface.

    Args:
        argv: optional list of argument strings; when ``None`` argparse
            reads ``sys.argv[1:]``.

    Raises:
        SystemExit: with code 1 on a failed check or an unknown datasource;
            argparse itself exits on malformed arguments.
    """
    # Welcome message
    _headline("Welcome to PubCrawl!")
    _headline("--------------------")
    print("")

    args = build_parser().parse_args(argv)

    if args.source == "arxiv":
        _run_arxiv(args)
    elif args.source == "pdfs":
        _run_pdfs(args)
    else:
        _fail("Please choose a valid datasource.")


if __name__ == "__main__":
    main()