# script.py
import argparse
import logging
import os
import re
import urllib.parse

import requests
from bs4 import BeautifulSoup
# http://dl3.uploadfdl.com/files/Serial/SpongeBob/S01/1080p x265 10bit/
'''
Django URL validation regex:
print(re.match(regex, "http://www.example.com") is not None)  # True
print(re.match(regex, "example.com") is not None)             # False
'''
regex = re.compile(
    r'^(?:http|ftp)s?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or IP
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)
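# Quick sanity check of the validator (mirrors the docstring examples above):
#   re.match(regex, 'http://www.example.com')  # -> match object (accepted)
#   re.match(regex, 'example.com')             # -> None (no scheme, rejected)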
'''
Set up the command-line argument parser.
'''
parser = argparse.ArgumentParser(description='Download every file with a given extension that a web page links to')
parser.add_argument('url', help='URL of the web page to scrape for links')
parser.add_argument('ext', help='extension of the files to download')
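# Example invocation (the URL and extension here are placeholders):
#   python script.py 'http://example.com/files/' mkv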
'''
logging config
'''
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)
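# With this config a log line looks roughly like:
#   2023-01-01 12:00:00,000 - Downloading file: episode01.mkv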
def get_abs_url(start, append):
    # Join the page URL and a relative href with exactly one '/' between them.
    if start.endswith('/'):
        start = start[:-1]
    if append.startswith('/'):
        append = append[1:]  # drop the leading '/'
    return f'{start}/{append}'
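# e.g. get_abs_url('http://example.com/files/', '/clip.mkv')
#      -> 'http://example.com/files/clip.mkv' (names here are illustrative)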
def save_file(file_url, fname):
    r = requests.get(file_url, stream=True)
    r.raise_for_status()
    logging.info(f'Downloading file: {fname}')
    with open(fname, 'wb') as file:
        # write one chunk at a time so large files never sit fully in memory
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                file.write(chunk)
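# chunk_size=1024 keeps memory usage flat regardless of file size; a larger
# chunk (e.g. 8192) typically trades a little extra buffering for faster
# downloads of big files.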
def main():
    args = parser.parse_args()
    if re.match(regex, args.url) is None:
        raise Exception('Invalid URL')
    web_page = requests.get(args.url)
    if web_page.status_code != 200:
        raise Exception(f'The GET request to the URL failed with a status code of {web_page.status_code}')
    res_soup = BeautifulSoup(web_page.text, 'html.parser')
    file_links = []  # collected for reference; not used beyond this loop
    for link in res_soup.find_all('a'):
        href = link.get('href')
        # skip anchors without an href and links to other file types
        if href and href.endswith(args.ext):
            # hrefs are usually percent-encoded; decode to get the real file name
            file_name = urllib.parse.unquote(href)
            if os.path.isfile(file_name):
                logging.info(f'File already downloaded, skipping file: {file_name}')
                continue
            file_url = get_abs_url(args.url, href)
            save_file(file_url, file_name)
            file_links.append(file_url)

if __name__ == '__main__':
    main()