First add for tested ListDownloader
This commit is contained in:
commit
3f9681b621
65
README.md
Normal file
65
README.md
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
ListDownloader
|
||||||
|
====================
|
||||||
|
|
||||||
|
About
|
||||||
|
--------------------
|
||||||
|
This program takes as arguments a file containing a list of URLs and a directory to download the files into, and downloads them sequentially or in parallel. The program gives the option to load the whole list at once, or to process parts of the list at a time. An option is also provided to set how many threads/processes are used.
|
||||||
|
|
||||||
|
Installation
|
||||||
|
--------------------
|
||||||
|
|
||||||
|
(Installation was prepared for and tested with Debian Jessie.)
|
||||||
|
|
||||||
|
You can install the package with pip using
|
||||||
|
|
||||||
|
# pip install listdownloader
|
||||||
|
|
||||||
|
Or you can create the installation package yourself from the source using
|
||||||
|
|
||||||
|
python3 setup.py sdist
|
||||||
|
|
||||||
|
and then use pip to install the package that will be built in the directory `dist`:
|
||||||
|
|
||||||
|
# pip3 install listdownloader-x.y.z.tar.gz
|
||||||
|
|
||||||
|
where x.y.z is the current version of the program.
|
||||||
|
|
||||||
|
The program installs the package listdownloader and a script file for usage.
|
||||||
|
|
||||||
|
Running the script and using the package
|
||||||
|
----------------------------------------
|
||||||
|
|
||||||
|
The script can be executed (globally) using:
|
||||||
|
|
||||||
|
$ downloadlist.py -f file.txt -d destination -t threads -l lines
|
||||||
|
|
||||||
|
where:
|
||||||
|
`file.txt` is the file name/path with the list of URLs to be downloaded
|
||||||
|
`destination` is the path, to which the files should be downloaded
|
||||||
|
`threads` is the number of processes to be used to download the URLs simultaneously
|
||||||
|
`lines` is the number of lines to read from the file and process at a time. 0 leads to reading the whole file.
|
||||||
|
|
||||||
|
You may use the package in your own scripts by importing it:
|
||||||
|
|
||||||
|
import listdownloader
|
||||||
|
|
||||||
|
then you can download a list of files using:
|
||||||
|
|
||||||
|
listdownloader.download_files(URLs, destination, num_threads)
|
||||||
|
|
||||||
|
where:
|
||||||
|
`URLs` is a list of the URLs to be downloaded
|
||||||
|
`destination` is a string with the path, at which the files have to be saved
|
||||||
|
`num_threads` is the number of threads/processes to use for the download.
|
||||||
|
|
||||||
|
You can also download a single file using the function:
|
||||||
|
|
||||||
|
listdownloader.download_file(URL, destination)
|
||||||
|
|
||||||
|
License
|
||||||
|
-------
|
||||||
|
MPL
|
||||||
|
|
||||||
|
About
|
||||||
|
-----
|
||||||
|
This script was written by Samer Afach, samer@afach.de for test purposes.
|
65
README.txt
Normal file
65
README.txt
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
ListDownloader
|
||||||
|
====================
|
||||||
|
|
||||||
|
About
|
||||||
|
--------------------
|
||||||
|
This program takes as arguments a file containing a list of URLs and a directory to download the files into, and downloads them sequentially or in parallel. The program gives the option to load the whole list at once, or to process parts of the list at a time. An option is also provided to set how many threads/processes are used.
|
||||||
|
|
||||||
|
Installation
|
||||||
|
--------------------
|
||||||
|
|
||||||
|
(Installation was prepared for and tested with Debian Jessie.)
|
||||||
|
|
||||||
|
You can install the package with pip using
|
||||||
|
|
||||||
|
# pip install listdownloader
|
||||||
|
|
||||||
|
Or you can create the installation package yourself from the source using
|
||||||
|
|
||||||
|
python3 setup.py sdist
|
||||||
|
|
||||||
|
and then use pip to install the package that will be built in the directory `dist`:
|
||||||
|
|
||||||
|
# pip3 install listdownloader-x.y.z.tar.gz
|
||||||
|
|
||||||
|
where x.y.z is the current version of the program.
|
||||||
|
|
||||||
|
The program installs the package listdownloader and a script file for usage.
|
||||||
|
|
||||||
|
Running the script and using the package
|
||||||
|
----------------------------------------
|
||||||
|
|
||||||
|
The script can be executed (globally) using:
|
||||||
|
|
||||||
|
$ downloadlist.py -f file.txt -d destination -t threads -l lines
|
||||||
|
|
||||||
|
where:
|
||||||
|
`file.txt` is the file name/path with the list of URLs to be downloaded
|
||||||
|
`destination` is the path, to which the files should be downloaded
|
||||||
|
`threads` is the number of processes to be used to download the URLs simultaneously
|
||||||
|
`lines` is the number of lines to read from the file and process at a time. 0 leads to reading the whole file.
|
||||||
|
|
||||||
|
You may use the package in your own scripts by importing it:
|
||||||
|
|
||||||
|
import listdownloader
|
||||||
|
|
||||||
|
then you can download a list of files using:
|
||||||
|
|
||||||
|
listdownloader.download_files(URLs, destination, num_threads)
|
||||||
|
|
||||||
|
where:
|
||||||
|
`URLs` is a list of the URLs to be downloaded
|
||||||
|
`destination` is a string with the path, at which the files have to be saved
|
||||||
|
`num_threads` is the number of threads/processes to use for the download.
|
||||||
|
|
||||||
|
You can also download a single file using the function:
|
||||||
|
|
||||||
|
listdownloader.download_file(URL, destination)
|
||||||
|
|
||||||
|
License
|
||||||
|
-------
|
||||||
|
MPL
|
||||||
|
|
||||||
|
About
|
||||||
|
-----
|
||||||
|
This script was written by Samer Afach, samer@afach.de for test purposes.
|
33
bin/downloadlist.py
Normal file
33
bin/downloadlist.py
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
#!/usr/bin/python3

"""Command-line front end for listdownloader.

Reads a text file containing one URL per line and downloads everything to a
destination directory, either all at once or in chunks of -l/--lines lines.
"""

import listdownloader
import argparse
from itertools import islice

if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--file", type=str, dest="filepath", default="",
                        help="Text file that contains links to download, line by line", required=True)
    parser.add_argument("-d", "--destination", type=str, dest="dest", default="",
                        help="Destination path to download the files", required=True)
    parser.add_argument("-t", "--threads", type=int, dest="threads", default=0,
                        help="Number of threads/processes to be used")
    parser.add_argument("-l", "--lines", type=int, dest="numlines", default=0,
                        help="Number of lines to be read as a chunk. 0=Read the whole file.")
    args = parser.parse_args()

    if args.numlines <= 0:  # load the whole file
        with open(args.filepath) as f:
            file_lines = f.readlines()
            listdownloader.download_files(file_lines, args.dest, args.threads)

    else:  # load parts of the file
        with open(args.filepath, 'r') as infile:
            # islice yields at most `numlines` lines per chunk; an empty
            # chunk means end-of-file.  (Removed a leftover debug
            # print(file_lines) that dumped every chunk to stdout.)
            while True:
                file_lines = list(islice(infile, args.numlines))
                if file_lines:
                    listdownloader.download_files(file_lines, args.dest, args.threads)
                else:
                    break
|
1
listdownloader/__init__.py
Normal file
1
listdownloader/__init__.py
Normal file
@ -0,0 +1 @@
|
|||||||
|
from listdownloader.downloader import *
|
191
listdownloader/downloader.py
Normal file
191
listdownloader/downloader.py
Normal file
@ -0,0 +1,191 @@
|
|||||||
|
import urllib
|
||||||
|
import urllib.request
|
||||||
|
import urllib.error
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import hashlib
|
||||||
|
import random
|
||||||
|
import datetime
|
||||||
|
import multiprocessing as mp
|
||||||
|
import errno
|
||||||
|
import re
|
||||||
|
|
||||||
|
random.seed(datetime.datetime.now())
|
||||||
|
|
||||||
|
|
||||||
|
def mkdir_p(path):
    """
    Create a directory and any missing parent directories (like ``mkdir -p``).

    Does not raise if the directory already exists.

    :param path: directory path
    :return: None
    :raises OSError: on any failure other than the directory already existing
    """
    try:
        os.makedirs(path)
    except OSError as exc:
        # An existing *directory* is fine; anything else (permissions,
        # an existing regular file, ...) is a genuine error.
        if not (exc.errno == errno.EEXIST and os.path.isdir(path)):
            raise
|
||||||
|
|
||||||
|
|
||||||
|
def md5_file(file_name):
    """
    Compute the MD5 hex digest of a file's contents.

    The file is read in 4 KiB chunks so large files never have to fit in
    memory at once.

    :param file_name: path of the file to hash
    :return: md5 string (hex digest)
    """
    digest = hashlib.md5()
    with open(file_name, "rb") as handle:
        chunk = handle.read(4096)
        while chunk:
            digest.update(chunk)
            chunk = handle.read(4096)
    return digest.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def get_random_hash(bits=32):
    """
    Create a random lowercase hex string encoding ``bits`` random bits.

    The result is always zero-padded to its full width (``bits // 4`` hex
    digits), so e.g. ``bits=32`` always yields 8 characters.

    :param bits: number of bits of the hash; must be a multiple of 8
    :return: the random hex string
    """
    assert bits % 8 == 0
    # Zero-pad with a format spec instead of re-drawing until the random
    # value happens to be wide enough.  The old hex()/lstrip('0x') approach
    # recursed on short values and mangled the value 0 ('0x0' -> '').
    return format(random.getrandbits(bits), '0{}x'.format(bits // 4))
|
||||||
|
|
||||||
|
|
||||||
|
def files_are_same(file1, file2):
    """
    Check whether two paths contain identical file content.

    A cheap size comparison is done first; only when the sizes match is an
    MD5 checksum of each file computed and compared.

    :param file1: first file path
    :param file2: second file path
    :return: True if the files are the same, False otherwise
    """
    # Identical paths trivially refer to the same file.
    if file1 == file2:
        return True

    # Different sizes cannot be the same content; skip hashing entirely.
    if os.path.getsize(file1) != os.path.getsize(file2):
        return False

    return md5_file(file1) == md5_file(file2)
|
||||||
|
|
||||||
|
|
||||||
|
def rename_file_with_number(file_name, num):
    """
    Return the same file name, but with a number added before the extension.

    myfile.doc becomes myfile_1.doc or myfile_num.doc

    :param file_name: file to be renamed
    :param num: number to add to filename
    :return: renamed file name
    """
    # os.path.splitext keeps interior dots intact (the old split/"".join
    # dropped them, turning "a.b.c" into "ab_1.c") and copes with names
    # that have no extension at all.
    root, ext = os.path.splitext(file_name)
    return root + "_" + str(num) + ext
|
||||||
|
|
||||||
|
|
||||||
|
def download_file(url, to_dir):

    """
    A function that downloads a file to a specific directory.
    If the file already exists, it tries to add a number to the file name.
    If the same file exists, the file is not downloaded again.
    The check whether it's the same file is done through file size and md5 checksum
    :param url: url of the file to be downloaded
    :param to_dir: directory, to which the file should be downloaded
    :return: None
    """

    # Derive a local file name from the last path segment of the URL.
    file_name = os.path.normpath(url.split('/')[-1]).replace(" ","")
    file_name = re.sub('[^\w_.)( -]', '', file_name)  # remove invalid characters from filename
    try:
        with urllib.request.urlopen(url) as response:
            # NOTE(review): the whole response is buffered in memory, so RAM
            # use grows with the size of the downloaded file.
            data = response.read()
            info = response.info()
            # Content-Type subtype (e.g. "jpeg" for image/jpeg) is used as a
            # fallback extension when the name doesn't already end with it.
            ext = info.get_content_subtype()
            if len(file_name) < len(ext) or file_name[-len(ext):] != ext:  # add file extension if not present
                file_name += "." + ext
    except urllib.error.URLError:
        # Best-effort behavior: report the failure and skip this URL.
        sys.stderr.write("Skipping invalid URL, or possibly failed to get URL: " + url)
        return

    target_path = os.path.join(to_dir, file_name)
    # Write to a randomized temp name first so an existing file is never
    # clobbered before the duplicate check below has run.
    target_temp_path = os.path.join(to_dir, file_name + "_" + get_random_hash())

    with open(target_temp_path, 'wb') as f:
        f.write(data)

    # check if a file with the same name exists already
    if os.path.isfile(target_path):
        # if file exists, compare with the downloaded file
        if files_are_same(target_path, target_temp_path):
            # if it's the same file, just remove the temp file and return
            os.remove(target_temp_path)
            return
        else:
            # if it's not the same file, loop over new file names with numbers, and redo the file name check
            idx = 0
            while True:
                idx += 1
                num_file_name = rename_file_with_number(file_name, idx)
                target_path = os.path.join(to_dir, num_file_name)
                if os.path.isfile(target_path):
                    if files_are_same(target_path, target_temp_path):
                        # Numbered name holds the same content: drop the temp.
                        os.remove(target_temp_path)
                        return
                    # Numbered name exists with different content: try idx+1.
                else:
                    # Free numbered name found; fall through to the rename.
                    break

    # if the downloaded file will not overwrite anything, rename the temp file to its proper name
    os.rename(target_temp_path, target_path)
|
||||||
|
|
||||||
|
|
||||||
|
def _download_files(list_of_urls, to_dir):
    """
    Download a list of URLs to a directory one after the other (sequentially).

    :param list_of_urls: list of URLs to download
    :param to_dir: destination directory
    :return: None
    """
    for single_url in list_of_urls:
        download_file(single_url, to_dir)
|
||||||
|
|
||||||
|
|
||||||
|
def download_files(list_of_urls, to_dir, processes=0):
    """
    Downloads a list of urls in parallel if possible, otherwise sequentially
    :param list_of_urls: list of urls to download
    :param to_dir: destination directory
    :param processes: number of processes/threads; <= 0 means "use cpu_count()"
    :return: None
    """

    # clean spaces, tabs and new-lines
    list_of_urls = [line.replace(' ', '').replace('\n', '').replace('\t', '') for line in list_of_urls]
    # Create the destination directory on demand (mkdir -p semantics).
    if not os.path.isdir(to_dir):
        mkdir_p(to_dir)
    if processes <= 0:
        try:
            # Auto-detect parallelism; on success this deliberately falls
            # through to the pool code below.
            processes = mp.cpu_count()
        except NotImplementedError as e:
            sys.stderr.write("Unable to determine the number of CPUs for parallelization. Proceeding sequentially. "
                             "Consider inputting the number of CPUs manually.\n")
            _download_files(list_of_urls, to_dir)
            return
    elif processes == 1 or len(list_of_urls) == 1:
        # Nothing to parallelize: avoid the pool overhead entirely.
        _download_files(list_of_urls, to_dir)
        return
    elif processes > len(list_of_urls):
        # No point in having more workers than URLs.
        processes = len(list_of_urls)

    # One (url, destination) tuple per worker invocation.
    params = [(list_of_urls[i], to_dir) for i in range(len(list_of_urls))]
    pool = mp.Pool(processes)
    pool.starmap(download_file, params)
    pool.close()
    pool.join()
|
17
setup.py
Normal file
17
setup.py
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
from distutils.core import setup
import os

# Force distutils to copy files instead of hard-linking (works around
# failures on filesystems without hard-link support, e.g. VM shared
# folders).  Guarded so platforms that don't expose os.link at all
# don't crash with AttributeError on the unconditional `del`.
if hasattr(os, 'link'):
    del os.link

setup(
    name="listdownloader",
    version="0.1.0",
    author="Samer Afach",
    author_email="samer@afach.de",
    packages=["listdownloader"],
    include_package_data=True,
    url="https://git.afach.de/samerafach/ListDownloader",
    description="Downloads a list of files",
    install_requires=[],
    scripts=['bin/downloadlist.py']
)
|
Loading…
Reference in New Issue
Block a user