First add for tested ListDownloader
commit 3f9681b621
README.md (new file)
@@ -0,0 +1,65 @@
ListDownloader
====================


About
--------------------
This program takes a list of file URLs and a destination directory as arguments, and downloads the files either sequentially or in parallel. It can load the whole list at once or process it in parts at a time, and an option controls how many threads/processes are used.


Installation
--------------------

(Installation was prepared for and tested on Debian Jessie.)

You can install the package with pip using

    # pip install listdownloader

Or you can build the installation package yourself from the source using

    python3 setup.py sdist

and then use pip to install the package that will be built in the directory `dist`:

    # pip3 install listdownloader-x.y.z.tar.gz

where x.y.z is the current version of the program.
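For example, with the version set in this commit's `setup.py` (0.1.0), that would be:

    # pip3 install dist/listdownloader-0.1.0.tar.gz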

The program installs the listdownloader package and a script for command-line usage.

Running the script and using the package
----------------------------------------

The script can be executed (globally) using:

    $ downloadlist.py -f file.txt -d destination -t threads -l lines

where:

`file.txt` is the name/path of the text file containing the list of URLs to be downloaded
`destination` is the path to which the files should be downloaded
`threads` is the number of processes to be used to download the URLs simultaneously
`lines` is the number of lines to read from the file and process at a time; 0 reads the whole file at once.
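For example, a hypothetical run that downloads the URLs listed in `urls.txt` into `./downloads`, using 4 processes and reading the list in chunks of 100 lines:

    $ downloadlist.py -f urls.txt -d ./downloads -t 4 -l 100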

You may use the package in your own scripts by importing it:

    import listdownloader

Then you can download a list of files using:

    listdownloader.download_files(URLs, destination, num_threads)

where:

`URLs` is a list of the URLs to be downloaded
`destination` is a string with the path at which the files are to be saved
`num_threads` is the number of threads/processes to use for the download.

You can also download a single file using the function:

    listdownloader.download_file(URL, destination)
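As a minimal end-to-end sketch (the URLs and destination here are hypothetical placeholders):

    import listdownloader

    urls = ["http://example.com/a.txt",
            "http://example.com/b.txt"]

    # fetch both URLs into ./downloads using 2 processes
    listdownloader.download_files(urls, "./downloads", 2)

    # fetch a single file
    listdownloader.download_file("http://example.com/c.txt", "./downloads")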

License
-------
MPL


Author
------
This script was written by Samer Afach (samer@afach.de) for test purposes.

README.txt (new file, 65 lines)
@@ -0,0 +1,65 @@
(Content identical to README.md above.)

bin/downloadlist.py (new file)
@@ -0,0 +1,33 @@
#!/usr/bin/python3

import listdownloader
import argparse
from itertools import islice

if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--file", type=str, dest="filepath", default="",
                        help="Text file that contains links to download, line by line", required=True)
    parser.add_argument("-d", "--destination", type=str, dest="dest", default="",
                        help="Destination path to download the files", required=True)
    parser.add_argument("-t", "--threads", type=int, dest="threads", default=0,
                        help="Number of threads/processes to be used")
    parser.add_argument("-l", "--lines", type=int, dest="numlines", default=0,
                        help="Number of lines to be read as a chunk. 0=Read the whole file.")
    args = parser.parse_args()

    if args.numlines <= 0:  # load the whole file at once
        with open(args.filepath) as f:
            file_lines = f.readlines()
        listdownloader.download_files(file_lines, args.dest, args.threads)

    else:  # read and download the file in chunks of numlines lines
        with open(args.filepath, 'r') as infile:
            while True:
                file_lines = list(islice(infile, args.numlines))
                if len(file_lines) > 0:
                    print(file_lines)
                    listdownloader.download_files(file_lines, args.dest, args.threads)
                else:  # islice returned nothing: end of file
                    break
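The chunk-reading pattern above is easy to try in isolation. A minimal sketch, assuming a file "urls.txt" with one URL per line and an arbitrary chunk size of 3: `islice` consumes up to that many lines from the open handle on each call and yields nothing once the file is exhausted.

    from itertools import islice

    with open("urls.txt") as f:
        while True:
            chunk = list(islice(f, 3))  # read up to 3 lines
            if not chunk:  # empty list means end of file
                break
            print(chunk)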

listdownloader/__init__.py (new file)
@@ -0,0 +1 @@
from listdownloader.downloader import *

listdownloader/downloader.py (new file)
@@ -0,0 +1,191 @@
import urllib
import urllib.request
import urllib.error
import os
import sys
import hashlib
import random
import datetime
import multiprocessing as mp
import errno
import re

random.seed(datetime.datetime.now())


def mkdir_p(path):
    """
    Create a directory, including missing parents, and don't raise if it
    already exists (like `mkdir -p`)
    :param path: directory path
    :return: None
    """
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise


def md5_file(file_name):
    """
    Calculate the md5 of a file
    :param file_name: path of the file for which the md5 is to be calculated
    :return: md5 hex string
    """
    hash_md5 = hashlib.md5()
    with open(file_name, "rb") as f:
        # read in 4 KiB chunks so large files don't have to fit in memory
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()


def get_random_hash(bits=32):
    """
    Create a random hex string with "bits" bits of randomness
    :param bits: number of bits of the hash (must be a multiple of 8)
    :return: the hash value as a hex string, two characters per byte
    """
    assert bits % 8 == 0
    required_length = bits // 8 * 2
    # hex() output starts with "0x"; slice it off rather than lstrip('0x'),
    # which would also strip legitimate leading zero digits
    s = hex(random.getrandbits(bits))[2:]
    if len(s) < required_length:
        # the drawn number had leading zero bits, so the string is short;
        # draw again until the full length is reached
        return get_random_hash(bits)
    else:
        return s


def files_are_same(file1, file2):
    """
    Checks whether two files in two paths are the same.
    First by checking their sizes. If the sizes are equal, an MD5 checksum is calculated.
    :param file1: first file
    :param file2: second file
    :return: True if files are the same, False otherwise
    """
    if file1 == file2:
        return True

    size1 = os.path.getsize(file1)
    size2 = os.path.getsize(file2)
    if size1 != size2:
        return False
    else:
        md5file1 = md5_file(file1)
        md5file2 = md5_file(file2)
        return md5file1 == md5file2


def rename_file_with_number(file_name, num):
    """
    Returns the same file name, but with a number added before the extension.
    myfile.doc becomes myfile_1.doc (for num=1) or myfile_num.doc in general
    :param file_name: file name to be modified
    :param num: number to add to the file name
    :return: the modified file name
    """
    file_name_parts = file_name.split(".")
    # join with "." (not "") so names containing several dots keep them
    return ".".join(file_name_parts[0:-1]) + "_" + str(num) + "." + file_name_parts[-1]


def download_file(url, to_dir):

    """
    A function that downloads a file to a specific directory.
    If a file with the same name already exists, it tries to add a number to the file name.
    If the existing file is the same file, the file is not saved again.
    The check whether it's the same file is done through file size and md5 checksum
    :param url: url of the file to be downloaded
    :param to_dir: directory to which the file should be downloaded
    :return: None
    """

    file_name = os.path.normpath(url.split('/')[-1]).replace(" ", "")
    file_name = re.sub(r'[^\w_.)( -]', '', file_name)  # remove invalid characters from the file name
    try:
        with urllib.request.urlopen(url) as response:
            data = response.read()
            info = response.info()
            # use the MIME subtype (e.g. "html", "jpeg") as a fallback extension
            ext = info.get_content_subtype()
            if len(file_name) < len(ext) or file_name[-len(ext):] != ext:  # add the extension if not present
                file_name += "." + ext
    except urllib.error.URLError:
        sys.stderr.write("Skipping invalid URL, or possibly failed to get URL: " + url + "\n")
        return

    target_path = os.path.join(to_dir, file_name)
    # write to a temporary, randomly suffixed file first
    target_temp_path = os.path.join(to_dir, file_name + "_" + get_random_hash())

    with open(target_temp_path, 'wb') as f:
        f.write(data)

    # check if a file with the same name exists already
    if os.path.isfile(target_path):
        # if a file exists, compare it with the downloaded file
        if files_are_same(target_path, target_temp_path):
            # if it's the same file, just remove the temp file and return
            os.remove(target_temp_path)
            return
        else:
            # if it's not the same file, loop over new file names with numbers, and redo the check
            idx = 0
            while True:
                idx += 1
                num_file_name = rename_file_with_number(file_name, idx)
                target_path = os.path.join(to_dir, num_file_name)
                if os.path.isfile(target_path):
                    if files_are_same(target_path, target_temp_path):
                        os.remove(target_temp_path)
                        return
                else:
                    break

    # the downloaded file will not overwrite anything: rename the temp file to its proper name
    os.rename(target_temp_path, target_path)


def _download_files(list_of_urls, to_dir):
    """
    Download a list of urls to a directory sequentially
    :param list_of_urls: list of urls to download
    :param to_dir: destination directory
    :return: None
    """
    for url in list_of_urls:
        download_file(url, to_dir)


def download_files(list_of_urls, to_dir, processes=0):
    """
    Downloads a list of urls in parallel if possible, otherwise sequentially
    :param list_of_urls: list of urls to download
    :param to_dir: destination directory
    :param processes: number of processes/threads; 0 means one per CPU
    :return: None
    """

    # clean spaces, tabs and newlines from the URLs
    list_of_urls = [line.replace(' ', '').replace('\n', '').replace('\t', '') for line in list_of_urls]
    if not os.path.isdir(to_dir):
        mkdir_p(to_dir)
    if processes <= 0:
        try:
            processes = mp.cpu_count()
        except NotImplementedError:
            sys.stderr.write("Unable to determine the number of CPUs for parallelization. Proceeding sequentially. "
                             "Consider inputting the number of CPUs manually.\n")
            _download_files(list_of_urls, to_dir)
            return
    elif processes == 1 or len(list_of_urls) == 1:
        _download_files(list_of_urls, to_dir)
        return
    elif processes > len(list_of_urls):
        # no point in having more workers than URLs
        processes = len(list_of_urls)

    params = [(url, to_dir) for url in list_of_urls]
    pool = mp.Pool(processes)
    pool.starmap(download_file, params)
    pool.close()
    pool.join()
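For reference, a minimal, self-contained sketch of the `Pool.starmap` pattern that `download_files` relies on; the `label` function and its argument values are made up for illustration. Each tuple in the list is unpacked into the worker's parameters, exactly as the `(url, to_dir)` pairs are unpacked into `download_file(url, to_dir)` above.

    import multiprocessing as mp

    def label(url, dest):
        return url + " -> " + dest

    if __name__ == '__main__':
        with mp.Pool(2) as pool:
            # runs label("u1", "/tmp") and label("u2", "/tmp") in worker processes
            print(pool.starmap(label, [("u1", "/tmp"), ("u2", "/tmp")]))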

setup.py (new file)
@@ -0,0 +1,17 @@
from distutils.core import setup
import os

# Force distutils to copy files instead of hard-linking them; deleting
# os.link is a common workaround for filesystems without hard-link support.
# Guard with hasattr so this doesn't raise where os.link is absent (e.g. Windows).
if hasattr(os, 'link'):
    del os.link

setup(
    name="listdownloader",
    version="0.1.0",
    author="Samer Afach",
    author_email="samer@afach.de",
    packages=["listdownloader"],
    include_package_data=True,
    url="https://git.afach.de/samerafach/ListDownloader",
    description="Downloads a list of files",
    install_requires=[],
    scripts=['bin/downloadlist.py']
)