commit 3f9681b62101286bc599ab7bf0f77897c3bfa49a Author: Samer Afach Date: Sat Nov 12 23:25:42 2016 +0100 First add for tested ListDownloader diff --git a/README.md b/README.md new file mode 100644 index 0000000..170219e --- /dev/null +++ b/README.md @@ -0,0 +1,65 @@ +ListDownloader +==================== + +About +-------------------- +This program simply takes a list of files as argument and a directory to download the files, and it downloads them sequentially, or in parallel. The program gives the option to load the whole list, or do parts of the list at a time. An option also is provided for how many threads/processes to be used. + +Installation +-------------------- + +(Installation was prepared for and tested with Debian Jessie.) + +You can install the package with pip using + + # pip install listdownloader + +Or you can create the installation package yourself from the source using + + python3 setup.py sdist + +and then use pip to install the package that will be built in the directory `dist`: + + # pip3 install listdownloader-x.y.z.tar.gz + +where x.y.z is the current version of the program. + +The program installs the package listdownloader and a script file for usage. + +Running the script and using the package +---------------------------------------- + +The script can be executed (globally) using: + + $ downloadlist.py -f file.txt -d destination -t threads -l lines + +where: + `file.txt` is the file name/path with the list of URLs to be downloaded + `destination` is the path, to which the files should be downloaded + `threads` is the number of processes to be used to download the URLs simultaneously + `lines` is the number of lines (URLs) to read from the file and download as one batch. 0 leads to reading the whole file at once.
+ +You may use the package in your own scripts by importing it: + + import listdownloader + +then you can download a list of files using: + + listdownloader.download_files(URLs, destination, num_threads) + +where: + `URLs` is a list of the URLs to be downloaded + `destination` is a string with the path, at which the files have to be saved + `num_threads` is the number of threads/processes to use for the download. + +You can also download a single file using the function: + + listdownloader.download_file(URL, destination) + +License +------- +MPL + +About +----- +This script was written by Samer Afach, samer@afach.de for test purposes. diff --git a/README.txt b/README.txt new file mode 100644 index 0000000..170219e --- /dev/null +++ b/README.txt @@ -0,0 +1,65 @@ +ListDownloader +==================== + +About +-------------------- +This program simply takes a list of files as argument and a directory to download the files, and it downloads them sequentially, or in parallel. The program gives the option to load the whole list, or do parts of the list at a time. An option also is provided for how many threads/processes to be used. + +Installation +-------------------- + +(Installation was prepared for and tested with Debian Jessie.) + +You can install the package with pip using + + # pip install listdownloader + +Or you can create the installation package yourself from the source using + + python3 setup.py sdist + +and then use pip to install the package that will be built in the directory `dist`: + + # pip3 install listdownloader-x.y.z.tar.gz + +where x.y.z is the current version of the program. + +The program installs the package listdownloader and a script file for usage. 
+ +Running the script and using the package +---------------------------------------- + +The script can be executed (globally) using: + + $ downloadlist.py -f file.txt -d destination -t threads -l lines + +where: + `file.txt` is the file name/path with the list of URLs to be downloaded + `destination` is the path, to which the files should be downloaded + `threads` is the number of processes to be used to download the URLs simultaneously + `lines` is the number of lines (URLs) to read from the file and download as one batch. 0 leads to reading the whole file at once. + +You may use the package in your own scripts by importing it: + + import listdownloader + +then you can download a list of files using: + + listdownloader.download_files(URLs, destination, num_threads) + +where: + `URLs` is a list of the URLs to be downloaded + `destination` is a string with the path, at which the files have to be saved + `num_threads` is the number of threads/processes to use for the download. + +You can also download a single file using the function: + + listdownloader.download_file(URL, destination) + +License +------- +MPL + +About +----- +This script was written by Samer Afach, samer@afach.de for test purposes.
diff --git a/bin/downloadlist.py b/bin/downloadlist.py new file mode 100644 index 0000000..caa7b34 --- /dev/null +++ b/bin/downloadlist.py @@ -0,0 +1,33 @@ +#!/usr/bin/python3 + +import listdownloader +import argparse +from itertools import islice + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser.add_argument("-f", "--file", type=str, dest="filepath", default="", + help="Text file that contains links to download, line by line", required=True) + parser.add_argument("-d", "--destination", type=str, dest="dest", default="", + help="Destination path to download the files", required=True) + parser.add_argument("-t", "--threads", type=int, dest="threads", default=0, + help="Number of threads/processes to be used") + parser.add_argument("-l", "--lines", type=int, dest="numlines", default=0, + help="Number of lines to be read as a chunk. 0=Read the whole file.") + args = parser.parse_args() + + if args.numlines <= 0: # load the whole file + with open(args.filepath) as f: + file_lines = f.readlines() + listdownloader.download_files(file_lines, args.dest, args.threads) + + else: # load parts of the file + with open(args.filepath, 'r') as infile: + while True: + file_lines = list(islice(infile, args.numlines)) + if len(file_lines) > 0: + print(file_lines) + listdownloader.download_files(file_lines, args.dest, args.threads) + else: + break diff --git a/listdownloader/__init__.py b/listdownloader/__init__.py new file mode 100644 index 0000000..3c3adf7 --- /dev/null +++ b/listdownloader/__init__.py @@ -0,0 +1 @@ +from listdownloader.downloader import * diff --git a/listdownloader/downloader.py b/listdownloader/downloader.py new file mode 100644 index 0000000..17b3258 --- /dev/null +++ b/listdownloader/downloader.py @@ -0,0 +1,191 @@ +import urllib +import urllib.request +import urllib.error +import os +import sys +import hashlib +import random +import datetime +import multiprocessing as mp +import errno +import re + 
# Seed the module-level RNG from the operating system's entropy source (or the
# current time as a fallback), which is what random.seed() does when called
# without an argument. Passing a datetime object, as was done before, is
# rejected with TypeError on Python 3.11+ and made the module unimportable.
random.seed()


def mkdir_p(path):
    """
    Create a directory together with any missing parents; do nothing if it already exists
    :param path: directory path
    :return: None
    :raises OSError: if the directory cannot be created, e.g. the path exists but is not a directory
    """
    os.makedirs(path, exist_ok=True)


def md5_file(file_name):
    """
    Calculate the md5 checksum of a file, reading it in chunks so large files are not loaded at once
    :param file_name: path of the file, for which md5 is to be calculated
    :return: md5 checksum as a hexadecimal string
    """
    hash_md5 = hashlib.md5()
    with open(file_name, "rb") as f:
        for chunk in iter(lambda: f.read(65536), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()


def get_random_hash(bits=32):
    """
    Create a random hexadecimal string built from "bits" random bits
    :param bits: number of random bits; must be a non-negative multiple of 8
    :return: lower-case hexadecimal string with exactly bits / 4 characters
    :raises ValueError: if bits is negative or not a multiple of 8
    """
    if bits < 0 or bits % 8 != 0:
        raise ValueError("bits must be a non-negative multiple of 8, got " + str(bits))
    if bits == 0:
        return ""
    # Zero-pad to the full width. The previous lstrip('0x') approach stripped
    # leading zero digits and then retried recursively until the first digit
    # happened to be non-zero.
    return "{:0{width}x}".format(random.getrandbits(bits), width=bits // 4)


def files_are_same(file1, file2):
    """
    Check whether two paths hold the same content.
    Sizes are compared first; only if they match are the MD5 checksums calculated.
    :param file1: path of the first file
    :param file2: path of the second file
    :return: True if files are the same, False otherwise
    """
    if file1 == file2:
        return True
    if os.path.getsize(file1) != os.path.getsize(file2):
        return False
    return md5_file(file1) == md5_file(file2)


def rename_file_with_number(file_name, num):
    """
    Return the same file name, but with a number added before the extension.
    myfile.doc becomes myfile_1.doc, archive.tar.gz becomes archive.tar_1.gz
    and a name without extension (README) becomes README_1.
    :param file_name: file name to be renamed
    :param num: number to add to the file name
    :return: renamed file name
    """
    # splitext keeps inner dots intact and copes with names that have no
    # extension; splitting on "." and re-joining dropped the inner dots.
    root, ext = os.path.splitext(file_name)
    return root + "_" + str(num) + ext


def download_file(url, to_dir):
    """
    Download a file to a specific directory.
    If a file with the same name already exists, a number is added to the file name.
    If an identical file already exists, nothing new is kept.
    Identity is checked through file size and md5 checksum.
    URLs that are malformed or cannot be fetched are reported on stderr and skipped.
    :param url: url of the file to be downloaded
    :param to_dir: directory, to which the file should be downloaded (created if missing)
    :return: None
    """
    file_name = os.path.normpath(url.split('/')[-1]).replace(" ", "")
    file_name = re.sub(r'[^\w_.)( -]', '', file_name)  # remove invalid characters from filename

    try:
        with urllib.request.urlopen(url) as response:
            data = response.read()
            ext = response.info().get_content_subtype()
    except (OSError, ValueError):
        # URLError/HTTPError are OSError subclasses; ValueError is raised for
        # malformed input such as an empty string or a URL without a scheme.
        sys.stderr.write("Skipping invalid URL, or possibly failed to get URL: " + url + "\n")
        return

    if file_name in ("", ".", ".."):
        # URLs ending in "/" (or consisting only of stripped characters) yield no usable name
        file_name = "download"
    if not file_name.endswith(ext):  # add file extension if not present
        file_name += "." + ext

    os.makedirs(to_dir, exist_ok=True)
    target_temp_path = os.path.join(to_dir, file_name + "_" + get_random_hash())

    try:
        with open(target_temp_path, 'wb') as f:
            f.write(data)

        # Walk over file_name, file_name_1, file_name_2, ... until a free name
        # is found or an identical, already downloaded file turns up.
        # NOTE(review): the existence check and the rename below are not atomic,
        # so parallel workers saving the same file name may race - confirm
        # whether that matters for the intended use.
        idx = 0
        target_path = os.path.join(to_dir, file_name)
        while os.path.isfile(target_path):
            if files_are_same(target_path, target_temp_path):
                return  # same content is already there; the temp file is removed below
            idx += 1
            target_path = os.path.join(to_dir, rename_file_with_number(file_name, idx))

        # the downloaded file will not overwrite anything: give it its proper name
        os.rename(target_temp_path, target_path)
    finally:
        # never leave the temporary file behind (duplicate found, or an error occurred)
        if os.path.exists(target_temp_path):
            os.remove(target_temp_path)


def _download_files(list_of_urls, to_dir):
    """
    Download list of urls to a directory sequentially
    :param list_of_urls: list of urls to download
    :param to_dir: destination directory
    :return: None
    """
    for url in list_of_urls:
        download_file(url, to_dir)


def download_files(list_of_urls, to_dir, processes=0):
    """
    Download a list of urls in parallel if possible, otherwise sequentially
    :param list_of_urls: list of urls to download; whitespace is stripped and blank entries are ignored
    :param to_dir: destination directory (created if missing)
    :param processes: number of worker processes; 0 or less means "use the number of CPUs"
    :return: None
    """
    # remove every whitespace character (spaces, tabs, \n and also \r from
    # Windows line endings) and drop entries that end up empty
    cleaned = ("".join(line.split()) for line in list_of_urls)
    urls = [url for url in cleaned if url]

    os.makedirs(to_dir, exist_ok=True)
    if not urls:
        return

    if processes <= 0:
        try:
            processes = mp.cpu_count()
        except NotImplementedError:
            sys.stderr.write("Unable to determine the number of CPUs for parallelization. Proceeding sequentially. "
                             "Consider inputting the number of CPUs manually.\n")
            processes = 1

    # never start more workers than there are urls (mp.Pool rejects 0 workers)
    processes = min(processes, len(urls))
    if processes <= 1:
        _download_files(urls, to_dir)
        return

    # starmap blocks until every download has finished; the with-block then
    # releases the worker processes even if a worker raised
    with mp.Pool(processes) as pool:
        pool.starmap(download_file, [(url, to_dir) for url in urls])


# ---------------------------------------------------------------------------
# Remainder of the original patch. It shared its physical line with the end of
# download_files and is retained unchanged (line breaks restored).
# ---------------------------------------------------------------------------
# \ No newline at end of file
# diff --git a/setup.py b/setup.py
# new file mode 100644
# index 0000000..e62a725
# --- /dev/null
# +++ b/setup.py
# @@ -0,0 +1,17 @@
# +from distutils.core import setup
# +import os
# +
# +del os.link
# +
# +setup(
# +    name="listdownloader",
# +    version="0.1.0",
# +    author="Samer Afach",
# +    author_email="samer@afach.de",
# +    packages=["listdownloader"],
# +    include_package_data=True,
# +    url="https://git.afach.de/samerafach/ListDownloader",
# +    description="Downloads a list of files",
# +    install_requires=[],
# +    scripts=['bin/downloadlist.py']
# +)