First add for tested ListDownloader
This commit is contained in:
commit
3f9681b621
65
README.md
Normal file
65
README.md
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
ListDownloader
|
||||||
|
====================
|
||||||
|
|
||||||
|
About
|
||||||
|
--------------------
|
||||||
|
This program takes as arguments a file containing a list of URLs and a directory to download the files into, and downloads them sequentially or in parallel. The program gives the option to load the whole list at once, or to process parts of the list at a time. An option is also provided to set how many threads/processes are used.
|
||||||
|
|
||||||
|
Installation
|
||||||
|
--------------------
|
||||||
|
|
||||||
|
(Installation was prepared for and tested with Debian Jessie.)
|
||||||
|
|
||||||
|
You can install the package with pip using
|
||||||
|
|
||||||
|
# pip install listdownloader
|
||||||
|
|
||||||
|
Or you can create the installation package yourself from the source using
|
||||||
|
|
||||||
|
python3 setup.py sdist
|
||||||
|
|
||||||
|
and then use pip to install the package that will be built in the directory `dist`:
|
||||||
|
|
||||||
|
# pip3 install listdownloader-x.y.z.tar.gz
|
||||||
|
|
||||||
|
where x.y.z is the current version of the program.
|
||||||
|
|
||||||
|
The program installs the package listdownloader and a script file for usage.
|
||||||
|
|
||||||
|
Running the script and using the package
|
||||||
|
----------------------------------------
|
||||||
|
|
||||||
|
The script can be executed (globally) using:
|
||||||
|
|
||||||
|
$ downloadlist.py -f file.txt -d destination -t threads -l lines
|
||||||
|
|
||||||
|
where:
|
||||||
|
`file.txt` is the file name/path with the list of URLs to be downloaded
|
||||||
|
`destination` is the path, to which the files should be downloaded
|
||||||
|
`threads` is the number of processes to be used to download the URLs simultaneously
|
||||||
|
`lines` is the number of lines to read from the file and process at a time. 0 leads to reading the whole file.
|
||||||
|
|
||||||
|
You may use the package in your own scripts by importing it:
|
||||||
|
|
||||||
|
import listdownloader
|
||||||
|
|
||||||
|
then you can download a list of files using:
|
||||||
|
|
||||||
|
listdownloader.download_files(URLs, destination, num_threads)
|
||||||
|
|
||||||
|
where:
|
||||||
|
`URLs` is a list of the URLs to be downloaded
|
||||||
|
`destination` is a string with the path, at which the files have to be saved
|
||||||
|
`num_threads` is the number of threads/processes to use for the download.
|
||||||
|
|
||||||
|
You can also download a single file using the function:
|
||||||
|
|
||||||
|
listdownloader.download_file(URL, destination)
|
||||||
|
|
||||||
|
License
|
||||||
|
-------
|
||||||
|
MPL
|
||||||
|
|
||||||
|
About
|
||||||
|
-----
|
||||||
|
This script was written by Samer Afach, samer@afach.de for test purposes.
|
65
README.txt
Normal file
65
README.txt
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
ListDownloader
|
||||||
|
====================
|
||||||
|
|
||||||
|
About
|
||||||
|
--------------------
|
||||||
|
This program takes as arguments a file containing a list of URLs and a directory to download the files into, and downloads them sequentially or in parallel. The program gives the option to load the whole list at once, or to process parts of the list at a time. An option is also provided to set how many threads/processes are used.
|
||||||
|
|
||||||
|
Installation
|
||||||
|
--------------------
|
||||||
|
|
||||||
|
(Installation was prepared for and tested with Debian Jessie.)
|
||||||
|
|
||||||
|
You can install the package with pip using
|
||||||
|
|
||||||
|
# pip install listdownloader
|
||||||
|
|
||||||
|
Or you can create the installation package yourself from the source using
|
||||||
|
|
||||||
|
python3 setup.py sdist
|
||||||
|
|
||||||
|
and then use pip to install the package that will be built in the directory `dist`:
|
||||||
|
|
||||||
|
# pip3 install listdownloader-x.y.z.tar.gz
|
||||||
|
|
||||||
|
where x.y.z is the current version of the program.
|
||||||
|
|
||||||
|
The program installs the package listdownloader and a script file for usage.
|
||||||
|
|
||||||
|
Running the script and using the package
|
||||||
|
----------------------------------------
|
||||||
|
|
||||||
|
The script can be executed (globally) using:
|
||||||
|
|
||||||
|
$ downloadlist.py -f file.txt -d destination -t threads -l lines
|
||||||
|
|
||||||
|
where:
|
||||||
|
`file.txt` is the file name/path with the list of URLs to be downloaded
|
||||||
|
`destination` is the path, to which the files should be downloaded
|
||||||
|
`threads` is the number of processes to be used to download the URLs simultaneously
|
||||||
|
`lines` is the number of lines to read from the file and process at a time. 0 leads to reading the whole file.
|
||||||
|
|
||||||
|
You may use the package in your own scripts by importing it:
|
||||||
|
|
||||||
|
import listdownloader
|
||||||
|
|
||||||
|
then you can download a list of files using:
|
||||||
|
|
||||||
|
listdownloader.download_files(URLs, destination, num_threads)
|
||||||
|
|
||||||
|
where:
|
||||||
|
`URLs` is a list of the URLs to be downloaded
|
||||||
|
`destination` is a string with the path, at which the files have to be saved
|
||||||
|
`num_threads` is the number of threads/processes to use for the download.
|
||||||
|
|
||||||
|
You can also download a single file using the function:
|
||||||
|
|
||||||
|
listdownloader.download_file(URL, destination)
|
||||||
|
|
||||||
|
License
|
||||||
|
-------
|
||||||
|
MPL
|
||||||
|
|
||||||
|
About
|
||||||
|
-----
|
||||||
|
This script was written by Samer Afach, samer@afach.de for test purposes.
|
33
bin/downloadlist.py
Normal file
33
bin/downloadlist.py
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
#!/usr/bin/python3

"""Command-line front end for listdownloader.

Reads a text file containing one URL per line and downloads everything to a
destination directory, either all at once or in chunks of -l/--lines lines.
"""

import listdownloader
import argparse
from itertools import islice

if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--file", type=str, dest="filepath", default="",
                        help="Text file that contains links to download, line by line", required=True)
    parser.add_argument("-d", "--destination", type=str, dest="dest", default="",
                        help="Destination path to download the files", required=True)
    parser.add_argument("-t", "--threads", type=int, dest="threads", default=0,
                        help="Number of threads/processes to be used")
    parser.add_argument("-l", "--lines", type=int, dest="numlines", default=0,
                        help="Number of lines to be read as a chunk. 0=Read the whole file.")
    args = parser.parse_args()

    if args.numlines <= 0:  # load the whole file
        with open(args.filepath) as f:
            file_lines = f.readlines()
            listdownloader.download_files(file_lines, args.dest, args.threads)

    else:  # load parts of the file
        with open(args.filepath, 'r') as infile:
            # islice yields at most `numlines` lines per chunk; an empty
            # chunk means end-of-file.  (Removed a leftover debug
            # print(file_lines) that dumped every chunk to stdout.)
            while True:
                file_lines = list(islice(infile, args.numlines))
                if file_lines:
                    listdownloader.download_files(file_lines, args.dest, args.threads)
                else:
                    break
|
1
listdownloader/__init__.py
Normal file
1
listdownloader/__init__.py
Normal file
@ -0,0 +1 @@
|
|||||||
|
from listdownloader.downloader import *
|
191
listdownloader/downloader.py
Normal file
191
listdownloader/downloader.py
Normal file
@ -0,0 +1,191 @@
|
|||||||
|
import urllib
|
||||||
|
import urllib.request
|
||||||
|
import urllib.error
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import hashlib
|
||||||
|
import random
|
||||||
|
import datetime
|
||||||
|
import multiprocessing as mp
|
||||||
|
import errno
|
||||||
|
import re
|
||||||
|
|
||||||
|
random.seed(datetime.datetime.now())
|
||||||
|
|
||||||
|
|
||||||
|
def mkdir_p(path):
    """
    Create a directory and any missing parent directories (like ``mkdir -p``).

    Does not raise if the directory already exists.

    :param path: directory path
    :return: None
    :raises OSError: on any failure other than the directory already existing
    """
    try:
        os.makedirs(path)
    except OSError as exc:
        # An existing *directory* is fine; anything else (permissions,
        # an existing regular file, ...) is a genuine error.
        if not (exc.errno == errno.EEXIST and os.path.isdir(path)):
            raise
|
||||||
|
|
||||||
|
|
||||||
|
def md5_file(file_name):
    """
    Compute the MD5 hex digest of a file's contents.

    The file is read in 4 KiB chunks so large files never have to fit in
    memory at once.

    :param file_name: path of the file to hash
    :return: md5 string (hex digest)
    """
    digest = hashlib.md5()
    with open(file_name, "rb") as handle:
        chunk = handle.read(4096)
        while chunk:
            digest.update(chunk)
            chunk = handle.read(4096)
    return digest.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def get_random_hash(bits=32):
    """
    Create a random lowercase hex string encoding ``bits`` random bits.

    The result is always zero-padded to its full width (``bits // 4`` hex
    digits), so e.g. ``bits=32`` always yields 8 characters.

    :param bits: number of bits of the hash; must be a multiple of 8
    :return: the random hex string
    """
    assert bits % 8 == 0
    # Zero-pad with a format spec instead of re-drawing until the random
    # value happens to be wide enough.  The old hex()/lstrip('0x') approach
    # recursed on short values and mangled the value 0 ('0x0' -> '').
    return format(random.getrandbits(bits), '0{}x'.format(bits // 4))
|
||||||
|
|
||||||
|
|
||||||
|
def files_are_same(file1, file2):
    """
    Check whether two paths contain identical file content.

    A cheap size comparison is done first; only when the sizes match is an
    MD5 checksum of each file computed and compared.

    :param file1: first file path
    :param file2: second file path
    :return: True if the files are the same, False otherwise
    """
    # Identical paths trivially refer to the same file.
    if file1 == file2:
        return True

    # Different sizes cannot be the same content; skip hashing entirely.
    if os.path.getsize(file1) != os.path.getsize(file2):
        return False

    return md5_file(file1) == md5_file(file2)
|
||||||
|
|
||||||
|
|
||||||
|
def rename_file_with_number(file_name, num):
    """
    Return the same file name, but with a number added before the extension.

    myfile.doc becomes myfile_1.doc or myfile_num.doc

    :param file_name: file to be renamed
    :param num: number to add to filename
    :return: renamed file name
    """
    # os.path.splitext keeps interior dots intact (the old split/"".join
    # dropped them, turning "a.b.c" into "ab_1.c") and copes with names
    # that have no extension at all.
    root, ext = os.path.splitext(file_name)
    return root + "_" + str(num) + ext
|
||||||
|
|
||||||
|
|
||||||
|
def download_file(url, to_dir):

    """
    A function that downloads a file to a specific directory.
    If the file already exists, it tries to add a number to the file name.
    If the same file exists, the file is not downloaded again.
    The check whether it's the same file is done through file size and md5 checksum
    :param url: url of the file to be downloaded
    :param to_dir: directory, to which the file should be downloaded
    :return: None
    """

    # Derive a local file name from the last path segment of the URL.
    file_name = os.path.normpath(url.split('/')[-1]).replace(" ","")
    file_name = re.sub('[^\w_.)( -]', '', file_name)  # remove invalid characters from filename
    try:
        with urllib.request.urlopen(url) as response:
            # NOTE(review): the whole response is buffered in memory, so RAM
            # use grows with the size of the downloaded file.
            data = response.read()
            info = response.info()
            # Content-Type subtype (e.g. "jpeg" for image/jpeg) is used as a
            # fallback extension when the name doesn't already end with it.
            ext = info.get_content_subtype()
            if len(file_name) < len(ext) or file_name[-len(ext):] != ext:  # add file extension if not present
                file_name += "." + ext
    except urllib.error.URLError:
        # Best-effort behavior: report the failure and skip this URL.
        sys.stderr.write("Skipping invalid URL, or possibly failed to get URL: " + url)
        return

    target_path = os.path.join(to_dir, file_name)
    # Write to a randomized temp name first so an existing file is never
    # clobbered before the duplicate check below has run.
    target_temp_path = os.path.join(to_dir, file_name + "_" + get_random_hash())

    with open(target_temp_path, 'wb') as f:
        f.write(data)

    # check if a file with the same name exists already
    if os.path.isfile(target_path):
        # if file exists, compare with the downloaded file
        if files_are_same(target_path, target_temp_path):
            # if it's the same file, just remove the temp file and return
            os.remove(target_temp_path)
            return
        else:
            # if it's not the same file, loop over new file names with numbers, and redo the file name check
            idx = 0
            while True:
                idx += 1
                num_file_name = rename_file_with_number(file_name, idx)
                target_path = os.path.join(to_dir, num_file_name)
                if os.path.isfile(target_path):
                    if files_are_same(target_path, target_temp_path):
                        # Numbered name holds the same content: drop the temp.
                        os.remove(target_temp_path)
                        return
                    # Numbered name exists with different content: try idx+1.
                else:
                    # Free numbered name found; fall through to the rename.
                    break

    # if the downloaded file will not overwrite anything, rename the temp file to its proper name
    os.rename(target_temp_path, target_path)
|
||||||
|
|
||||||
|
|
||||||
|
def _download_files(list_of_urls, to_dir):
    """
    Download a list of URLs to a directory one after the other (sequentially).

    :param list_of_urls: list of URLs to download
    :param to_dir: destination directory
    :return: None
    """
    for single_url in list_of_urls:
        download_file(single_url, to_dir)
|
||||||
|
|
||||||
|
|
||||||
|
def download_files(list_of_urls, to_dir, processes=0):
    """
    Downloads a list of urls in parallel if possible, otherwise sequentially
    :param list_of_urls: list of urls to download
    :param to_dir: destination directory
    :param processes: number of processes/threads; <= 0 means "use cpu_count()"
    :return: None
    """

    # clean spaces, tabs and new-lines
    list_of_urls = [line.replace(' ', '').replace('\n', '').replace('\t', '') for line in list_of_urls]
    # Create the destination directory on demand (mkdir -p semantics).
    if not os.path.isdir(to_dir):
        mkdir_p(to_dir)
    if processes <= 0:
        try:
            # Auto-detect parallelism; on success this deliberately falls
            # through to the pool code below.
            processes = mp.cpu_count()
        except NotImplementedError as e:
            sys.stderr.write("Unable to determine the number of CPUs for parallelization. Proceeding sequentially. "
                             "Consider inputting the number of CPUs manually.\n")
            _download_files(list_of_urls, to_dir)
            return
    elif processes == 1 or len(list_of_urls) == 1:
        # Nothing to parallelize: avoid the pool overhead entirely.
        _download_files(list_of_urls, to_dir)
        return
    elif processes > len(list_of_urls):
        # No point in having more workers than URLs.
        processes = len(list_of_urls)

    # One (url, destination) tuple per worker invocation.
    params = [(list_of_urls[i], to_dir) for i in range(len(list_of_urls))]
    pool = mp.Pool(processes)
    pool.starmap(download_file, params)
    pool.close()
    pool.join()
|
17
setup.py
Normal file
17
setup.py
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
from distutils.core import setup
import os

# Force distutils to copy files instead of hard-linking (works around
# failures on filesystems without hard-link support, e.g. VM shared
# folders).  Guarded so platforms that don't expose os.link at all
# don't crash with AttributeError on the unconditional `del`.
if hasattr(os, 'link'):
    del os.link

setup(
    name="listdownloader",
    version="0.1.0",
    author="Samer Afach",
    author_email="samer@afach.de",
    packages=["listdownloader"],
    include_package_data=True,
    url="https://git.afach.de/samerafach/ListDownloader",
    description="Downloads a list of files",
    install_requires=[],
    scripts=['bin/downloadlist.py']
)
|
Loading…
Reference in New Issue
Block a user