commit 3f9681b62101286bc599ab7bf0f77897c3bfa49a
Author: Samer Afach <samer@afach.de>
Date:   Sat Nov 12 23:25:42 2016 +0100

    First add for tested ListDownloader

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..170219e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,65 @@
+ListDownloader
+====================
+
+About
+--------------------
+This program takes a file with a list of URLs and a destination directory as arguments, and downloads the listed files sequentially or in parallel. The list can be loaded as a whole, or processed in parts of a given number of lines at a time. An option is also provided for the number of threads/processes to be used.
+
+Installation
+--------------------
+
+(Installation was prepared for and tested with Debian Jessie.)
+
+You can install the package with pip using
+
+    # pip3 install listdownloader
+
+Or you can create the installation package yourself from the source using
+
+    python3 setup.py sdist
+
+and then use pip to install the package that will be built in the directory `dist`:
+
+    # pip3 install listdownloader-x.y.z.tar.gz
+
+where x.y.z is the current version of the program.
+
+The program installs the package listdownloader and a script file for usage.
+
+Running the script and using the package
+----------------------------------------
+
+The script can be executed (globally) using:
+
+    $ downloadlist.py -f file.txt -d destination -t threads -l lines
+
+where:
+    `file.txt` is the name/path of the file with the list of URLs to be downloaded, one URL per line
+    `destination` is the path to which the files should be downloaded
+    `threads` is the number of processes used to download the URLs simultaneously
+    `lines` is the number of lines to read from the file and download as one chunk. 0 leads to reading the whole file.
+
+You may use the package in your own scripts by importing it:
+
+    import listdownloader
+
+then you can download a list of files using:
+
+    listdownloader.download_files(URLs, destination, num_threads)
+
+where:
+    `URLs` is a list of the URLs to be downloaded
+    `destination` is a string with the path where the files are to be saved
+    `num_threads` is the number of threads/processes to use for the download.
+
+You can also download a single file using the function:
+
+    listdownloader.download_file(URL, destination)
+
+License
+-------
+MPL
+
+Author
+------
+This script was written by Samer Afach, samer@afach.de for test purposes.
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..170219e
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,65 @@
+ListDownloader
+====================
+
+About
+--------------------
+This program takes a file with a list of URLs and a destination directory as arguments, and downloads the listed files sequentially or in parallel. The list can be loaded as a whole, or processed in parts of a given number of lines at a time. An option is also provided for the number of threads/processes to be used.
+
+Installation
+--------------------
+
+(Installation was prepared for and tested with Debian Jessie.)
+
+You can install the package with pip using
+
+    # pip3 install listdownloader
+
+Or you can create the installation package yourself from the source using
+
+    python3 setup.py sdist
+
+and then use pip to install the package that will be built in the directory `dist`:
+
+    # pip3 install listdownloader-x.y.z.tar.gz
+
+where x.y.z is the current version of the program.
+
+The program installs the package listdownloader and a script file for usage.
+
+Running the script and using the package
+----------------------------------------
+
+The script can be executed (globally) using:
+
+    $ downloadlist.py -f file.txt -d destination -t threads -l lines
+
+where:
+    `file.txt` is the name/path of the file with the list of URLs to be downloaded, one URL per line
+    `destination` is the path to which the files should be downloaded
+    `threads` is the number of processes used to download the URLs simultaneously
+    `lines` is the number of lines to read from the file and download as one chunk. 0 leads to reading the whole file.
+
+You may use the package in your own scripts by importing it:
+
+    import listdownloader
+
+then you can download a list of files using:
+
+    listdownloader.download_files(URLs, destination, num_threads)
+
+where:
+    `URLs` is a list of the URLs to be downloaded
+    `destination` is a string with the path where the files are to be saved
+    `num_threads` is the number of threads/processes to use for the download.
+
+You can also download a single file using the function:
+
+    listdownloader.download_file(URL, destination)
+
+License
+-------
+MPL
+
+Author
+------
+This script was written by Samer Afach, samer@afach.de for test purposes.
diff --git a/bin/downloadlist.py b/bin/downloadlist.py
new file mode 100644
index 0000000..caa7b34
--- /dev/null
+++ b/bin/downloadlist.py
@@ -0,0 +1,33 @@
+#!/usr/bin/python3
+
+import listdownloader
+import argparse
+from itertools import islice
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-f", "--file", type=str, dest="filepath", default="",
+                        help="Text file that contains links to download, line by line", required=True)
+    parser.add_argument("-d", "--destination", type=str, dest="dest", default="",
+                        help="Destination path to download the files", required=True)
+    parser.add_argument("-t", "--threads", type=int, dest="threads", default=0,
+                        help="Number of threads/processes to be used")
+    parser.add_argument("-l", "--lines", type=int, dest="numlines", default=0,
+                        help="Number of lines to be read as a chunk. 0=Read the whole file.")
+    args = parser.parse_args()
+
+    # islice(..., None) reads up to the end of the file, so a single loop serves both modes:
+    # the whole file at once (numlines <= 0) or successive chunks of numlines lines
+    chunk_size = args.numlines if args.numlines > 0 else None
+
+    with open(args.filepath, 'r') as infile:
+        while True:
+            file_lines = list(islice(infile, chunk_size))
+            if not file_lines:  # end of file reached
+                break
+            # blank lines are not URLs; they are filtered only after the end-of-file test,
+            # so that a chunk made of blank lines does not stop the loop early
+            urls = [line for line in file_lines if line.strip()]
+            if urls:
+                listdownloader.download_files(urls, args.dest, args.threads)
diff --git a/listdownloader/__init__.py b/listdownloader/__init__.py
new file mode 100644
index 0000000..3c3adf7
--- /dev/null
+++ b/listdownloader/__init__.py
@@ -0,0 +1 @@
+from listdownloader.downloader import *
diff --git a/listdownloader/downloader.py b/listdownloader/downloader.py
new file mode 100644
index 0000000..17b3258
--- /dev/null
+++ b/listdownloader/downloader.py
@@ -0,0 +1,191 @@
+import urllib
+import urllib.request
+import urllib.error
+import os
+import sys
+import hashlib
+import random
+import datetime
+import multiprocessing as mp
+import errno
+import re
+
+random.seed()  # seeds from OS entropy; passing a datetime object raises TypeError on Python >= 3.11
+
+
+def mkdir_p(path):
+    """
+    Create a directory together with any missing parent directories,
+    like the shell command ``mkdir -p``.
+    A directory that already exists is left untouched and is not an error.
+    :param path: directory path
+    :return: None
+    :raises OSError: if the directory cannot be created,
+        e.g. because path already exists but is not a directory
+    """
+    # exist_ok tolerates an existing *directory* only; an existing regular file
+    # still raises, and so does any other failure (permissions, read-only file system, ...)
+    os.makedirs(path, exist_ok=True)
+
+
+def md5_file(file_name):
+    """
+    Compute the MD5 checksum of a file, reading it in chunks of 4096 bytes
+    :param file_name: path of the file to be hashed
+    :return: the checksum as a hexadecimal string
+    """
+    digest = hashlib.md5()
+    with open(file_name, "rb") as stream:
+        for block in iter(lambda: stream.read(4096), b""):
+            digest.update(block)
+    return digest.hexdigest()
+
+
+def get_random_hash(bits=32):
+    """
+    Create a random hexadecimal string of exactly bits / 4 characters (leading zeros included).
+    The value comes from the OS entropy source, so forked worker processes cannot repeat each other.
+    :param bits: number of random bits; must be a positive multiple of 8
+    :return: the hash value as a lower-case hex string
+    :raises ValueError: if bits is not a positive multiple of 8
+    """
+    if bits <= 0 or bits % 8 != 0:
+        raise ValueError("bits must be a positive multiple of 8")
+    value = random.SystemRandom().getrandbits(bits)
+    # zero-pad to the full width instead of stripping the prefix, which also ate leading zeros
+    return "{:0{}x}".format(value, bits // 4)
+
+
+def files_are_same(file1, file2):
+    """
+    Tell whether two paths hold the same content.
+    Identical paths count as the same file without anything being read.
+    Otherwise the sizes are compared, and only if they match the MD5 checksums.
+    :param file1: path of the first file
+    :param file2: path of the second file
+    :return: True if the files are the same, False otherwise
+    """
+    # identical paths trivially refer to the same file
+    if file1 == file2:
+        return True
+
+    # a size mismatch settles the question without hashing the contents
+    if os.path.getsize(file1) != os.path.getsize(file2):
+        return False
+
+    # equal sizes: the checksums decide
+    checksums = [md5_file(path) for path in (file1, file2)]
+    return checksums[0] == checksums[1]
+
+
+def rename_file_with_number(file_name, num):
+    """
+    Returns the same file name, but with a number added in front of the extension.
+    myfile.doc becomes myfile_1.doc, my.file.doc becomes my.file_1.doc, myfile becomes myfile_1
+    :param file_name: file to be renamed
+    :param num: number to add to filename
+    :return: renamed file name
+    """
+    stem, extension = os.path.splitext(file_name)  # keeps dots inside the stem; extension may be empty
+    return stem + "_" + str(num) + extension
+
+
+def download_file(url, to_dir):
+
+    """
+    A function that downloads a file to a specific directory.
+    The file name is the last part of the url with invalid characters removed; the content
+    subtype reported by the server is appended as extension if the name does not end with it.
+    If the file already exists, it tries to add a number to the file name.
+    If the same file exists, the file is not downloaded again.
+    The check whether it's the same file is done through file size and md5 checksum
+    A url that is malformed or cannot be fetched is reported on stderr and skipped.
+    :param url: url of the file to be downloaded
+    :param to_dir: directory, to which the file should be downloaded; it has to exist already
+    :return: None
+    """
+
+    file_name = os.path.normpath(url.split('/')[-1]).replace(" ", "")
+    # remove invalid characters from filename
+    # (raw string, so that \w reaches the regex engine instead of being an invalid string escape)
+    file_name = re.sub(r'[^\w_.)( -]', '', file_name)
+    try:
+        with urllib.request.urlopen(url) as response:
+            data = response.read()
+            ext = response.info().get_content_subtype()
+    except (urllib.error.URLError, ValueError):
+        # URLError covers unreachable hosts and HTTP errors;
+        # ValueError is what urlopen raises for malformed urls such as an empty string
+        sys.stderr.write("Skipping invalid URL, or possibly failed to get URL: " + url + "\n")
+        return
+
+    # add file extension if not present; the dot is part of the comparison,
+    # so a name that merely ends with the same letters still gets its extension
+    if not file_name.endswith("." + ext):
+        file_name += "." + ext
+
+    # the data is written under a random temporary name first,
+    # so that it can be compared with files that are already in the directory
+    target_temp_path = os.path.join(to_dir, file_name + "_" + get_random_hash())
+    with open(target_temp_path, 'wb') as f:
+        f.write(data)
+
+    # look for the final name: the plain name first, then name_1, name_2, ...
+    # a candidate that already holds exactly the downloaded content ends the search:
+    # the temp file is removed and the existing file is kept
+    target_path = os.path.join(to_dir, file_name)
+    idx = 0
+    while os.path.isfile(target_path):
+        if files_are_same(target_path, target_temp_path):
+            os.remove(target_temp_path)
+            return
+        idx += 1
+        target_path = os.path.join(to_dir, rename_file_with_number(file_name, idx))
+
+    # no file with the chosen name existed when it was checked, so the temp file can take it
+    os.rename(target_temp_path, target_path)
+
+
+def _download_files(list_of_urls, to_dir):
+    """
+    Fetch every url of a list into a directory, one after the other
+    :param list_of_urls: urls to be downloaded, processed in list order
+    :param to_dir: directory that receives the files
+    :return: None
+    """
+    for address in list_of_urls:
+        download_file(address, to_dir)
+
+
+def download_files(list_of_urls, to_dir, processes=0):
+    """
+    Downloads a list of urls in parallel if possible, otherwise sequentially
+    Whitespace is removed from the urls; entries that are empty afterwards (blank lines) are skipped.
+    :param list_of_urls: list of urls to download
+    :param to_dir: destination directory; it is created if it does not exist
+    :param processes: number of processes/threads; 0 or less means one per CPU, never more than one per url
+    :return: None
+    """
+
+    # clean spaces, tabs, new-lines and carriage returns, then drop entries that became empty
+    list_of_urls = ["".join(line.split()) for line in list_of_urls]
+    list_of_urls = [url for url in list_of_urls if url]
+    if not os.path.isdir(to_dir):
+        mkdir_p(to_dir)
+    if not list_of_urls:
+        return  # nothing to download; a pool of 0 processes would raise ValueError
+    if processes <= 0:
+        try:
+            processes = mp.cpu_count()
+        except NotImplementedError:
+            sys.stderr.write("Unable to determine the number of CPUs for parallelization. Proceeding sequentially. "
+                             "Consider inputting the number of CPUs manually.\n")
+            processes = 1
+    processes = min(processes, len(list_of_urls))
+    if processes == 1:
+        _download_files(list_of_urls, to_dir)
+        return
+
+    # leaving the with-block shuts the workers down, also when a download raised
+    with mp.Pool(processes) as pool:
+        pool.starmap(download_file, [(url, to_dir) for url in list_of_urls])
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..e62a725
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,17 @@
+from distutils.core import setup
+import os 
+
+if hasattr(os, "link"):  # some platforms provide no os.link at all; a bare del would raise there
+    del os.link  # presumably a workaround so that sdist copies files instead of hard-linking them
+setup(
+    name="listdownloader",
+    version="0.1.0",
+    author="Samer Afach",
+    author_email="samer@afach.de",
+    packages=["listdownloader"],
+    include_package_data=True,
+    url="https://git.afach.de/samerafach/ListDownloader",
+    description="Downloads a list of files",
+    install_requires=[],
+    scripts=['bin/downloadlist.py']
+)