# Source code for carpedm.data.download

# -*- coding: utf-8 -*-
# Copyright (C) 2018 Neal Digre.
#
# This software may be modified and distributed under the terms
# of the MIT license. See the LICENSE file for details.


"""Download scripts.

This module provides the interface for downloading raw datasets from
their source.

.. include:: ../../../DESCRIPTION.rst
    :start-after: machine learning researchers.
    :end-before: Though still in the early stages

Example:
    Data may be downloaded externally using the provided script:

    .. code-block:: bash

        $ download_data --data-dir <download/to/this/directory> --data-id pmjtc

.. note::

    If an expected data subdirectory already exists in the
    specified target ``data-dir``, that data will not be downloaded, even
    if the subdirectory is empty. This should be fixed in a future
    version.

Todo:

    * Update ``get_books_list`` once list is included in downloadables.
    * Check subdirectory contents.
    * Generalize download structure for other datasets.

..  _Dataset of Pre-Modern Japanese Text Character Shapes:
    http://codh.rois.ac.jp/char-shape/
"""
import argparse
import os
import zipfile


# Root download URL of each supported dataset, keyed by dataset ID.
_URLS = {"pmjtc": "http://codh.rois.ac.jp/char-shape/book/"}

# Book IDs (``bib_id``) belonging to each dataset, keyed by dataset ID.
# The trailing comment on each entry lists, in order:
# num_page_images, types, tokens.
_BOOKS = {
    "pmjtc": [
        "200003076",  # 346, 1720, 63959
        "200003967",  # 88, 1119, 11197
        "200014740",  # 182, 1969, 44832
        "200021637",  # 37,  417,  4871
        "200021660",  # 185, 1758, 32525
        "200021712",  # 165,  843, 24480
        "200021763",  # 100,  704, 11397
        "200021802",  # 111,  560, 19575
        "200021851",  # 59,  430,  5599
        "200021853",  # 79,  595,  9046
        "200021869",  # 35,  330,  3003
        "200021925",  # 45,  693,  4259
        "200022050",  # 30,  255,  9545
        "brsk00000",  # 238, 2197, 75462
        "hnsd00000",  # 522, 1972, 83492
    ],
}


def get_books_list(dataset='pmjtc'):
    """Retrieve list of books/images in dataset.

    Args:
        dataset (str): Identifier for dataset for which to retrieve
            information.

    Returns:
        :obj:`list` of :obj:`str`: Names of dataset subdirectories
        and/or files. A new list is returned on every call, so the
        caller may modify it freely.

    Raises:
        KeyError: If ``dataset`` is not a known dataset identifier.
    """
    # Return a copy so callers cannot accidentally mutate the
    # module-level table of books.
    return list(_BOOKS[dataset])


def maybe_download(directory, dataset='pmjtc'):
    """Download and unpack the books of a dataset missing from a directory.

    Every book listed for ``dataset`` is expected to live in a
    subdirectory of ``directory`` named after the book's ID. For each
    book whose subdirectory is absent, the book's zip archive is
    downloaded into ``directory`` (unless the archive is already there)
    and then extracted into ``directory``.

    Args:
        directory (str): Directory where dataset is located or
            should be saved. It is created if it does not exist.
        dataset (str): Identifier for dataset to download.

    Raises:
        KeyError: If ``dataset`` is not a known dataset identifier.
    """
    from urllib.request import urlretrieve

    # ``exist_ok`` avoids the check-then-create race of a separate
    # ``isdir`` test.
    os.makedirs(directory, exist_ok=True)

    for bib_id in get_books_list(dataset):
        book_dir = os.path.join(directory, bib_id)
        if os.path.exists(book_dir):
            continue

        print("Could not find %s in %s" % (bib_id, directory))
        filename = bib_id + '.zip'
        filepath = os.path.join(directory, filename)

        # Look for the archive where it would have been saved, not in
        # the current working directory.
        if not os.path.exists(filepath):
            # URLs always use forward slashes, so build the address
            # explicitly rather than with the platform path separator.
            url = "%s/%s/%s" % (_URLS[dataset].rstrip('/'), bib_id, filename)
            print("Downloading %s to %s" % (url, filepath))

            # Download under a temporary name so an interrupted transfer
            # is never mistaken for a complete archive.
            inprogress_filepath = filepath + ".incomplete"
            inprogress_filepath, _ = urlretrieve(url, inprogress_filepath)
            os.replace(inprogress_filepath, filepath)

            statinfo = os.stat(filepath)
            print("Successfully downloaded %s, %s bytes." %
                  (filename, statinfo.st_size))

        print("Unzipping files...", end='', flush=True)
        # The context manager closes the archive handle after extraction.
        with zipfile.ZipFile(filepath, "r") as archive:
            archive.extractall(directory)
        print("done.")


def main():
    """Command-line entry point: parse options and fetch the dataset.

    Reads ``--data-dir`` (required) and ``--data-id`` (defaults to
    ``pmjtc``) from the command line and hands them to
    ``maybe_download``.
    """
    # Kept verbatim, including its embedded indentation.
    data_dir_help = """\
                        Directory to which dataset books will be 
                        downloaded if not already present.\
                        """

    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-d', '--data-dir',
        type=str,
        required=True,
        help=data_dir_help,
    )
    cli.add_argument(
        '-i', '--data-id',
        type=str,
        default='pmjtc',
        choices=['pmjtc'],
        help="Identifier for dataset to download.",
    )

    options = cli.parse_args()
    maybe_download(options.data_dir, options.data_id)