Source code for carpedm.data.download

# -*- coding: utf-8 -*-
# Copyright (C) 2018 Neal Digre.
#
# This software may be modified and distributed under the terms
# of the MIT license. See the LICENSE file for details.


"""Download scripts.

This module provides the interface for downloading raw datasets from
their source.

.. include:: ../../../DESCRIPTION.rst
    :start-after: machine learning researchers.
    :end-before: Though still in the early stages

Example:
    Data may be downloaded externally using the provided script:

    .. code-block:: bash

        $ download_data --data-dir <download/to/this/directory> --data-id pmjtc

.. note::

    If an expected data subdirectory already exists in the specified
    target ``data-dir``, that data will not be downloaded, even if the
    subdirectory is empty. This should be fixed in a future version.

Todo:

    * Update ``get_books_list`` once list is included in downloadables.
    * Check subdirectory contents.
    * Generalize download structure for other datasets.

..  _Dataset of Pre-Modern Japanese Text Character Shapes:
    http://codh.rois.ac.jp/char-shape/
"""
import argparse
import os
import zipfile


# Base URLs for datasets
_URLS = {
    'pmjtc': "http://codh.rois.ac.jp/char-shape/book/",
}
_BOOKS = {
    'pmjtc': [
        # "bib_id",  # num_page_images, types, tokens
        "200003076",  # 346, 1720, 63959
        "200003967",  # 88, 1119, 11197
        "200014740",  # 182, 1969, 44832
        "200021637",  # 37,  417,  4871
        "200021660",  # 185, 1758, 32525
        "200021712",  # 165,  843, 24480
        "200021763",  # 100,  704, 11397
        "200021802",  # 111,  560, 19575
        "200021851",  # 59,  430,  5599
        "200021853",  # 79,  595,  9046
        "200021869",  # 35,  330,  3003
        "200021925",  # 45,  693,  4259
        "200022050",  # 30,  255,  9545
        "brsk00000",  # 238, 2197, 75462
        "hnsd00000",  # 522, 1972, 83492
    ]
}
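
# Sketch (hypothetical, not in the original module): the Todo item on
# generalizing the download structure suggests a new dataset would be
# registered by adding parallel entries to ``_URLS`` and ``_BOOKS``,
# e.g. for an imaginary dataset identifier 'mydataset':
#
#     _URLS['mydataset'] = "http://example.org/mydataset/"
#     _BOOKS['mydataset'] = ["vol_one", "vol_two"]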


def get_books_list(dataset='pmjtc'):
    """Retrieve list of books/images in dataset.

    Args:
        dataset (str): Identifier for dataset for which to retrieve
            information.

    Returns:
        :obj:`list` of :obj:`str`: Names of dataset subdirectories
            and/or files.
    """
    return _BOOKS[dataset]
def maybe_download(directory, dataset='pmjtc'):
    """Download character dataset if its books are not in ``directory``.

    Args:
        directory (str): Directory where dataset is located or should
            be saved.
        dataset (str): Identifier for dataset to download.
    """
    from urllib.request import urlretrieve

    if not os.path.isdir(directory):
        os.makedirs(directory)
    for bib_id in get_books_list(dataset):
        if not os.path.exists(os.path.join(directory, bib_id)):
            print("Could not find %s in %s" % (bib_id, directory))
            filename = bib_id + '.zip'
            filepath = os.path.join(directory, filename)
            if not os.path.exists(filepath):
                # URLs always use forward slashes, so build the URL with
                # string formatting rather than os.path.join.
                url = "%s%s/%s" % (_URLS[dataset], bib_id, filename)
                print("Downloading %s to %s" % (url, filepath))
                # Download to a temporary name and rename on success, so
                # an interrupted transfer is not mistaken for a complete one.
                inprogress_filepath = filepath + ".incomplete"
                inprogress_filepath, _ = urlretrieve(url, inprogress_filepath)
                os.rename(inprogress_filepath, filepath)
                statinfo = os.stat(filepath)
                print("Successfully downloaded %s, %s bytes."
                      % (filename, statinfo.st_size))
            unzip_dir = os.path.join(directory, bib_id)
            if not os.path.exists(unzip_dir):
                print("Unzipping files...", end='', flush=True)
                zipfile.ZipFile(filepath, "r").extractall(directory)
                print("done.")
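
# A minimal sketch (hypothetical helper, not in the original module) of
# the "check subdirectory contents" Todo: treat an empty book directory
# as missing so it triggers a re-download, instead of the bare
# os.path.exists() check used in maybe_download above.
def _book_present(directory, bib_id):
    """Return True only if the book's subdirectory exists and is non-empty."""
    book_dir = os.path.join(directory, bib_id)
    return os.path.isdir(book_dir) and len(os.listdir(book_dir)) > 0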
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--data-dir', type=str, required=True,
                        help="Directory to which dataset books will be "
                             "downloaded if not already present.")
    parser.add_argument('-i', '--data-id', type=str, default='pmjtc',
                        choices=['pmjtc'],
                        help="Identifier for dataset to download.")
    args = parser.parse_args()
    maybe_download(args.data_dir, args.data_id)
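
# Entry-point guard (an assumption: the installed ``download_data``
# console script presumably calls main() directly, but this also allows
# running the module as a script, e.g.
# ``python -m carpedm.data.download --data-dir <download/to/this/directory>``).
if __name__ == '__main__':
    main()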