Module topicnet.dataset_manager.api
Expand source code
import gzip
import os
import pandas as pd
import shutil
import ssl
import sys
import urllib.parse
from glob import glob
from tqdm import tqdm
from urllib.request import (
Request,
urlopen,
)
from ..cooking_machine.dataset import Dataset
_SERVER_URL = 'https://topicnet-datasets.mil-team.ru'
_ARCHIVE_EXTENSION = '.gz'
_DEFAULT_DATASET_FILE_EXTENSION = '.csv'
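# Datasets are served as gzip archives; after download an archive is unpacked
# next to this module, and the server reports the unpacked file's extension
# (CSV by default).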
def get_info() -> str:
"""
Gets info about all datasets.
Returns
-------
    str with Markdown syntax
Examples
--------
    As the return value is Markdown text,
in Jupyter Notebook one may do the following
to format the output information nicely
>>> from IPython.display import Markdown
...
>>> Markdown(get_info())
"""
req = Request(_SERVER_URL + '/info')
    # NB: certificate verification is disabled for the dataset server
    context = ssl._create_unverified_context()
with urlopen(req, context=context) as response:
return response.read().decode('utf-8')
def load_dataset(dataset_name: str, **kwargs) -> Dataset:
"""
    Load the dataset named ``dataset_name``.
    Run ``get_info()`` for information about the available datasets.
Parameters
----------
dataset_name: str
dataset name for download
    Other Parameters
    ----------------
kwargs
optional properties of
:class:`~topicnet.cooking_machine.Dataset`
"""
dataset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), dataset_name)
try:
print(f'Checking if dataset "{dataset_name}" was already downloaded before')
saved_dataset = _init_dataset_if_downloaded(dataset_path, **kwargs)
except FileNotFoundError:
print(f'Dataset "{dataset_name}" not found on the machine')
else:
print(
f'Dataset is found on the machine.'
f' Save path is: "{saved_dataset._data_path}"'
)
return saved_dataset
req = Request(_SERVER_URL + '/download')
context = ssl._create_unverified_context()
values = {'dataset-name': dataset_name}
data = urllib.parse.urlencode(values).encode("utf-8")
print(f'Downloading the "{dataset_name}" dataset...')
save_path = None
try:
with urlopen(req, data=data, context=context) as answer:
total_size = int(answer.headers.get('content-length', 0))
block_size = 1024
            # fall back to the default extension if the server omits the header
            file_extension = answer.getheader('file-extension') or _DEFAULT_DATASET_FILE_EXTENSION
            save_path = dataset_path + file_extension
t = tqdm(total=total_size, unit='iB', unit_scale=True, file=sys.stdout)
with open(save_path + _ARCHIVE_EXTENSION, 'wb') as f:
while True:
chunk = answer.read(block_size)
if not chunk:
break
t.update(len(chunk))
f.write(chunk)
t.close()
if total_size != 0 and t.n != total_size:
raise RuntimeError(
"Failed to download the dataset!"
" Some data was lost during network transfer"
)
with gzip.open(save_path + _ARCHIVE_EXTENSION, 'rb') as file_in, open(save_path, 'wb') as file_out: # noqa E501
# more memory-efficient than plain file_in.read()
shutil.copyfileobj(file_in, file_out)
print(f'Dataset downloaded! Save path is: "{save_path}"')
return Dataset(save_path, **kwargs)
    except Exception:
        # remove a partially extracted dataset file, so that a later call
        # does not mistake it for a valid saved dataset
        if save_path is not None and os.path.isfile(save_path):
            os.remove(save_path)
        raise
    finally:
        # the archive is not needed once unpacked (or once the download fails)
        if save_path is not None and os.path.isfile(save_path + _ARCHIVE_EXTENSION):
            os.remove(save_path + _ARCHIVE_EXTENSION)
def _init_dataset_if_downloaded(dataset_path: str, **kwargs) -> Dataset:
saved_dataset_path_candidates = [
p for p in glob(dataset_path + '.*')
if os.path.isfile(p) and not p.endswith(_ARCHIVE_EXTENSION)
]
dataset = None
if len(saved_dataset_path_candidates) > 0:
saved_dataset_path = saved_dataset_path_candidates[0]
try:
dataset = Dataset(saved_dataset_path, **kwargs)
except pd.errors.EmptyDataError:
os.remove(saved_dataset_path)
if dataset is None:
raise FileNotFoundError()
return dataset
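A typical end-to-end session with this module might look as follows. This is a minimal sketch, not part of the module itself: the dataset name 'some_dataset' is a placeholder (run get_info() for the actual list), and keep_in_memory stands in for any optional Dataset property and is assumed to be supported by the installed TopicNet version.

from topicnet.dataset_manager.api import get_info, load_dataset

# Inspect the catalog first; the result is Markdown text.
print(get_info())

# 'some_dataset' is a placeholder name; extra keyword arguments
# (here the assumed keep_in_memory flag) are passed to the Dataset constructor.
dataset = load_dataset('some_dataset', keep_in_memory=True)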
Functions
def get_info() ‑> str

    Gets info about all datasets.

    Returns
    -------
    str with Markdown syntax

    Examples
    --------
    As the return value is Markdown text, in Jupyter Notebook one may
    do the following to format the output information nicely:

    >>> from IPython.display import Markdown
    ...
    >>> Markdown(get_info())
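    Outside Jupyter, the returned Markdown can simply be printed as plain text:

    >>> print(get_info())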
def load_dataset(dataset_name: str, **kwargs) ‑> Dataset

    Load the dataset named dataset_name.
    Run get_info() for information about the available datasets.

    Parameters
    ----------
    dataset_name : str
        dataset name for download

    Other Parameters
    ----------------
    kwargs
        optional properties of :class:`~topicnet.cooking_machine.Dataset`
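    Note that downloaded datasets are cached next to the module file, so
    repeating a call with the same name reuses the local copy instead of
    downloading it again ('some_dataset' is again a placeholder name):

    >>> dataset = load_dataset('some_dataset')  # downloads and unpacks the archive
    >>> dataset = load_dataset('some_dataset')  # found on the machine, no download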