Module topicnet.cooking_machine.models.dummy_topic_model
Expand source code
import artm
import json
import os
import re
import warnings
from ..dataset import Dataset
from .topic_model import TopicModel
class InvalidOperationError(RuntimeError):
def __init__(self, message='Dummy model can\'t do this'):
super().__init__(message)
SIMPLE_ARTM_MODEL = artm.ARTM(num_topics=1, num_processors=1)
JSON_KEY_REGULARIZERS = 'regularizers'
JSON_KEY_CLASS_IDS = 'class_ids'
WARNING_ALREADY_DUMMY = 'Already dummy'
class DummyTopicModel(TopicModel):
_dummy_attribute = '_is_dummy'
def __init__(self,
scores,
init_parameters=None,
model_id=None,
parent_model_id=None,
description=None,
experiment=None,
save_path=None,
*args,
**kwargs):
"""
Notes
-----
Only TopicModel supposed to be able to create DummyTopicModel
("private" < access < "public")
"""
super().__init__(
artm_model=SIMPLE_ARTM_MODEL,
model_id=model_id,
parent_model_id=parent_model_id,
description=description,
experiment=experiment,
**kwargs,
)
self._model.dispose()
self._save_folder_path = save_path
self._model = _DummyArtmModel(self._save_folder_path)
self._init_parameters = init_parameters
self._scores = scores
setattr(self, DummyTopicModel._dummy_attribute, True)
def __getattr__(self, name):
# Don't redirect the stuff to artm_model (as TopicModel does)
if name in self._init_parameters:
return self._init_parameters[name]
raise AttributeError(f'Dummy model has no attribute "{name}"')
def get_init_parameters(self, not_include=None):
""""""
return self._init_parameters
@property
def scores(self):
""""""
return self._scores
@property
def regularizers(self):
""""""
return self._model.regularizers
@property
def class_ids(self):
""""""
return self._model.class_ids
@property
def _save_path(self):
return self._save_folder_path
@_save_path.setter
def _save_path(self, path):
self._save_folder_path = path
self._model._save_folder_path = path
def save(self, model_save_path=None, **kwargs):
""""""
# kwargs - for compatibility with super()'s method
# TODO: a bit copy-paste from TopicModel:
# can't call super()'s, because artm_model is being saved by default there
self._save_path = model_save_path or self.model_default_save_path
if not os.path.exists(self._save_path):
os.makedirs(self._save_path)
self.save_parameters(self._save_path)
@staticmethod
def load(path, experiment=None):
""""""
params = json.load(open(os.path.join(path, 'params.json'), 'r'))
model = DummyTopicModel(**params)
model.experiment = experiment
model._save_path = path
model._scores_wrapper._score_caches = params['scores']
return model
def restore(self, dataset: Dataset = None):
"""Restores dummy to original TopicModel
Tries to load the data from drive (if model was saved).
Otherwise tries to train the model using parent model, experiment and dataset.
Parameters
----------
dataset : Dataset
Dataset on which the model was trained.
If the original model was saved to drive, the parameter won't be used.
If not, dataset should be provided for training.
Returns
-------
TopicModel
Restored topic model
"""
# Not in-place, as TopicModel's make_dummy() because (seems like) TopicModel can be empty
# But it would be really strange if DummyTopicModel actually had all the stuff inside
topic_model = None
if self._save_path is not None:
topic_model = self._load_original_model()
if topic_model is None:
topic_model = self._train_to_original_model(dataset)
return topic_model
def to_dummy(self, save_to_drive=True, save_path=None, **kwargs):
warnings.warn(WARNING_ALREADY_DUMMY, UserWarning)
if save_to_drive:
self.save(save_path, **kwargs)
return self
def make_dummy(self, save_to_drive=True, save_path=None, **kwargs):
warnings.warn(WARNING_ALREADY_DUMMY, UserWarning)
if save_to_drive:
self.save(save_path, **kwargs)
def _load_original_model(self):
# TODO: custom_scores not restored currently
# modify model's save()-load() methods?
topic_model = None
try:
topic_model = super().load(
self._save_path,
self.experiment
)
except FileNotFoundError as e:
warnings.warn(f'Failed to read data from drive: "{e.args}"')
return topic_model
def _train_to_original_model(self, dataset: Dataset):
# TODO: refactor: big bunch of code, a lot of obscure and highly-likely-fo-fail places
# (parsing params, connecting one params with other params, restoring cube, running cube)
if len(self.description) == 0:
raise RuntimeError(
'Dummy model has empty description. So seems like nothing to restore'
)
if self.parent_model_id is None:
raise ValueError(
'Dummy model has no parent. Can\'t restore model in such a case'
)
if self.parent_model_id not in self.experiment.models:
raise ValueError(
f'Parent model "{self.parent_model_id}" not found in models '
f'associated with the experiment'
)
if dataset is None:
raise ValueError('Can\'t restore the model via training without dataset')
parent_model = self.experiment.models[self.parent_model_id]
if hasattr(parent_model, DummyTopicModel._dummy_attribute):
assert hasattr(parent_model, 'restore')
parent_model.restore(True, dataset) # also restore in experiment.models
delattr(parent_model, DummyTopicModel._dummy_attribute)
last_cube_description = self.description[-1]
# {
# 'action': 'reg_modifier',
# 'num_iter': 1,
# 'params': <some string with some description of regularizers>
# }
#
# Example of 'params' (it is string):
# "([<artm.regularizers.SmoothSparseThetaRegularizer object at 0x7faba8363ac8>,
# 'tau', 10.0],)"
# Currently need to parse the string with params
cube_parameters_from_description = last_cube_description['params']
cube_parameters_from_description = re.findall(
'\\[.*?\\]',
cube_parameters_from_description
)
cube_parameters_from_description = list(map(
lambda p: p[1:-1].split(', '),
cube_parameters_from_description
))
cube_parameters_from_description = list(map(
lambda p: dict(zip(['object', 'field', 'value'], p)),
cube_parameters_from_description
))
assert len(self.experiment.cubes) >= len(self.description)
last_cube_parameters = self.experiment.cubes[len(self.description) - 1]
# {
# 'action': 'reg_modifier',
# 'params': [
# {
# 'tau_grid': [0, 0.0],
# 'regularizer': { 'name': 'smooth_theta_bcg', 'tau': 1, ... }
# },
# ...
# ],
# 'cube': <Cube object>
# }
# For some reason some cubes seemed to not have this 'cube' parameter
# and not just the first two cubes
assert 'cube' in last_cube_parameters
cube = last_cube_parameters['cube']
# Example of cube.parameters:
# [
# { 'object': <Regularizer object>, 'field': 'tau', 'values': [0, 0.0] }
# ]
# TODO: assume order in cube.parameters is the same as in self.description[-1]['params]
# otherwise need to sort both lists?
for i in range(len(cube.parameters)):
assert str(cube.parameters[i]['object']) == \
cube_parameters_from_description[i]['object']
# one is object, another is string
cube.parameters[i]['values'] = float(
cube_parameters_from_description[i]['value']
)
cube_parameters_for_apply = list(
map(lambda p: list(p.values()), cube.parameters)
)
being_restored_model = cube.apply(
parent_model,
cube_parameters_for_apply,
dataset.get_dictionary()
)
being_restored_model._fit(
dataset_trainable=dataset.get_batch_vectorizer(),
num_iterations=cube.num_iter
)
model_cube = {
'action': cube.action,
'num_iter': cube.num_iter,
'params': repr(tuple(cube_parameters_for_apply)) # trying to make it look like before
}
being_restored_model.add_cube(model_cube) # restoring description
being_restored_model._model_id = self.model_id # using private field
return being_restored_model
class _DummyArtmModel:
def __init__(self, save_folder_path):
self.master = None
self._save_folder_path = save_folder_path
self._artm_params = None
def __getattr__(self, attr):
raise AttributeError(f'Dummy ARTM model doesn\'t have such attribute "{attr}"')
def dispose(self):
pass
@property
def regularizers(self):
""" """
assert JSON_KEY_REGULARIZERS in self._artm_parameters
return self._artm_parameters[JSON_KEY_REGULARIZERS]
@property
def class_ids(self):
""" """
assert JSON_KEY_CLASS_IDS in self._artm_parameters
return self._artm_parameters[JSON_KEY_CLASS_IDS]
def _load_artm_parameters(self):
if self._save_folder_path is None:
raise ValueError('Model has never been saved. Can\'t load parameters')
artm_parameters_file_path = os.path.join(
self._save_folder_path,
'model', # TODO: need some const-s for these names
'parameters.json'
)
if not os.path.isfile(artm_parameters_file_path):
raise FileNotFoundError(
f'File with artm model parameters not found on path "{artm_parameters_file_path}"')
return json.loads(
open(artm_parameters_file_path, 'r').read()
)
@property
def _artm_parameters(self):
if self._artm_params is None:
self._artm_params = self._load_artm_parameters()
return self._artm_params
def _fit(self, dataset_trainable, num_iterations):
raise InvalidOperationError()
def get_jsonable_from_parameters(self):
raise InvalidOperationError()
def clone(self):
raise InvalidOperationError()
def get_phi(self, *args, **kwargs):
raise InvalidOperationError()
def get_phi_dense(self, *args, **kwargs):
raise InvalidOperationError()
def get_phi_sparse(self, *args, **kwargs):
raise InvalidOperationError()
def get_theta(self, *args, **kwargs):
raise InvalidOperationError()
def add_cube(self, cube):
raise InvalidOperationError()
def describe_regularizers(self):
raise InvalidOperationError()
Classes
class DummyTopicModel (scores, init_parameters=None, model_id=None, parent_model_id=None, description=None, experiment=None, save_path=None, *args, **kwargs)
-
Topic Model contains artm model and all necessary information: scores, training pipeline, etc.
Notes
Only TopicModel supposed to be able to create DummyTopicModel ("private" < access < "public")
Expand source code
class DummyTopicModel(TopicModel): _dummy_attribute = '_is_dummy' def __init__(self, scores, init_parameters=None, model_id=None, parent_model_id=None, description=None, experiment=None, save_path=None, *args, **kwargs): """ Notes ----- Only TopicModel supposed to be able to create DummyTopicModel ("private" < access < "public") """ super().__init__( artm_model=SIMPLE_ARTM_MODEL, model_id=model_id, parent_model_id=parent_model_id, description=description, experiment=experiment, **kwargs, ) self._model.dispose() self._save_folder_path = save_path self._model = _DummyArtmModel(self._save_folder_path) self._init_parameters = init_parameters self._scores = scores setattr(self, DummyTopicModel._dummy_attribute, True) def __getattr__(self, name): # Don't redirect the stuff to artm_model (as TopicModel does) if name in self._init_parameters: return self._init_parameters[name] raise AttributeError(f'Dummy model has no attribute "{name}"') def get_init_parameters(self, not_include=None): """""" return self._init_parameters @property def scores(self): """""" return self._scores @property def regularizers(self): """""" return self._model.regularizers @property def class_ids(self): """""" return self._model.class_ids @property def _save_path(self): return self._save_folder_path @_save_path.setter def _save_path(self, path): self._save_folder_path = path self._model._save_folder_path = path def save(self, model_save_path=None, **kwargs): """""" # kwargs - for compatibility with super()'s method # TODO: a bit copy-paste from TopicModel: # can't call super()'s, because artm_model is being saved by default there self._save_path = model_save_path or self.model_default_save_path if not os.path.exists(self._save_path): os.makedirs(self._save_path) self.save_parameters(self._save_path) @staticmethod def load(path, experiment=None): """""" params = json.load(open(os.path.join(path, 'params.json'), 'r')) model = DummyTopicModel(**params) model.experiment = experiment model._save_path = path model._scores_wrapper._score_caches = params['scores'] return model def restore(self, dataset: Dataset = None): """Restores dummy to original TopicModel Tries to load the data from drive (if model was saved). Otherwise tries to train the model using parent model, experiment and dataset. Parameters ---------- dataset : Dataset Dataset on which the model was trained. If the original model was saved to drive, the parameter won't be used. If not, dataset should be provided for training. Returns ------- TopicModel Restored topic model """ # Not in-place, as TopicModel's make_dummy() because (seems like) TopicModel can be empty # But it would be really strange if DummyTopicModel actually had all the stuff inside topic_model = None if self._save_path is not None: topic_model = self._load_original_model() if topic_model is None: topic_model = self._train_to_original_model(dataset) return topic_model def to_dummy(self, save_to_drive=True, save_path=None, **kwargs): warnings.warn(WARNING_ALREADY_DUMMY, UserWarning) if save_to_drive: self.save(save_path, **kwargs) return self def make_dummy(self, save_to_drive=True, save_path=None, **kwargs): warnings.warn(WARNING_ALREADY_DUMMY, UserWarning) if save_to_drive: self.save(save_path, **kwargs) def _load_original_model(self): # TODO: custom_scores not restored currently # modify model's save()-load() methods? topic_model = None try: topic_model = super().load( self._save_path, self.experiment ) except FileNotFoundError as e: warnings.warn(f'Failed to read data from drive: "{e.args}"') return topic_model def _train_to_original_model(self, dataset: Dataset): # TODO: refactor: big bunch of code, a lot of obscure and highly-likely-fo-fail places # (parsing params, connecting one params with other params, restoring cube, running cube) if len(self.description) == 0: raise RuntimeError( 'Dummy model has empty description. So seems like nothing to restore' ) if self.parent_model_id is None: raise ValueError( 'Dummy model has no parent. Can\'t restore model in such a case' ) if self.parent_model_id not in self.experiment.models: raise ValueError( f'Parent model "{self.parent_model_id}" not found in models ' f'associated with the experiment' ) if dataset is None: raise ValueError('Can\'t restore the model via training without dataset') parent_model = self.experiment.models[self.parent_model_id] if hasattr(parent_model, DummyTopicModel._dummy_attribute): assert hasattr(parent_model, 'restore') parent_model.restore(True, dataset) # also restore in experiment.models delattr(parent_model, DummyTopicModel._dummy_attribute) last_cube_description = self.description[-1] # { # 'action': 'reg_modifier', # 'num_iter': 1, # 'params': <some string with some description of regularizers> # } # # Example of 'params' (it is string): # "([<artm.regularizers.SmoothSparseThetaRegularizer object at 0x7faba8363ac8>, # 'tau', 10.0],)" # Currently need to parse the string with params cube_parameters_from_description = last_cube_description['params'] cube_parameters_from_description = re.findall( '\\[.*?\\]', cube_parameters_from_description ) cube_parameters_from_description = list(map( lambda p: p[1:-1].split(', '), cube_parameters_from_description )) cube_parameters_from_description = list(map( lambda p: dict(zip(['object', 'field', 'value'], p)), cube_parameters_from_description )) assert len(self.experiment.cubes) >= len(self.description) last_cube_parameters = self.experiment.cubes[len(self.description) - 1] # { # 'action': 'reg_modifier', # 'params': [ # { # 'tau_grid': [0, 0.0], # 'regularizer': { 'name': 'smooth_theta_bcg', 'tau': 1, ... } # }, # ... # ], # 'cube': <Cube object> # } # For some reason some cubes seemed to not have this 'cube' parameter # and not just the first two cubes assert 'cube' in last_cube_parameters cube = last_cube_parameters['cube'] # Example of cube.parameters: # [ # { 'object': <Regularizer object>, 'field': 'tau', 'values': [0, 0.0] } # ] # TODO: assume order in cube.parameters is the same as in self.description[-1]['params] # otherwise need to sort both lists? for i in range(len(cube.parameters)): assert str(cube.parameters[i]['object']) == \ cube_parameters_from_description[i]['object'] # one is object, another is string cube.parameters[i]['values'] = float( cube_parameters_from_description[i]['value'] ) cube_parameters_for_apply = list( map(lambda p: list(p.values()), cube.parameters) ) being_restored_model = cube.apply( parent_model, cube_parameters_for_apply, dataset.get_dictionary() ) being_restored_model._fit( dataset_trainable=dataset.get_batch_vectorizer(), num_iterations=cube.num_iter ) model_cube = { 'action': cube.action, 'num_iter': cube.num_iter, 'params': repr(tuple(cube_parameters_for_apply)) # trying to make it look like before } being_restored_model.add_cube(model_cube) # restoring description being_restored_model._model_id = self.model_id # using private field return being_restored_model
Ancestors
Instance variables
var class_ids
-
Expand source code
@property def class_ids(self): """""" return self._model.class_ids
Methods
def get_init_parameters(self, not_include=None)
-
Expand source code
def get_init_parameters(self, not_include=None): """""" return self._init_parameters
def restore(self, dataset: Dataset = None)
-
Restores dummy to original TopicModel
Tries to load the data from drive (if model was saved). Otherwise tries to train the model using parent model, experiment and dataset.
Parameters
dataset
:Dataset
- Dataset on which the model was trained. If the original model was saved to drive, the parameter won't be used. If not, dataset should be provided for training.
Returns
TopicModel
- Restored topic model
Expand source code
def restore(self, dataset: Dataset = None): """Restores dummy to original TopicModel Tries to load the data from drive (if model was saved). Otherwise tries to train the model using parent model, experiment and dataset. Parameters ---------- dataset : Dataset Dataset on which the model was trained. If the original model was saved to drive, the parameter won't be used. If not, dataset should be provided for training. Returns ------- TopicModel Restored topic model """ # Not in-place, as TopicModel's make_dummy() because (seems like) TopicModel can be empty # But it would be really strange if DummyTopicModel actually had all the stuff inside topic_model = None if self._save_path is not None: topic_model = self._load_original_model() if topic_model is None: topic_model = self._train_to_original_model(dataset) return topic_model
Inherited members
class InvalidOperationError (message="Dummy model can't do this")
-
Unspecified run-time error.
Expand source code
class InvalidOperationError(RuntimeError): def __init__(self, message='Dummy model can\'t do this'): super().__init__(message)
Ancestors
- builtins.RuntimeError
- builtins.Exception
- builtins.BaseException