Module topicnet.cooking_machine.model_constructor
import warnings

from typing import (
    Dict,
    List,
    Union,
)

import artm

from .dataset import Dataset
from .rel_toolbox_lite import (
    count_vocab_size,
    modality_weight_rel2abs,
)


def add_standard_scores(
        model: artm.ARTM,
        dictionary: artm.Dictionary = None,
        main_modality: str = "@lemmatized",
        all_modalities: List[str] = ("@lemmatized", "@ngramms")
) -> None:
    """
    Adds standard scores for the model.

    Parameters
    ----------
    model
    dictionary
        Obsolete parameter, not used
    main_modality
    all_modalities

    """
    assert main_modality in all_modalities, "main_modality must be part of all_modalities"

    if dictionary is not None:
        warnings.warn(
            'Parameter `dictionary` is obsolete:'
            ' it is not used in the function "add_standard_scores"!'
        )

    model.scores.add(
        artm.scores.PerplexityScore(
            name='PerplexityScore@all',
            class_ids=all_modalities,
        )
    )
    model.scores.add(
        artm.scores.SparsityThetaScore(name='SparsityThetaScore')
    )

    for modality in all_modalities:
        model.scores.add(
            artm.scores.SparsityPhiScore(
                name=f'SparsityPhiScore{modality}',
                class_id=modality,
            )
        )
        model.scores.add(
            artm.scores.PerplexityScore(
                name=f'PerplexityScore{modality}',
                class_ids=[modality],
            )
        )
        model.scores.add(
            artm.TopicKernelScore(
                name=f'TopicKernel{modality}',
                probability_mass_threshold=0.3,
                class_id=modality,
            )
        )


def init_model(topic_names, seed=None, class_ids=None):
    """
    Creates a basic artm model
    """
    model = artm.ARTM(
        topic_names=topic_names,
        # Commented out for performance; uncomment if you run into zombie-process issues
        # num_processors=3,
        theta_columns_naming='title',
        show_progress_bars=False,
        class_ids=class_ids,
        seed=seed
    )

    return model


def create_default_topics(specific_topics, background_topics):
    """
    Creates lists of specific topics and background topics

    Parameters
    ----------
    specific_topics : list or int
    background_topics : list or int

    Returns
    -------
    (list, list)

    """
    # TODO: what if specific_topics = 4
    #  and background_topics = ["topic_0"] ?
    if isinstance(specific_topics, list):
        specific_topic_names = list(specific_topics)
    else:
        specific_topics = int(specific_topics)
        specific_topic_names = [
            f'topic_{i}'
            for i in range(specific_topics)
        ]

    n_specific_topics = len(specific_topic_names)

    if isinstance(background_topics, list):
        background_topic_names = list(background_topics)
    else:
        background_topics = int(background_topics)
        background_topic_names = [
            f'background_{n_specific_topics + i}'
            for i in range(background_topics)
        ]

    if set(specific_topic_names) & set(background_topic_names):
        raise ValueError(
            "Specific topic names and background topic names should be distinct from each other!"
        )

    return specific_topic_names, background_topic_names


def init_simple_default_model(
        dataset: Dataset,
        modalities_to_use: Union[List[str], Dict[str, float]],
        main_modality: str,
        specific_topics: Union[List[str], int],
        background_topics: Union[List[str], int],
) -> artm.ARTM:
    """
    Creates a simple `artm.ARTM` model with standard scores.

    Parameters
    ----------
    dataset
        Dataset for model initialization
    modalities_to_use
        What modalities a model should know.
        If `modalities_to_use` is a dictionary,
        all given weights are assumed to be relative to `main_modality`:
        weights will then be recalculated to absolute ones
        using `dataset` and `main_modality`.
        If `modalities_to_use` is a list,
        then all relative weights are set equal to one.
        The resulting model's `class_ids` field will contain absolute modality weights.
    main_modality
        Modality relative to which all modality weights are considered
    specific_topics
        Specific topic names or their number
    background_topics
        Background topic names or their number

    Returns
    -------
    model : artm.ARTM

    """
    if isinstance(modalities_to_use, dict):
        modalities_weights = modalities_to_use
    else:
        modalities_weights = {class_id: 1 for class_id in modalities_to_use}

    specific_topic_names, background_topic_names = create_default_topics(
        specific_topics, background_topics
    )

    dictionary = dataset.get_dictionary()

    # convert relative modality weights to absolute ones,
    # using the vocabulary sizes of the modalities
    tokens_data = count_vocab_size(dictionary, modalities_to_use)
    abs_weights = modality_weight_rel2abs(
        tokens_data,
        modalities_weights,
        main_modality
    )

    model = init_model(
        topic_names=specific_topic_names + background_topic_names,
        class_ids=abs_weights,
    )

    if len(background_topic_names) > 0:
        # smoothing regularizers for background topics (tau is left at 0.0 and can be tuned later)
        model.regularizers.add(
            artm.SmoothSparsePhiRegularizer(
                name='smooth_phi_bcg',
                topic_names=background_topic_names,
                tau=0.0,
                class_ids=[main_modality],
            ),
        )
        model.regularizers.add(
            artm.SmoothSparseThetaRegularizer(
                name='smooth_theta_bcg',
                topic_names=background_topic_names,
                tau=0.0,
            ),
        )

    model.initialize(dictionary)
    add_standard_scores(model, main_modality=main_modality,
                        all_modalities=modalities_to_use)

    return model

Functions
def add_standard_scores(model: artm.artm_model.ARTM, dictionary: artm.dictionary.Dictionary = None, main_modality: str = '@lemmatized', all_modalities: List[str] = ('@lemmatized', '@ngramms')) -> NoneType

Adds standard scores for the model.

Parameters
----------
model
dictionary
    Obsolete parameter, not used
main_modality
all_modalities
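
A minimal usage sketch (not part of the module itself); the modality names, topic count, and model settings below are illustrative placeholders:

import artm

from topicnet.cooking_machine.model_constructor import add_standard_scores

# toy two-modality model; modality names and topic count are placeholders
model = artm.ARTM(
    num_topics=10,
    class_ids={'@lemmatized': 1.0, '@ngramms': 1.0},
    theta_columns_naming='title',
)

add_standard_scores(
    model,
    main_modality='@lemmatized',
    all_modalities=['@lemmatized', '@ngramms'],
)
# the model now tracks PerplexityScore@all, SparsityThetaScore,
# and per-modality SparsityPhiScore, PerplexityScore and TopicKernel scores
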
def create_default_topics(specific_topics, background_topics)

Creates lists of specific topics and background topics

Parameters
----------
specific_topics : list or int
background_topics : list or int

Returns
-------
(list, list)
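
For illustration, the naming scheme works out as follows (argument values are chosen only for the example):

from topicnet.cooking_machine.model_constructor import create_default_topics

# integer arguments produce auto-generated names:
specific, background = create_default_topics(3, 1)
# specific   == ['topic_0', 'topic_1', 'topic_2']
# background == ['background_3']

# lists are taken as the names themselves (the two lists must not overlap):
specific, background = create_default_topics(['sport', 'politics'], ['noise'])
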
def init_model(topic_names, seed=None, class_ids=None)

Creates a basic artm model
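
A small usage sketch (the topic names and modality weight below are hypothetical):

from topicnet.cooking_machine.model_constructor import init_model

model = init_model(
    topic_names=['topic_0', 'topic_1', 'background_2'],
    seed=42,
    class_ids={'@lemmatized': 1.0},
)
# the returned artm.ARTM is not initialized yet:
# call model.initialize(dictionary) with an artm.Dictionary before fitting
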
def init_simple_default_model(dataset: Dataset, modalities_to_use: Union[List[str], Dict[str, float]], main_modality: str, specific_topics: Union[List[str], int], background_topics: Union[List[str], int]) -> artm.artm_model.ARTM

Creates a simple `artm.ARTM` model with standard scores.

Parameters
----------
dataset
    Dataset for model initialization
modalities_to_use
    What modalities a model should know.
    If `modalities_to_use` is a dictionary,
    all given weights are assumed to be relative to `main_modality`:
    weights will then be recalculated to absolute ones
    using `dataset` and `main_modality`.
    If `modalities_to_use` is a list,
    then all relative weights are set equal to one.
    The resulting model's `class_ids` field will contain absolute modality weights.
main_modality
    Modality relative to which all modality weights are considered
specific_topics
    Specific topic names or their number
background_topics
    Background topic names or their number

Returns
-------
model : artm.ARTM
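
An end-to-end sketch under assumptions: the file name, modality names, and weights are placeholders, and the final fitting line relies on `Dataset.get_batch_vectorizer()` and `artm.ARTM.fit_offline()`, which are outside this module.

from topicnet.cooking_machine.dataset import Dataset
from topicnet.cooking_machine.model_constructor import init_simple_default_model

dataset = Dataset('my_data.csv')  # placeholder path to your dataset table

model = init_simple_default_model(
    dataset=dataset,
    modalities_to_use={'@lemmatized': 1.0, '@ngramms': 0.5},  # weights relative to main_modality
    main_modality='@lemmatized',
    specific_topics=20,
    background_topics=1,
)

# model.class_ids holds the recalculated absolute modality weights;
# fitting uses the usual BigARTM calls (assumed here, not part of this module):
model.fit_offline(dataset.get_batch_vectorizer(), num_collection_passes=10)
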