Module topicnet.cooking_machine.recipes.intratext_coherence_pipeline
Source code
import os
import warnings
from typing import List
from .recipe_wrapper import BaseRecipe
from .. import Dataset
ONE_CONFIG_INDENT = 4 * ' '
class IntratextCoherenceRecipe(BaseRecipe):
"""
The recipe mainly consists of basic cube stages,
such as Decorrelation, Sparsing and Smoothing.
In this way it is similar to ARTM baseline recipe.
The core difference is that models selected based on their IntratextCoherenceScore
(which is one of the scores included in TopicNet).
PerplexityScore is also calculated to assure that models don't have high perplexity,
but the main criteria is IntratextCoherenceScore.
For more details about IntratextCoherence
one may see the paper http://www.dialog-21.ru/media/4281/alekseevva.pdf
"""
def __init__(self):
recipe_template_path = os.path.join(
os.path.dirname(os.path.abspath(__file__)),
'intratext_coherence_maximization.yml'
)
recipe_template = open(recipe_template_path, 'r').read()
super().__init__(recipe_template=recipe_template)
def format_recipe(
self,
dataset_path: str,
num_specific_topics: int,
main_modality: str = None,
dictionary_filter_parameters: dict = None,
num_background_topics: int = 1,
modalities: List[str] = None,
keep_dataset_in_memory: bool = True,
keep_dataset: bool = False,
documents_fraction: float = 0.5,
one_stage_num_iter: int = 20,
verbose: bool = True) -> str:
"""
Parameters
----------
dataset_path
Path to the dataset .csv file
num_specific_topics
Number of specific topics in models to be trained
main_modality
Main modality in the dataset
(usually plain text rather than, for example, @author or @title).
If not specified, the first modality in `modalities` will be used
dictionary_filter_parameters
Parameters for filtering the dataset's dictionary:
they are formatted as YAML and inserted into the recipe template
num_background_topics
Number of background topics in models
modalities
Which modalities to use from those available in the dataset.
If not specified, all of the dataset's modalities will be used.
If specified, the list should be non-empty
keep_dataset_in_memory
Whether to keep the dataset in memory when running the experiment.
True is faster, so if the dataset is not very big, it is better to use True
keep_dataset
If True, the dataset will be loaded into memory only when computing coherence,
so memory will be free of the dataset during model training.
This may help if the dataset is fairly big,
but `keep_dataset_in_memory=True` still works without crashing.
documents_fraction
Determines the fraction of the dataset's documents that will be used for computing coherence.
It is better to keep this value below 1.0.
For example, suppose we want to use not the whole dataset,
but just a fragment of about 25,000 words.
Then we can do the following
>>> import numpy as np
>>> dataset = Dataset(dataset_path)
>>> document_lengths = dataset._data['vw_text'].apply(lambda text: len(text.split()))
>>> median_document_length = np.median(document_lengths)
>>> num_documents = dataset._data.shape[0]
>>> dataset_fragment_length = 25000
>>> num_documents_for_computing = dataset_fragment_length / median_document_length
>>> documents_fraction = num_documents_for_computing / num_documents
one_stage_num_iter
There will be five stages, each with a grid search over roughly five values.
One such search lasts `one_stage_num_iter` iterations,
with coherence computed at the end.
So there will be about `one_stage_num_iter` * 5 * 5 training iterations (not slow)
and 5 * 5 coherence computations (which may be slow if `documents_fraction` is high);
with the default `one_stage_num_iter=20`, that is 500 training iterations and 25 coherence computations
verbose
Whether to show experiment progress or not
"""
all_modalities = list(Dataset(dataset_path).get_possible_modalities())
if len(all_modalities) == 0:
warnings.warn(f'No modalities in the dataset "{dataset_path}"!')
if modalities is None:
modalities = all_modalities
if any([m not in all_modalities for m in modalities]):
warnings.warn(f'Not all `modalities` are found in the dataset "{dataset_path}"!')
if main_modality is None:
main_modality = modalities[0]
warnings.warn(
f'Main modality not specified!'
f' So modality "{main_modality}" will be used as the main one'
)
specific_topics = [
f'topic_{i}' for i in range(num_specific_topics)
]
background_topics = [
f'bcg_topic_{i}'
for i in range(num_specific_topics, num_specific_topics + num_background_topics)
]
if dictionary_filter_parameters is None:
dictionary_filter_parameters = dict()
dictionary_filter_parameters_as_yml = self._format_dictionary_filter_parameters(
dictionary_filter_parameters,
indent=2 * ONE_CONFIG_INDENT,
)
self._recipe = self.recipe_template.format(
modality_names=modalities,
main_modality=main_modality,
dataset_path=dataset_path,
dictionary_filter_parameters=dictionary_filter_parameters_as_yml,
keep_dataset_in_memory=keep_dataset_in_memory,
keep_dataset=keep_dataset,
documents_fraction=documents_fraction,
specific_topics=specific_topics,
background_topics=background_topics,
one_stage_num_iter=one_stage_num_iter,
verbose=verbose,
)
return self._recipe
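The recipe string is produced by plain `str.format` substitution of the arguments above into the bundled YAML template `intratext_coherence_maximization.yml`. A toy sketch of that mechanism, with made-up placeholder values (the real template contains many more fields):
# Toy illustration of how `format_recipe` fills the recipe template;
# the values here are made up, and the real template lives in
# 'intratext_coherence_maximization.yml' next to this module.
toy_template = (
    'dataset_path: {dataset_path}\n'
    "main_modality: '{main_modality}'\n"
    'specific_topics: {specific_topics}\n'
)
filled_recipe = toy_template.format(
    dataset_path='/data/my_corpus.csv',                 # hypothetical path
    main_modality='@lemmatized',                        # hypothetical modality name
    specific_topics=[f'topic_{i}' for i in range(3)],   # rendered as ['topic_0', 'topic_1', 'topic_2']
)
print(filled_recipe)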
Classes
class IntratextCoherenceRecipe
The recipe mainly consists of basic cube stages, such as Decorrelation, Sparsing and Smoothing. In this way it is similar to the ARTM baseline recipe. The core difference is that models are selected based on their IntratextCoherenceScore (which is one of the scores included in TopicNet). PerplexityScore is also calculated to ensure that models don't have high perplexity, but the main criterion is IntratextCoherenceScore.
For more details about IntratextCoherence, see the paper http://www.dialog-21.ru/media/4281/alekseevva.pdf
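A minimal usage sketch, assuming a dataset already prepared in TopicNet's format (the file path below is hypothetical); `format_recipe` returns the filled-in YAML recipe as a string, which can then be handed to TopicNet's experiment-building machinery:
from topicnet.cooking_machine.recipes.intratext_coherence_pipeline import IntratextCoherenceRecipe

recipe = IntratextCoherenceRecipe()

# Fill in the recipe template for a hypothetical dataset with 20 specific topics
config = recipe.format_recipe(
    dataset_path='/data/my_corpus.csv',  # hypothetical path to a TopicNet dataset .csv
    num_specific_topics=20,
)

print(config)  # the recipe is just a YAML string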
Ancestors
- topicnet.cooking_machine.recipes.recipe_wrapper.BaseRecipe
Methods
def format_recipe(self, dataset_path: str, num_specific_topics: int, main_modality: str = None, dictionary_filter_parameters: dict = None, num_background_topics: int = 1, modalities: List[str] = None, keep_dataset_in_memory: bool = True, keep_dataset: bool = False, documents_fraction: float = 0.5, one_stage_num_iter: int = 20, verbose: bool = True) -> str
Parameters
dataset_path
- Path to the dataset .csv file
num_specific_topics
- Number of specific topics in the models to be trained
main_modality
- Main modality in the dataset (usually plain text rather than, for example, @author or @title). If not specified, the first modality in `modalities` will be used
dictionary_filter_parameters
- Parameters for filtering the dataset's dictionary: they are formatted as YAML and inserted into the recipe template
num_background_topics
- Number of background topics in the models
modalities
- Which modalities to use from those available in the dataset. If not specified, all of the dataset's modalities will be used. If specified, the list should be non-empty
keep_dataset_in_memory
- Whether to keep the dataset in memory when running the experiment. True is faster, so if the dataset is not very big, it is better to use True
keep_dataset
- If True, the dataset will be loaded into memory only when computing coherence, so memory will be free of the dataset during model training. This may help if the dataset is fairly big, but `keep_dataset_in_memory=True` still works without crashing
documents_fraction
- Determines the fraction of the dataset's documents that will be used for computing coherence. It is better to keep this value below 1.0. For example, to use not the whole dataset but only a fragment of about 25,000 words:
>>> import numpy as np
>>> dataset = Dataset(dataset_path)
>>> document_lengths = dataset._data['vw_text'].apply(lambda text: len(text.split()))
>>> median_document_length = np.median(document_lengths)
>>> num_documents = dataset._data.shape[0]
>>> dataset_fragment_length = 25000
>>> num_documents_for_computing = dataset_fragment_length / median_document_length
>>> documents_fraction = num_documents_for_computing / num_documents
one_stage_num_iter
- There will be five stages, each with a grid search over roughly five values. One such search lasts `one_stage_num_iter` iterations, with coherence computed at the end. So there will be about `one_stage_num_iter` * 5 * 5 training iterations (not slow) and 5 * 5 coherence computations (which may be slow if `documents_fraction` is high); with the default `one_stage_num_iter=20`, that is 500 training iterations and 25 coherence computations
verbose
- Whether to show experiment progress
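For example, a call that restricts the modalities, keeps the dataset out of memory during training and computes coherence on a tenth of the documents might look as follows (a sketch only: the path and modality names are hypothetical):
config = IntratextCoherenceRecipe().format_recipe(
    dataset_path='/data/my_corpus.csv',     # hypothetical dataset file
    num_specific_topics=20,
    num_background_topics=1,
    main_modality='@lemmatized',            # hypothetical modality name
    modalities=['@lemmatized', '@bigram'],  # hypothetical modality names
    keep_dataset_in_memory=False,           # big dataset: trade speed for memory
    documents_fraction=0.1,                 # compute coherence on ~10% of the documents
    one_stage_num_iter=20,
)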
Inherited members