Module topicnet.cooking_machine.recipes.multimodal_exploratory_search_pipeline
Expand source code
from typing import List, Union, Dict
from .recipe_wrapper import BaseRecipe
from .. import Dataset
# Top-level recipe template: config text with str.format placeholders.
# It is handed to BaseRecipe as `recipe_template` in
# MultimodalSearchRecipe.__init__ and filled in
# MultimodalSearchRecipe._make_multimodal_recipe with the keyword arguments
# specific_topics, background_topics, syntesized_regularizers, dataset_path,
# modalities_description, modality and syntesized_stages.
# The "syntesized_*" spelling is part of the placeholder names and must stay
# in sync with the keyword names used in that .format() call.
# NOTE(review): the usage hint inside the template mentions keyword names
# (modality, specific_topics, ...) that differ from the public
# format_recipe() signature visible in this file - confirm which is intended.
multimodal_search_template = '''
# This config modifies a strategy described in the article
# Multi-objective Topic Modeling for Exploratory Search in Tech News
# by Anastasya Yanina, Lev Golitsyn and Konstantin Vorontsov, Jan 2018
# Use .format_recipe(modality_list=modality_list, modality=modality,
# dataset_path=dataset_path, specific_topics=specific_topics,
# background_topics=background_topics, num_iter=num_iter)
# when loading the recipe to adjust for your dataset
topics:
# Describes number of model topics, in the actuall article 200 topics were found to be optimal
specific_topics: {specific_topics}
background_topics: {background_topics}
regularizers:
{syntesized_regularizers}
- SmoothSparseThetaRegularizer:
name: sparse_theta
topic_names: specific_topics
tau: 1
model:
dataset_path: {dataset_path}
{modalities_description}
main_modality: '{modality}'
stages:
{syntesized_stages}
'''
# Per-modality regularizer entry for a DecorrelatorPhiRegularizer.
# MultimodalSearchRecipe._form_regularizers formats it once per modality,
# substituting {modality} into both the regularizer name
# (decorrelation_phi_<modality>) and the class_ids list.
decorrelator_reg_template = '''
- DecorrelatorPhiRegularizer:
name: decorrelation_phi_{modality}
topic_names: specific_topics
tau: 1
class_ids: ['{modality}']
'''
# Per-modality regularizer entry for a SmoothSparsePhiRegularizer.
# MultimodalSearchRecipe._form_regularizers formats it once per modality,
# substituting {modality} into both the regularizer name
# (smooth_phi_<modality>) and the class_ids list.
sparse_phi_reg_template = '''
- SmoothSparsePhiRegularizer:
name: smooth_phi_{modality}
topic_names: specific_topics
tau: 1
class_ids: ['{modality}']
'''
# Stage (cube) entry that modifies the `sparse_theta` regularizer.
# The literal is formatted twice:
#   1. here, at import time, the positional field {0} receives the selection
#      criterion string built below, while the doubled braces {{num_iter}}
#      collapse to a single-brace {num_iter} placeholder;
#   2. later, MultimodalSearchRecipe._form_and_order_cubes fills {num_iter}.
sparse_theta_cube_template = '''
- RegularizersModifierCube:
num_iter: {{num_iter}}
reg_search: add
regularizer_parameters:
name: sparse_theta
selection:
- {0}
strategy: PerplexityStrategy
strategy_params:
start_point: -0.3
step: 0.01
max_len: 20
tracked_score_function: PerplexityScore@all
verbose: false
use_relative_coefficients: True
'''.format('PerplexityScore@all < 1.01 * MINIMUM(PerplexityScore@all)' +
' and SparsityThetaScore -> max')
# Had to change tracked score function. Is it fine?
# NOTE(review): the question above is still open in the source; it is not
# clear from this file which template it refers to - confirm with the author.
#
# Stage (cube) entry that modifies the per-modality decorrelation regularizer
# (decorrelation_phi_<modality>).
# The literal is formatted twice:
#   1. here, at import time, {0} receives the selection criterion string built
#      below; that string itself contains single-brace {modality} fields,
#      which str.format inserts verbatim, and the doubled braces
#      {{num_iter}} / {{modality}} collapse to single-brace placeholders;
#   2. later, MultimodalSearchRecipe._form_and_order_cubes fills every
#      {modality} and {num_iter}, so the modality name is appended directly
#      to the score names (PerplexityScore<modality>, SparsityPhiScore<modality>).
decor_phi_cube_template = '''
- RegularizersModifierCube:
num_iter: {{num_iter}}
reg_search: add
regularizer_parameters:
name: decorrelation_phi_{{modality}}
selection:
- {0}
strategy: PerplexityStrategy
strategy_params:
start_point: 0.005
step: 0.005
max_len: 10
tracked_score_function: PerplexityScore{{modality}}
verbose: false
use_relative_coefficients: True
'''.format('PerplexityScore{modality} < ' +
'1.01 * MINIMUM(PerplexityScore{modality})' +
' and SparsityPhiScore{modality} -> max')
# Stage (cube) entry that modifies the per-modality smoothing/sparsing
# regularizer (smooth_phi_<modality>).
# The literal is formatted twice:
#   1. here, at import time, {0} receives the selection criterion string built
#      below (which keeps its own single-brace {modality} fields), and the
#      doubled braces {{num_iter}} / {{modality}} collapse to single-brace
#      placeholders;
#   2. later, MultimodalSearchRecipe._form_and_order_cubes fills every
#      {modality} and {num_iter}.
smooth_phi_cube_template = '''
- RegularizersModifierCube:
num_iter: {{num_iter}}
reg_search: add
regularizer_parameters:
name: smooth_phi_{{modality}}
selection:
- {0}
strategy: PerplexityStrategy
strategy_params:
start_point: 0.0
step: 0.02
max_len: 20
tracked_score_function: PerplexityScore{{modality}}
verbose: false
use_relative_coefficients: True
'''.format('PerplexityScore{modality} < ' +
'1.01 * MINIMUM(PerplexityScore{modality})' +
' and SparsityPhiScore{modality} -> max')
class MultimodalSearchRecipe(BaseRecipe):
    """
    Class for multimodal search recipe creation and
    unification of recipe usage interface

    The recipe text is produced by filling `multimodal_search_template`
    with regularizer and cube sections generated for every modality from
    the module-level templates.
    """
    def __init__(self, order='extended_modalities'):
        """
        Parameters
        ----------
        order : str
            can be 'extended_modalities' or 'repeated_default'
            where 'repeated_default' repeats the original recipe
            for each dataset modality
            while 'extended_modalities' extends only modality-reliant
            blocks of training keeping last part equivalent to the original pipeline

        """
        super().__init__(recipe_template=multimodal_search_template)
        # The value is validated lazily, when the cubes are formed.
        self._order = order

    def format_recipe(
            self,
            dataset_path: str,
            modality_list: Union[List[str], Dict, None] = None,
            main_modality: str = None,
            topic_number: int = 20,
            background_topic_number: int = 1,
            num_iter: Union[int, List[int]] = 20,
    ):
        """
        Creates a recipe for multimodal search
        using basic template at the top of this file

        Parameters
        ----------
        dataset_path : str
            path to the data
        modality_list : list of str or dict, optional
            list of modality names to use
            or a dict specifying the (relative) weight of each;
            if omitted, the modalities are taken from
            `Dataset(dataset_path).get_possible_modalities()`
        main_modality : str, optional
            chosen to be main modality from modality list, if possible
            if it is not specified, the function attempts to use
            the first entry of `modality_list` instead
        topic_number : int
            number of the model topics
        background_topic_number : int
            number of background topics
        num_iter : int or list of int
            specifying number of iterations for each cube

        Returns
        -------
        str
            string specifying recipe for multimodal search

        Raises
        ------
        TypeError
            if `main_modality` is omitted while `modality_list` is not a list
            (there is no "first entry" to fall back to), or if
            `modality_list` is neither a list nor a dict

        """
        if modality_list is None:
            modality_list = list(Dataset(dataset_path).get_possible_modalities())

        specific_topics = [f'topic_{i}' for i in range(topic_number)]
        # Background topic indices continue right after the specific ones.
        background_topics = [f'bcg_{i}' for i in range(
            len(specific_topics), len(specific_topics) + background_topic_number)]

        if main_modality is None:
            if isinstance(modality_list, list):
                main_modality = modality_list[0]
            else:
                raise TypeError("main_modality should be specified")

        self._make_multimodal_recipe(
            modality=main_modality,
            dataset_path=dataset_path,
            specific_topics=specific_topics,
            background_topics=background_topics,
            modality_list=modality_list,
            num_iter=num_iter,
        )
        return self._recipe

    def _form_regularizers(self, modality_list: Union[List[str], Dict]):
        """
        Creates regularizer configs for each
        modality following templates defined above

        Parameters
        ----------
        modality_list : list of str or dict
            modality names (for a dict, its keys are iterated)

        Returns
        -------
        str
            string with configs for all needed regularizers

        """
        regularizer_templates = []
        for modality in modality_list:
            regularizer_templates.append(decorrelator_reg_template.format(modality=modality))
            regularizer_templates.append(sparse_phi_reg_template.format(modality=modality))
        return ''.join(regularizer_templates)

    def _form_and_order_cubes(
            self,
            modality_list: Union[List[str], Dict],
            num_iter: Union[int, List[int]] = 20,
    ):
        """
        Creates cube configs for each modality
        following cube templates defined above

        Parameters
        ----------
        modality_list : list of str or dict
            modality names (for a dict, its keys are iterated)
        num_iter : int or list of int
            specifying number of iterations for each cube;
            an int is repeated for every modality plus the final cube.
            A list is paired with the modalities via `zip`, so modalities
            beyond the list length get no cubes; with the
            'extended_modalities' order the last list element is used for
            the closing sparse-theta cube.

        Returns
        -------
        str
            string ordering cube templates for recipe

        Raises
        ------
        ValueError
            if the `order` given to the constructor is not supported

        """
        # Reject an unsupported order up front, so the error does not depend
        # on the modality collection being non-empty.
        if self._order not in ('extended_modalities', 'repeated_default'):
            raise ValueError('That option is not availiable')

        if isinstance(num_iter, int):
            num_iter = [num_iter] * (len(modality_list) + 1)

        cube_templates = []
        for modality, iterations in zip(modality_list, num_iter):
            cube_templates.append(decor_phi_cube_template.format(
                modality=modality, num_iter=iterations))
            cube_templates.append(smooth_phi_cube_template.format(
                modality=modality, num_iter=iterations))
            if self._order == 'repeated_default':
                # The full original pipeline is repeated for every modality.
                cube_templates.append(
                    sparse_theta_cube_template.format(num_iter=iterations))

        if self._order == 'extended_modalities':
            # A single sparse-theta cube closes the pipeline.
            cube_templates.append(
                sparse_theta_cube_template.format(num_iter=num_iter[-1]))
        return ''.join(cube_templates)

    def _make_multimodal_recipe(
            self,
            dataset_path: str,
            modality: str,
            specific_topics: List[str],
            background_topics: List[str],
            modality_list: Union[List[str], Dict, None] = None,
            background_topic_number: int = 1,
            num_iter: Union[int, List[int]] = 20,
    ):
        """
        Fills the recipe template and stores the result in `self._recipe`

        Parameters
        ----------
        dataset_path : str
            path to the data
        modality : str
            main modality name
        specific_topics : list of str
            names of the specific topics
        background_topics : list of str
            names of the background topics
        modality_list : list of str or dict
            modality names to use, or a dict mapping modality name to weight
        background_topic_number : int
            not used by this method; kept for interface compatibility
        num_iter : int or list of int
            specifying number of iterations for each cube

        Raises
        ------
        TypeError
            if `modality_list` is neither a list nor a dict

        """
        # Validate the collection type before generating any text, so that a
        # wrong argument is reported with a meaningful message.
        if isinstance(modality_list, list):
            modalities_description = f"modalities_to_use: {modality_list}"
        elif isinstance(modality_list, dict):
            header_string = "modalities_weights:"
            data_strings = [f"'{k}': {v}" for k, v in modality_list.items()]
            # The weight entries must be nested below the header line.
            # NOTE(review): the original comment asks for 8 spaces after the
            # newline while the literal carries a single one - confirm
            # against the indentation used in the recipe template.
            modalities_description = "\n ".join([header_string] + data_strings)
        else:
            raise TypeError(
                "modality_list should be either list or dict,"
                f" not {type(modality_list)}"
            )

        reg_forms = self._form_regularizers(modality_list)
        cube_forms = self._form_and_order_cubes(
            modality_list,
            num_iter=num_iter,
        )

        self._recipe = self.recipe_template.format(
            modality=modality,
            dataset_path=dataset_path,
            specific_topics=specific_topics,
            background_topics=background_topics,
            modalities_description=modalities_description,
            syntesized_regularizers=reg_forms,
            syntesized_stages=cube_forms)
Classes
class MultimodalSearchRecipe (order='extended_modalities')
-
Class for multimodal search recipe creation and unification of recipe usage interface
Parameters
order
:str
- can be 'extended_modalities' or 'repeated_default' where 'repeated_default' repeats the original recipe for each dataset modality while 'extended_modalities' extends only modality-reliant blocks of training keeping last part equivalent to the original pipeline
Expand source code
class MultimodalSearchRecipe(BaseRecipe):
    """
    Class for multimodal search recipe creation and
    unification of recipe usage interface

    The recipe text is produced by filling `multimodal_search_template`
    with regularizer and cube sections generated for every modality from
    the module-level templates.
    """
    def __init__(self, order='extended_modalities'):
        """
        Parameters
        ----------
        order : str
            can be 'extended_modalities' or 'repeated_default'
            where 'repeated_default' repeats the original recipe
            for each dataset modality
            while 'extended_modalities' extends only modality-reliant
            blocks of training keeping last part equivalent to the original pipeline

        """
        super().__init__(recipe_template=multimodal_search_template)
        # The value is validated lazily, when the cubes are formed.
        self._order = order

    def format_recipe(
            self,
            dataset_path: str,
            modality_list: Union[List[str], Dict, None] = None,
            main_modality: str = None,
            topic_number: int = 20,
            background_topic_number: int = 1,
            num_iter: Union[int, List[int]] = 20,
    ):
        """
        Creates a recipe for multimodal search
        using basic template at the top of this file

        Parameters
        ----------
        dataset_path : str
            path to the data
        modality_list : list of str or dict, optional
            list of modality names to use
            or a dict specifying the (relative) weight of each;
            if omitted, the modalities are taken from
            `Dataset(dataset_path).get_possible_modalities()`
        main_modality : str, optional
            chosen to be main modality from modality list, if possible
            if it is not specified, the function attempts to use
            the first entry of `modality_list` instead
        topic_number : int
            number of the model topics
        background_topic_number : int
            number of background topics
        num_iter : int or list of int
            specifying number of iterations for each cube

        Returns
        -------
        str
            string specifying recipe for multimodal search

        Raises
        ------
        TypeError
            if `main_modality` is omitted while `modality_list` is not a list
            (there is no "first entry" to fall back to), or if
            `modality_list` is neither a list nor a dict

        """
        if modality_list is None:
            modality_list = list(Dataset(dataset_path).get_possible_modalities())

        specific_topics = [f'topic_{i}' for i in range(topic_number)]
        # Background topic indices continue right after the specific ones.
        background_topics = [f'bcg_{i}' for i in range(
            len(specific_topics), len(specific_topics) + background_topic_number)]

        if main_modality is None:
            if isinstance(modality_list, list):
                main_modality = modality_list[0]
            else:
                raise TypeError("main_modality should be specified")

        self._make_multimodal_recipe(
            modality=main_modality,
            dataset_path=dataset_path,
            specific_topics=specific_topics,
            background_topics=background_topics,
            modality_list=modality_list,
            num_iter=num_iter,
        )
        return self._recipe

    def _form_regularizers(self, modality_list: Union[List[str], Dict]):
        """
        Creates regularizer configs for each
        modality following templates defined above

        Parameters
        ----------
        modality_list : list of str or dict
            modality names (for a dict, its keys are iterated)

        Returns
        -------
        str
            string with configs for all needed regularizers

        """
        regularizer_templates = []
        for modality in modality_list:
            regularizer_templates.append(decorrelator_reg_template.format(modality=modality))
            regularizer_templates.append(sparse_phi_reg_template.format(modality=modality))
        return ''.join(regularizer_templates)

    def _form_and_order_cubes(
            self,
            modality_list: Union[List[str], Dict],
            num_iter: Union[int, List[int]] = 20,
    ):
        """
        Creates cube configs for each modality
        following cube templates defined above

        Parameters
        ----------
        modality_list : list of str or dict
            modality names (for a dict, its keys are iterated)
        num_iter : int or list of int
            specifying number of iterations for each cube;
            an int is repeated for every modality plus the final cube.
            A list is paired with the modalities via `zip`, so modalities
            beyond the list length get no cubes; with the
            'extended_modalities' order the last list element is used for
            the closing sparse-theta cube.

        Returns
        -------
        str
            string ordering cube templates for recipe

        Raises
        ------
        ValueError
            if the `order` given to the constructor is not supported

        """
        # Reject an unsupported order up front, so the error does not depend
        # on the modality collection being non-empty.
        if self._order not in ('extended_modalities', 'repeated_default'):
            raise ValueError('That option is not availiable')

        if isinstance(num_iter, int):
            num_iter = [num_iter] * (len(modality_list) + 1)

        cube_templates = []
        for modality, iterations in zip(modality_list, num_iter):
            cube_templates.append(decor_phi_cube_template.format(
                modality=modality, num_iter=iterations))
            cube_templates.append(smooth_phi_cube_template.format(
                modality=modality, num_iter=iterations))
            if self._order == 'repeated_default':
                # The full original pipeline is repeated for every modality.
                cube_templates.append(
                    sparse_theta_cube_template.format(num_iter=iterations))

        if self._order == 'extended_modalities':
            # A single sparse-theta cube closes the pipeline.
            cube_templates.append(
                sparse_theta_cube_template.format(num_iter=num_iter[-1]))
        return ''.join(cube_templates)

    def _make_multimodal_recipe(
            self,
            dataset_path: str,
            modality: str,
            specific_topics: List[str],
            background_topics: List[str],
            modality_list: Union[List[str], Dict, None] = None,
            background_topic_number: int = 1,
            num_iter: Union[int, List[int]] = 20,
    ):
        """
        Fills the recipe template and stores the result in `self._recipe`

        Parameters
        ----------
        dataset_path : str
            path to the data
        modality : str
            main modality name
        specific_topics : list of str
            names of the specific topics
        background_topics : list of str
            names of the background topics
        modality_list : list of str or dict
            modality names to use, or a dict mapping modality name to weight
        background_topic_number : int
            not used by this method; kept for interface compatibility
        num_iter : int or list of int
            specifying number of iterations for each cube

        Raises
        ------
        TypeError
            if `modality_list` is neither a list nor a dict

        """
        # Validate the collection type before generating any text, so that a
        # wrong argument is reported with a meaningful message.
        if isinstance(modality_list, list):
            modalities_description = f"modalities_to_use: {modality_list}"
        elif isinstance(modality_list, dict):
            header_string = "modalities_weights:"
            data_strings = [f"'{k}': {v}" for k, v in modality_list.items()]
            # The weight entries must be nested below the header line.
            # NOTE(review): the original comment asks for 8 spaces after the
            # newline while the literal carries a single one - confirm
            # against the indentation used in the recipe template.
            modalities_description = "\n ".join([header_string] + data_strings)
        else:
            raise TypeError(
                "modality_list should be either list or dict,"
                f" not {type(modality_list)}"
            )

        reg_forms = self._form_regularizers(modality_list)
        cube_forms = self._form_and_order_cubes(
            modality_list,
            num_iter=num_iter,
        )

        self._recipe = self.recipe_template.format(
            modality=modality,
            dataset_path=dataset_path,
            specific_topics=specific_topics,
            background_topics=background_topics,
            modalities_description=modalities_description,
            syntesized_regularizers=reg_forms,
            syntesized_stages=cube_forms)
Ancestors
Methods
def format_recipe(self, dataset_path: str, modality_list: List[str] = None, main_modality: str = None, topic_number: int = 20, background_topic_number: int = 1, num_iter: Union[int, List[int]] = 20)
-
Creates a recipe for multimodal search using basic template at the top of this file
Parameters
dataset_path
:path to the data
main_modality
:str
- chosen to be the main modality from the modality list, if possible;
if it is not specified, the function attempts to use
the first entry of
modality_list
instead
modality_list
:list or dict
- list of modality names to use,
or a dict specifying the (relative) weight of each modality
topic_number
:int
- number of the model topics
background_topic_number
:int
- number of background topics
num_iter
:int or list of int
- number of iterations for each cube
Returns
string specifying recipe for multimodal search
Expand source code
def format_recipe(
        self,
        dataset_path: str,
        modality_list: Union[List[str], Dict, None] = None,
        main_modality: str = None,
        topic_number: int = 20,
        background_topic_number: int = 1,
        num_iter: Union[int, List[int]] = 20,
):
    """
    Creates a recipe for multimodal search
    using basic template at the top of this file

    Parameters
    ----------
    dataset_path : str
        path to the data
    modality_list : list of str or dict, optional
        list of modality names to use
        or a dict specifying the (relative) weight of each;
        if omitted, the modalities are taken from
        `Dataset(dataset_path).get_possible_modalities()`
    main_modality : str, optional
        chosen to be main modality from modality list, if possible
        if it is not specified, the function attempts to use
        the first entry of `modality_list` instead
    topic_number : int
        number of the model topics
    background_topic_number : int
        number of background topics
    num_iter : int or list of int
        specifying number of iterations for each cube

    Returns
    -------
    str
        string specifying recipe for multimodal search

    Raises
    ------
    TypeError
        if `main_modality` is omitted while `modality_list` is not a list
        (there is no "first entry" to fall back to)

    """
    if modality_list is None:
        modality_list = list(Dataset(dataset_path).get_possible_modalities())

    specific_topics = [f'topic_{i}' for i in range(topic_number)]
    # Background topic indices continue right after the specific ones.
    background_topics = [f'bcg_{i}' for i in range(
        len(specific_topics), len(specific_topics) + background_topic_number)]

    if main_modality is None:
        if isinstance(modality_list, list):
            main_modality = modality_list[0]
        else:
            raise TypeError("main_modality should be specified")

    self._make_multimodal_recipe(
        modality=main_modality,
        dataset_path=dataset_path,
        specific_topics=specific_topics,
        background_topics=background_topics,
        modality_list=modality_list,
        num_iter=num_iter,
    )
    return self._recipe
Inherited members