Module topicnet.cooking_machine.models.blei_lafferty_score
Expand source code
import numpy as np
from typing import Callable
from .base_score import BaseScore
class BleiLaffertyScore(BaseScore):
    """
    Blei-Lafferty term score summed over the top tokens of every topic.

    This score implements the method described in the 2009 paper
    Blei, David M., and John D. Lafferty. "Topic models." Text Mining.
    Chapman and Hall/CRC, 2009. 101-124.

    At the core this score helps to discover the tokens that are most likely
    to describe a given topic. Summing up that score helps to estimate how
    well the model distinguishes between topics. The higher the score,
    the better.
    """

    def __init__(
            self,
            name: str = None,
            num_top_tokens: int = 30,
            should_compute: Callable[[int], bool] = None):
        """
        Parameters
        ----------
        name:
            name of the score
        num_top_tokens : int
            how many of the highest-scoring tokens of each topic
            contribute to the final value
        should_compute
            passed unchanged to the ``BaseScore`` constructor
        """
        super().__init__(name=name, should_compute=should_compute)
        self.num_top_tokens = num_top_tokens

    def __repr__(self):
        return f'{self.__class__.__name__}(num_top_tokens={self.num_top_tokens})'

    def _compute_blei_scores(self, phi):
        """
        Computes Blei score
        phi[wt] * [log(phi[wt]) - 1/T sum_k log(phi[wk])]

        Parameters
        ----------
        phi : pd.DataFrame
            phi matrix of the model; columns are topics
            (the number of topics ``T`` is taken from ``phi.shape[1]``)

        Returns
        -------
        score : pd.DataFrame
            weighted phi matrix, same shape as `phi`
        """
        # A tiny offset keeps the logarithm finite where phi is exactly zero.
        blei_eps = 1e-42
        log_phi = np.log(np.asarray(phi) + blei_eps)
        # Mean of the logs over topics, i.e. 1/T * sum_k log(phi[wk]);
        # keepdims lets it broadcast against every topic column.
        mean_log_phi = log_phi.mean(axis=1, keepdims=True)
        return phi * (log_phi - mean_log_phi)

    def _sum_top_scores(self, phi):
        """
        Sums the `num_top_tokens` largest Blei scores of every topic.

        Parameters
        ----------
        phi : pd.DataFrame
            phi matrix (or its part related to a single modality)

        Returns
        -------
        float
            total of the selected scores over all topics
        """
        if self.num_top_tokens <= 0:
            # The slice `[-0:]` would select every row instead of none.
            return 0.0
        scores = np.asarray(self._compute_blei_scores(phi))
        # Sort each topic column on its own (axis=0): the default axis=-1
        # would sort across topics inside a token row, and the trailing rows
        # would then be arbitrary vocabulary entries, not the best tokens.
        sorted_scores = np.sort(scores, axis=0)
        return np.sum(sorted_scores[-self.num_top_tokens:, :])

    def call(self, model, **kwargs):
        """
        Computes the score for the given model.

        Parameters
        ----------
        model
            object providing the `class_ids` attribute
            and the `get_phi` method
        **kwargs
            accepted but not used here

        Returns
        -------
        float
            sum of the top-token Blei scores over all modalities
            listed in `model.class_ids`, or over the whole phi matrix
            when `model.class_ids` is None
        """
        class_ids = model.class_ids
        if class_ids is None:
            # No modality information: score the whole phi matrix at once.
            return self._sum_top_scores(model.get_phi())

        score = 0
        for modality in class_ids:
            score += self._sum_top_scores(model.get_phi(class_ids=modality))
        return score
Classes
class BleiLaffertyScore (name: str = None, num_top_tokens: int = 30, should_compute: Callable[[int], bool] = None)
-
This score implements the method described in the 2009 paper: Blei, David M., and John D. Lafferty. "Topic models." Text Mining. Chapman and Hall/CRC, 2009. 101-124. At the core, this score helps to discover the tokens that are most likely to describe a given topic. Summing up that score helps to estimate how well the model distinguishes between topics. The higher the score, the better.
Parameters
- name:
- name of the score
num_top_tokens
:int
- how many tokens we consider to be the top tokens
Expand source code
class BleiLaffertyScore(BaseScore):
    """
    Blei-Lafferty term score summed over the top tokens of every topic.

    This score implements the method described in the 2009 paper
    Blei, David M., and John D. Lafferty. "Topic models." Text Mining.
    Chapman and Hall/CRC, 2009. 101-124.

    At the core this score helps to discover the tokens that are most likely
    to describe a given topic. Summing up that score helps to estimate how
    well the model distinguishes between topics. The higher the score,
    the better.
    """

    def __init__(
            self,
            name: str = None,
            num_top_tokens: int = 30,
            should_compute: Callable[[int], bool] = None):
        """
        Parameters
        ----------
        name:
            name of the score
        num_top_tokens : int
            how many of the highest-scoring tokens of each topic
            contribute to the final value
        should_compute
            passed unchanged to the ``BaseScore`` constructor
        """
        super().__init__(name=name, should_compute=should_compute)
        self.num_top_tokens = num_top_tokens

    def __repr__(self):
        return f'{self.__class__.__name__}(num_top_tokens={self.num_top_tokens})'

    def _compute_blei_scores(self, phi):
        """
        Computes Blei score
        phi[wt] * [log(phi[wt]) - 1/T sum_k log(phi[wk])]

        Parameters
        ----------
        phi : pd.DataFrame
            phi matrix of the model; columns are topics
            (the number of topics ``T`` is taken from ``phi.shape[1]``)

        Returns
        -------
        score : pd.DataFrame
            weighted phi matrix, same shape as `phi`
        """
        # A tiny offset keeps the logarithm finite where phi is exactly zero.
        blei_eps = 1e-42
        log_phi = np.log(np.asarray(phi) + blei_eps)
        # Mean of the logs over topics, i.e. 1/T * sum_k log(phi[wk]);
        # keepdims lets it broadcast against every topic column.
        mean_log_phi = log_phi.mean(axis=1, keepdims=True)
        return phi * (log_phi - mean_log_phi)

    def _sum_top_scores(self, phi):
        """
        Sums the `num_top_tokens` largest Blei scores of every topic.

        Parameters
        ----------
        phi : pd.DataFrame
            phi matrix (or its part related to a single modality)

        Returns
        -------
        float
            total of the selected scores over all topics
        """
        if self.num_top_tokens <= 0:
            # The slice `[-0:]` would select every row instead of none.
            return 0.0
        scores = np.asarray(self._compute_blei_scores(phi))
        # Sort each topic column on its own (axis=0): the default axis=-1
        # would sort across topics inside a token row, and the trailing rows
        # would then be arbitrary vocabulary entries, not the best tokens.
        sorted_scores = np.sort(scores, axis=0)
        return np.sum(sorted_scores[-self.num_top_tokens:, :])

    def call(self, model, **kwargs):
        """
        Computes the score for the given model.

        Parameters
        ----------
        model
            object providing the `class_ids` attribute
            and the `get_phi` method
        **kwargs
            accepted but not used here

        Returns
        -------
        float
            sum of the top-token Blei scores over all modalities
            listed in `model.class_ids`, or over the whole phi matrix
            when `model.class_ids` is None
        """
        class_ids = model.class_ids
        if class_ids is None:
            # No modality information: score the whole phi matrix at once.
            return self._sum_top_scores(model.get_phi())

        score = 0
        for modality in class_ids:
            score += self._sum_top_scores(model.get_phi(class_ids=modality))
        return score
Ancestors
Inherited members