Module topicnet.viewers.top_similar_documents_viewer
Expand source code
import numpy as np
import warnings
from collections import defaultdict
from scipy.spatial.distance import cdist as sp_cdist
from typing import Callable
from .base_viewer import BaseViewer
from ..cooking_machine.dataset import BaseDataset
# If change, also modify docstring for view()
'jensenshannon', 'euclidean', 'cosine', 'correlation'
Some documents' IDs in dataset are the same: \
number of unique IDs and total number of documents not equal: "{0}" vs. "{1}". \
Need unique IDs in order to identify documents.\
Parameter "metric" should be "str" or "callable". \
The argument given is of type "{0}"\
Parameter "num_top_similar" should be "int". \
The argument given is of type "{0}"\
Parameter "keep_similar_by_words" should be "bool". \
The argument given is of type "{0}"\
Some words in Vowpal Wabbit text were skipped \
because they didn\'t have frequency after colon sign ":"\
Only "{0}" documents available{1}. \
This is smaller than the requested number of top similar documents "{2}". \
So display is going to contain all "{0}" documents, but sorted by distance\
Requested number of top similar documents "{0}" \
is bigger than total number of documents in the dataset "{1}"\
def prepare_doc_html_with_similarity(
num_digits: int = 3,
num_sentences_in_snippet: int = 4,
num_words: int = 15,
Prepares initial document and search results
html strings
document : pandas.DataFrame row
a row that contains columns raw_text
and index in string form
distance : float between 0 and 1
measure of how close found document to the
initial inquiry
amount of digits to visualize as document similarity
how many sentences to use for document snippet
number of document words before the line break in
the document snippet
doc_html : str
an html string with data about document
plus additional info for the output clarification
if distance > 0:
sim = str(1 - distance)[:2 + num_digits]
doc_title = f'<h3>{document.index.values[0]}   similarity: {sim}</h3>'
doc_title = f'<h3>Search document:   {document.index.values[0]}</h3>'
get_sentences = document['raw_text'].values[0].split('. ')[:num_sentences_in_snippet]
doc_snippet = '. '.join(get_sentences).split(' ')
doc_snippet[-1] += '.'
doc_snippet = ' '.join([
word + '<br />' if i % num_words + 1 == num_words
else word for i, word in enumerate(doc_snippet)
doc_html = f"<h3>{doc_title}</h3>{doc_snippet}<br><br />"
if distance == 0:
doc_html += '<h2>Search results:</h2>'
return doc_html
class TopSimilarDocumentsViewer(BaseViewer):
def __init__(self, model, dataset):
"""Viewer which uses topic model to find documents similar to given one
model : BaseModel
Topic model
dataset : BaseDataset
Dataset with information about documents
if not isinstance(dataset, BaseDataset):
raise TypeError('Parameter "dataset" should derive from BaseDataset')
self._dataset = dataset
self._theta = self.model.get_theta(dataset=self._dataset)
self._documents_ids = list(self._theta.columns)
if len(self._documents_ids) == 0:
warnings.warn('No documents in given dataset', UserWarning)
elif len(set(self._documents_ids)) != len(self._documents_ids):
len(set(self._documents_ids)), len(self._documents_ids)))
def view(self,
"""Shows documents similar to given one by distribution of topics
ID of the document in `dataset`
metric : str or callable
Distance measure which is to be used to measure how documents differ from each other
If str -- should be one of 'jensenshannon', 'euclidean', 'cosine', 'correlation' --
as in scipy.spatial.distance.cdist
If callable -- should map two vectors to numeric value
num_top_similar : int
How many top similar documents' IDs to show
keep_similar_by_words : bool
Whether or not to keep in the output those documents
that are similar to the given one by their constituent words and words' frequencies
tuple(list, list)
Top similar documents' IDs, and corresponding distances to given document
num_top_similar = min(num_top_similar, len(self._documents_ids))
document_index = self._documents_ids.index(document_id)
similar_documents_indices, distances = self._view(
documents_ids = [self._documents_ids[doc_index] for doc_index in similar_documents_indices]
return documents_ids, distances
def view_from_jupyter(
document_id: str,
metric: str or Callable = 'jensenshannon',
num_top_similar: int = 5,
num_digits: int = 3,
keep_similar_by_words: bool = True,
display_output: bool = True,
give_html: bool = False,):
Method for viewing documents similar to requested one
from jupyter notebook. Provides document titles and snippets of
first few sentences.
ID of the document in `dataset`
Distance measure which is to be used to measure how documents differ from each other
If str -- should be one of 'jensenshannon', 'euclidean', 'cosine', 'correlation' --
as in scipy.spatial.distance.cdist
If callable -- should map two vectors to numeric value
How many top similar documents' IDs to show
Whether or not to keep in the output those documents
that are similar to the given one by their constituent words and words' frequencies
whether to display the output at the end of the method run
return html string generated by the method
html string of the generated output
from IPython.display import display_html
from topicnet.cooking_machine.pretty_output import make_notebook_pretty
search_ids, search_distances = self.view(
topic_html = ''
search_ids = [document_id] + search_ids
search_distances = [0] + search_distances
for doc_id, distance in zip(search_ids, search_distances):
document = self._dataset.get_source_document(doc_id)
topic_html += prepare_doc_html_with_similarity(document, distance)
if display_output:
display_html(topic_html, raw=True)
if give_html:
return topic_html
def _view(self,
documents_indices = [i for i, _ in enumerate(self._documents_ids) if i != document_index]
distances = self._get_documents_distances(documents_indices, document_index, metric)
documents_indices, distances = \
documents_indices, distances)
if keep_similar_by_words or len(documents_indices) == 0:
documents_indices_to_exclude = []
documents_indices_to_exclude = \
documents_indices, document_index, num_top_similar)
if len(documents_indices) == len(documents_indices_to_exclude):
return self._empty_view
elif len(documents_indices) - len(documents_indices_to_exclude) < num_top_similar:
(' after throwing out documents similar just by words'
if not keep_similar_by_words else ''),
documents_indices, distances =\
documents_indices, distances, documents_indices_to_exclude)
similar_documents_indices = documents_indices[:num_top_similar]
similar_documents_distances = distances[:num_top_similar]
return similar_documents_indices, similar_documents_distances
def _sort_elements_by_corresponding_values(elements, values, ascending=True):
def unzip(zipped):
# Transforms [(a, A), (b, B), ...] to [a, b, ...], [A, B, ...]
return list(zip(*zipped))
elements_values = sorted(zip(elements, values), key=lambda kv: kv[1])
if not ascending:
elements_values = elements_values[::-1]
return unzip(elements_values)
def _filter_elements_and_values(elements, values, elements_to_exclude):
elements_to_exclude = set(elements_to_exclude)
indices_to_exclude = set([i for i, e in enumerate(elements) if e in elements_to_exclude])
result_elements = [e for i, e in enumerate(elements) if i not in indices_to_exclude]
result_values = [v for i, v in enumerate(values) if i not in indices_to_exclude]
assert len(result_elements) == len(result_values)
return result_elements, result_values
def _are_words_frequencies_similar(words_frequencies_a, words_frequencies_b):
# TODO: method seems very ... heuristic
# maybe need some research to find the best way to compare words frequencies
word_frequency_pairs_a = sorted(words_frequencies_a.items(), key=lambda kv: kv[1])
word_frequency_pairs_b = sorted(words_frequencies_b.items(), key=lambda kv: kv[1])
num_top_words_to_consider = 100
jaccard_coefficient = TopSimilarDocumentsViewer._get_jaccard_coefficient(
jaccard_coefficient_threshold_to_be_similar = 0.6
return jaccard_coefficient >= jaccard_coefficient_threshold_to_be_similar
def _get_jaccard_coefficient(word_frequency_pairs_a, word_frequency_pairs_b):
def get_values_sum(dictionary, default=0.0):
return sum(dictionary.values() or [default])
def get_normalized_values(key_value_pairs):
tiny = 1e-7
denominator = sum(kv[1] for kv in key_value_pairs) or tiny
return {k: v / denominator for k, v in key_value_pairs}
# May help in case documents differ in length significantly
frequencies_a = get_normalized_values(word_frequency_pairs_a)
frequencies_b = get_normalized_values(word_frequency_pairs_b)
words_a, words_b = set(frequencies_a), set(frequencies_b)
intersection = {
e: min(frequencies_a[e], frequencies_b[e])
for e in words_a & words_b
union = {
e: max(frequencies_a.get(e, 0), frequencies_b.get(e, 0))
for e in words_a | words_b
if len(union) == 0:
return 0.0
return get_values_sum(intersection) / get_values_sum(union)
def _extract_words_frequencies(vw_text):
# Just gather frequencies of words of all modalities
# TODO: use Dataset for this?
def is_modality_name(vw_word):
return vw_word.startswith('|')
words_frequencies = defaultdict(int)
has_words_with_undefined_frequencies = False
for vw_word in vw_text.split():
if is_modality_name(vw_word):
if ':' in vw_word:
word, frequency = vw_word.split(':')
if len(frequency) == 0:
has_words_with_undefined_frequencies = True
# to allow frequencies as float's but assure that now all are int-s
frequency = int(round(float(frequency)))
word = vw_word
frequency = 1
words_frequencies[word] += frequency
if has_words_with_undefined_frequencies:
return words_frequencies
def _empty_view(self):
empty_top_similar_documents_list = list()
empty_distances_list = list()
return empty_top_similar_documents_list, empty_distances_list
def _check_view_parameters_valid(
self, document_id, metric, num_top_similar, keep_similar_by_words):
if document_id not in self._documents_ids:
raise ValueError('No document with such id "{}" in dataset'.format(document_id))
if isinstance(metric, str):
elif callable(metric):
raise TypeError(ERROR_TYPE_METRIC.format(type(metric)))
if not isinstance(num_top_similar, int):
raise TypeError(ERROR_TYPE_NUM_TOP_SIMILAR.format(type(num_top_similar)))
elif num_top_similar < 0:
raise ValueError('Parameter "num_top_similar" should be greater than zero')
elif num_top_similar == 0:
return self._empty_view
elif num_top_similar > len(self._documents_ids):
num_top_similar, len(self._documents_ids)),
if not isinstance(keep_similar_by_words, bool):
raise TypeError(ERROR_TYPE_KEEP_SIMILAR_BY_WORDS.format(type(keep_similar_by_words)))
def _check_str_metric_valid(metric):
if metric not in METRICS_NAMES:
raise ValueError('Unknown metric name "{}", expected one of "{}"'.format(
metric, ' '.join(METRICS_NAMES)))
def _check_callable_metric_valid(metric):
metric(np.array([0]), np.array([0]))
except TypeError: # more or less arguments or they are of wrong type for operation
raise ValueError('Invalid "callable" metric')
def _get_documents_distances(
theta_submatrix = self._theta.iloc[:, documents_indices_to_measure_distance_from]
documents_vectors = theta_submatrix.T.values
assert documents_vectors.ndim == 2
theta_column = self._theta.iloc[:, document_index_to_measure_distance_to]
document_vector = theta_column.T.values
assert document_vector.ndim == 1
document_vector = document_vector.reshape(1, -1)
assert document_vector.ndim == 2
assert document_vector.shape[0] == 1
assert document_vector.shape[1] == documents_vectors.shape[1]
answer = sp_cdist(documents_vectors, document_vector, metric)
return answer.flatten()
def _get_documents_with_similar_words_frequencies_indices(
self, documents_indices, document_index_to_compare_with,
# Method is not going to find all similar documents
# It terminates when enough dissimilar documents are encountered
similar_documents_indices = []
num_encountered_dissimilar_documents = 0
words_frequencies_to_compare_with = \
for i, doc_index in enumerate(documents_indices):
if num_encountered_dissimilar_documents == num_dissimilar_documents_to_stop_searching:
if TopSimilarDocumentsViewer._are_words_frequencies_similar(
num_encountered_dissimilar_documents += 1
return similar_documents_indices
def _get_words_frequencies(self, document_index):
vw_text = self._get_vw_text(document_index)
return TopSimilarDocumentsViewer._extract_words_frequencies(vw_text)
def _get_vw_text(self, document_index):
dataset = self._dataset.get_dataset()
return dataset.iloc[document_index, dataset.columns.get_loc('vw_text')]
def _run_view(viewer, document_id, keep_similar_by_words=True):
'> similar_documents, distances = viewer.view('
', keep_similar_by_word=False' if not keep_similar_by_words else ''))
similar_documents, distances = viewer.view(
document_id=document_id, keep_similar_by_words=keep_similar_by_words)
print('similar_documents:', similar_documents)
print('distances:', ['{:.4f}'.format(d) for d in distances])
def _main():
print('Starting TopSimilarDocumentsViewer\'s view() demonstration!', end='\n\n')
import artm
import os
from cooking_machine.dataset import Dataset
from cooking_machine.models.topic_model import TopicModel
from viewers.top_similar_documents_viewer import TopSimilarDocumentsViewer
current_folder = os.path.dirname(os.path.abspath(__file__))
dataset = Dataset(os.path.join(current_folder, '../tests/test_data/test_dataset.csv'))
num_topics = 3
artm_model = artm.ARTM(
topic_names=['topic_{}'.format(i) for i in range(num_topics)],
model = TopicModel(artm_model)
num_iterations = 10
viewer = TopSimilarDocumentsViewer(
# One may look if in notebook
# artm_model.get_theta()
# dataset.get_dataset()
print('Documents\' ids:', viewer._documents_ids, end='\n\n')
_run_view(viewer, document_id="doc_2")
_run_view(viewer, document_id="doc_5")
_run_view(viewer, document_id="doc_8")
_run_view(viewer, document_id="doc_5", keep_similar_by_words=False)
# python -m viewers.top_similar_documents_viewer
if __name__ == '__main__':
def prepare_doc_html_with_similarity(document, distance, num_digits: int = 3, num_sentences_in_snippet: int = 4, num_words: int = 15)
Prepares the initial document and search results as HTML strings
:pandas.DataFrame row
- a row that contains columns raw_text and index in string form
:float between 0 and 1
- measure of how close found document to the initial inquiry
- amount of digits to visualize as document similarity
- how many sentences to use for document snippet
- number of document words before the line break in the document snippet
- an html string with data about document plus additional info for the output clarification
Expand source code
def prepare_doc_html_with_similarity(
        document,
        distance,
        num_digits: int = 3,
        num_sentences_in_snippet: int = 4,
        num_words: int = 15,
):
    """
    Prepares initial document and search results
    html strings

    Parameters
    ----------
    document : pandas.DataFrame row
        a row that contains columns raw_text
        and index in string form
    distance : float between 0 and 1
        measure of how close found document to the
        initial inquiry
    num_digits
        amount of digits after the decimal point
        to visualize as document similarity
    num_sentences_in_snippet
        how many sentences to use for document snippet
    num_words
        number of document words before the line break in
        the document snippet (values below 1 disable line breaks)

    Returns
    -------
    doc_html : str
        an html string with data about document
        plus additional info for the output clarification
    """
    doc_id = document.index.values[0]

    if distance > 0:
        # Fixed-point formatting instead of slicing str(1 - distance):
        # slicing misreports tiny values printed in scientific notation
        # (e.g. "1.0000e-05" would be shown as "1.000")
        similarity = f'{1 - distance:.{max(num_digits, 0)}f}'
        doc_title = f'<h3>{doc_id}   similarity: {similarity}</h3>'
    else:
        doc_title = f'<h3>Search document:   {doc_id}</h3>'

    # NOTE(review): raw_text is inserted into the markup as is (not HTML-escaped)
    sentences = document['raw_text'].values[0].split('. ')[:num_sentences_in_snippet]
    snippet_words = '. '.join(sentences).split(' ')

    # The split above eats the final period of a truncated snippet; restore it,
    # but do not double it when the text already ends with one
    if not snippet_words[-1].endswith('.'):
        snippet_words[-1] += '.'

    # Break the line after every num_words-th word
    doc_snippet = ' '.join(
        word + '<br />' if num_words > 0 and (i + 1) % num_words == 0 else word
        for i, word in enumerate(snippet_words)
    )

    # doc_title already carries its own <h3> tags, so it is not wrapped again
    doc_html = f'{doc_title}{doc_snippet}<br><br />'

    # Zero distance marks the search document itself: results follow after it
    if distance == 0:
        doc_html += '<h2>Search results:</h2>'

    return doc_html
class TopSimilarDocumentsViewer (model, dataset)
Viewer which uses topic model to find documents similar to given one
- Topic model
- Dataset with information about documents
Expand source code
class TopSimilarDocumentsViewer(BaseViewer):
    def __init__(self, model, dataset):
        """Viewer which uses topic model to find documents similar to given one

        Parameters
        ----------
        model : BaseModel
            Topic model
        dataset : BaseDataset
            Dataset with information about documents

        Raises
        ------
        TypeError
            If `dataset` is not an instance of BaseDataset
        ValueError
            If documents' IDs (theta columns) are not unique

        """
        super().__init__(model=model)

        if not isinstance(dataset, BaseDataset):
            raise TypeError('Parameter "dataset" should derive from BaseDataset')

        self._dataset = dataset
        self._theta = self.model.get_theta(dataset=self._dataset)

        # Position of an ID in this list is the "document index" used by private methods
        self._documents_ids = list(self._theta.columns)

        if len(self._documents_ids) == 0:
            warnings.warn('No documents in given dataset', UserWarning)
        elif len(set(self._documents_ids)) != len(self._documents_ids):
            raise ValueError(ERROR_DUPLICATE_DOCUMENTS_IDS.format(
                len(set(self._documents_ids)), len(self._documents_ids)))

    def view(self,
             document_id,
             metric='jensenshannon',
             num_top_similar=5,
             keep_similar_by_words=True):
        """Shows documents similar to given one by distribution of topics

        Parameters
        ----------
        document_id
            ID of the document in `dataset`
        metric : str or callable
            Distance measure which is to be used to measure how documents differ from each other
            If str -- should be one of 'jensenshannon', 'euclidean', 'cosine', 'correlation' --
                as in scipy.spatial.distance.cdist
            If callable -- should map two vectors to numeric value
        num_top_similar : int
            How many top similar documents' IDs to show
        keep_similar_by_words : bool
            Whether or not to keep in the output those documents
            that are similar to the given one by their constituent words and words' frequencies

        Returns
        -------
        tuple(list, list)
            Top similar documents' IDs, and corresponding distances to given document

        Raises
        ------
        ValueError
            If `document_id` is unknown, `metric` is invalid or `num_top_similar` is negative
        TypeError
            If some parameter is of a wrong type

        """
        self._check_view_parameters_valid(
            document_id=document_id,
            metric=metric,
            num_top_similar=num_top_similar,
            keep_similar_by_words=keep_similar_by_words)

        # Nothing requested -- no need to compute any distances
        if num_top_similar == 0:
            return self._empty_view

        num_top_similar = min(num_top_similar, len(self._documents_ids))
        document_index = self._documents_ids.index(document_id)

        similar_documents_indices, distances = self._view(
            document_index=document_index,
            metric=metric,
            num_top_similar=num_top_similar,
            keep_similar_by_words=keep_similar_by_words)

        documents_ids = [self._documents_ids[doc_index] for doc_index in similar_documents_indices]

        return documents_ids, distances

    def view_from_jupyter(
            self,
            document_id: str,
            metric: str or Callable = 'jensenshannon',
            num_top_similar: int = 5,
            num_digits: int = 3,
            keep_similar_by_words: bool = True,
            display_output: bool = True,
            give_html: bool = False,):
        """
        Method for viewing documents similar to requested one
        from jupyter notebook. Provides document titles and snippets of
        first few sentences.

        Parameters
        ----------
        document_id
            ID of the document in `dataset`
        metric
            Distance measure which is to be used to measure how documents differ from each other
            If str -- should be one of 'jensenshannon', 'euclidean', 'cosine', 'correlation' --
                as in scipy.spatial.distance.cdist
            If callable -- should map two vectors to numeric value
        num_top_similar
            How many top similar documents' IDs to show
        num_digits
            How many digits of the similarity value to show for each found document
        keep_similar_by_words
            Whether or not to keep in the output those documents
            that are similar to the given one by their constituent words and words' frequencies
        display_output
            whether to display the output at the end of the method run
        give_html
            whether to return the html string generated by the method

        Returns
        -------
        topic_html
            html string of the generated output (only if `give_html` is True)

        """
        # Imported here so that the module itself does not require IPython
        from IPython.display import display_html
        from topicnet.cooking_machine.pretty_output import make_notebook_pretty

        make_notebook_pretty()

        search_ids, search_distances = self.view(
            document_id=document_id,
            metric=metric,
            num_top_similar=num_top_similar,
            keep_similar_by_words=keep_similar_by_words,
        )

        # The requested document goes first, with zero distance to itself
        search_ids = [document_id] + list(search_ids)
        search_distances = [0] + list(search_distances)

        html_parts = []

        for doc_id, distance in zip(search_ids, search_distances):
            document = self._dataset.get_source_document(doc_id)
            html_parts.append(
                prepare_doc_html_with_similarity(document, distance, num_digits=num_digits)
            )

        topic_html = ''.join(html_parts)

        if display_output:
            display_html(topic_html, raw=True)

        if give_html:
            return topic_html

    def _view(self,
              document_index,
              metric,
              num_top_similar,
              keep_similar_by_words):
        """Finds indices of the closest documents and distances to them

        Returns a pair (documents' indices, distances), both sorted by distance ascending.
        """
        documents_indices = [i for i, _ in enumerate(self._documents_ids) if i != document_index]
        distances = self._get_documents_distances(documents_indices, document_index, metric)

        documents_indices, distances = \
            TopSimilarDocumentsViewer._sort_elements_by_corresponding_values(
                documents_indices, distances)

        if keep_similar_by_words or len(documents_indices) == 0:
            documents_indices_to_exclude = []
        else:
            documents_indices_to_exclude = \
                self._get_documents_with_similar_words_frequencies_indices(
                    documents_indices, document_index, num_top_similar)

        num_available_documents = len(documents_indices) - len(documents_indices_to_exclude)

        if num_available_documents == 0:
            return self._empty_view
        elif num_available_documents < num_top_similar:
            # Report how many documents are left, not how many were thrown out
            warnings.warn(
                WARNING_FEWER_THAN_REQUESTED.format(
                    num_available_documents,
                    (' after throwing out documents similar just by words'
                     if not keep_similar_by_words else ''),
                    num_top_similar),
                RuntimeWarning
            )

        documents_indices, distances = \
            TopSimilarDocumentsViewer._filter_elements_and_values(
                documents_indices, distances, documents_indices_to_exclude)

        similar_documents_indices = documents_indices[:num_top_similar]
        similar_documents_distances = distances[:num_top_similar]

        return similar_documents_indices, similar_documents_distances

    @staticmethod
    def _sort_elements_by_corresponding_values(elements, values, ascending=True):
        """Sorts `elements` by `values`, returns [sorted elements, sorted values] as tuples"""
        elements_values = sorted(zip(elements, values), key=lambda kv: kv[1])

        if not ascending:
            elements_values = elements_values[::-1]

        # zip(*[]) yields nothing, which callers could not unpack into two sequences
        if len(elements_values) == 0:
            return [(), ()]

        # Transforms [(a, A), (b, B), ...] to [a, b, ...], [A, B, ...]
        return list(zip(*elements_values))

    @staticmethod
    def _filter_elements_and_values(elements, values, elements_to_exclude):
        """Drops `elements_to_exclude` from `elements` together with corresponding `values`"""
        elements_to_exclude = set(elements_to_exclude)

        kept_pairs = [
            (e, v) for e, v in zip(elements, values)
            if e not in elements_to_exclude
        ]

        result_elements = [e for e, _ in kept_pairs]
        result_values = [v for _, v in kept_pairs]

        return result_elements, result_values

    @staticmethod
    def _are_words_frequencies_similar(words_frequencies_a, words_frequencies_b):
        """Tells if two {word: frequency} dicts are similar by their most frequent words"""
        # TODO: method seems very ... heuristic
        #  maybe need some research to find the best way to compare words frequencies

        # Most frequent words first, so that the slices below take the *top* words
        word_frequency_pairs_a = sorted(
            words_frequencies_a.items(), key=lambda kv: kv[1], reverse=True)
        word_frequency_pairs_b = sorted(
            words_frequencies_b.items(), key=lambda kv: kv[1], reverse=True)

        num_top_words_to_consider = 100

        jaccard_coefficient = TopSimilarDocumentsViewer._get_jaccard_coefficient(
            word_frequency_pairs_a[:num_top_words_to_consider],
            word_frequency_pairs_b[:num_top_words_to_consider])

        jaccard_coefficient_threshold_to_be_similar = 0.6

        return jaccard_coefficient >= jaccard_coefficient_threshold_to_be_similar

    @staticmethod
    def _get_jaccard_coefficient(word_frequency_pairs_a, word_frequency_pairs_b):
        """Computes weighted Jaccard coefficient for two lists of (word, frequency) pairs

        Frequencies are normalized to sum to one within each list, then
        sum of per-word minimums is divided by sum of per-word maximums.
        """
        def get_normalized_values(key_value_pairs):
            tiny = 1e-7
            denominator = sum(kv[1] for kv in key_value_pairs) or tiny

            return {k: v / denominator for k, v in key_value_pairs}

        # May help in case documents differ in length significantly
        frequencies_a = get_normalized_values(word_frequency_pairs_a)
        frequencies_b = get_normalized_values(word_frequency_pairs_b)

        words_a, words_b = set(frequencies_a), set(frequencies_b)

        intersection_sum = sum(
            min(frequencies_a[e], frequencies_b[e])
            for e in words_a & words_b
        )
        union_sum = sum(
            max(frequencies_a.get(e, 0), frequencies_b.get(e, 0))
            for e in words_a | words_b
        )

        # No words at all, or all frequencies are zero: nothing in common to measure
        if union_sum == 0:
            return 0.0

        return intersection_sum / union_sum

    @staticmethod
    def _extract_words_frequencies(vw_text):
        """Collects {word: frequency} from a document in Vowpal Wabbit format

        Tokens starting with "|" (modality names) are skipped.
        Token without colon counts as one occurrence.
        Tokens with nothing after the last colon are skipped with a warning.
        """
        # Just gather frequencies of words of all modalities
        # TODO: use Dataset for this?

        def is_modality_name(vw_word):
            return vw_word.startswith('|')

        words_frequencies = defaultdict(int)
        has_words_with_undefined_frequencies = False

        for vw_word in vw_text.split():
            if is_modality_name(vw_word):
                continue

            if ':' in vw_word:
                # Frequency goes after the last colon: the word itself may contain colons
                word, _, frequency = vw_word.rpartition(':')

                if len(frequency) == 0:
                    has_words_with_undefined_frequencies = True

                    continue

                # to allow frequencies as float's but assure that now all are int-s
                frequency = int(round(float(frequency)))
            else:
                word = vw_word
                frequency = 1

            words_frequencies[word] += frequency

        if has_words_with_undefined_frequencies:
            warnings.warn(WARNING_UNDEFINED_FREQUENCY_IN_VW, UserWarning)

        return words_frequencies

    @property
    def _empty_view(self):
        """Result for the case when there are no similar documents to show"""
        empty_top_similar_documents_list = list()
        empty_distances_list = list()

        return empty_top_similar_documents_list, empty_distances_list

    def _check_view_parameters_valid(
            self, document_id, metric, num_top_similar, keep_similar_by_words):
        """Raises ValueError or TypeError if parameters of view() are invalid"""
        if document_id not in self._documents_ids:
            raise ValueError('No document with such id "{}" in dataset'.format(document_id))

        if isinstance(metric, str):
            TopSimilarDocumentsViewer._check_str_metric_valid(metric)
        elif callable(metric):
            TopSimilarDocumentsViewer._check_callable_metric_valid(metric)
        else:
            raise TypeError(ERROR_TYPE_METRIC.format(type(metric)))

        if not isinstance(num_top_similar, int):
            raise TypeError(ERROR_TYPE_NUM_TOP_SIMILAR.format(type(num_top_similar)))
        elif num_top_similar < 0:
            raise ValueError('Parameter "num_top_similar" should be greater than zero')
        elif num_top_similar > len(self._documents_ids):
            warnings.warn(
                WARNING_TOO_MANY_REQUESTED.format(
                    num_top_similar, len(self._documents_ids)),
                UserWarning
            )

        # Checked for any num_top_similar, zero included
        if not isinstance(keep_similar_by_words, bool):
            raise TypeError(ERROR_TYPE_KEEP_SIMILAR_BY_WORDS.format(type(keep_similar_by_words)))

    @staticmethod
    def _check_str_metric_valid(metric):
        """Raises ValueError if `metric` is not one of METRICS_NAMES"""
        if metric not in METRICS_NAMES:
            raise ValueError('Unknown metric name "{}", expected one of "{}"'.format(
                metric, ' '.join(METRICS_NAMES)))

    @staticmethod
    def _check_callable_metric_valid(metric):
        """Raises ValueError if `metric` can not be called with two vectors"""
        try:
            metric(np.array([0]), np.array([0]))
        except TypeError as error:
            # more or less arguments or they are of wrong type for operation
            raise ValueError('Invalid "callable" metric') from error

    def _get_documents_distances(
            self,
            documents_indices_to_measure_distance_from,
            document_index_to_measure_distance_to,
            metric):
        """Measures distances between theta columns

        Returns one-dimensional array: one distance to the document
        `document_index_to_measure_distance_to` for each of
        `documents_indices_to_measure_distance_from`, in the same order.
        """
        theta_submatrix = self._theta.iloc[:, documents_indices_to_measure_distance_from]
        documents_vectors = theta_submatrix.T.values

        assert documents_vectors.ndim == 2

        theta_column = self._theta.iloc[:, document_index_to_measure_distance_to]
        document_vector = theta_column.T.values

        assert document_vector.ndim == 1

        # cdist works with two-dimensional inputs
        document_vector = document_vector.reshape(1, -1)

        assert document_vector.ndim == 2
        assert document_vector.shape[0] == 1
        assert document_vector.shape[1] == documents_vectors.shape[1]

        answer = sp_cdist(documents_vectors, document_vector, metric)

        return answer.flatten()

    def _get_documents_with_similar_words_frequencies_indices(
            self,
            documents_indices,
            document_index_to_compare_with,
            num_dissimilar_documents_to_stop_searching):
        """Finds documents which are similar to the given one by words' frequencies

        Returns those of `documents_indices` which were found similar.
        """
        # Method is not going to find all similar documents
        # It terminates when enough dissimilar documents are encountered

        similar_documents_indices = []
        num_encountered_dissimilar_documents = 0

        words_frequencies_to_compare_with = \
            self._get_words_frequencies(document_index_to_compare_with)

        for doc_index in documents_indices:
            if num_encountered_dissimilar_documents == num_dissimilar_documents_to_stop_searching:
                break

            # Frequencies should be looked up by the document's index,
            # not by its position in the (sorted by distance) list
            if TopSimilarDocumentsViewer._are_words_frequencies_similar(
                    self._get_words_frequencies(doc_index),
                    words_frequencies_to_compare_with):

                similar_documents_indices.append(doc_index)
            else:
                num_encountered_dissimilar_documents += 1

        return similar_documents_indices

    def _get_words_frequencies(self, document_index):
        """Returns {word: frequency} for the document with given index"""
        vw_text = self._get_vw_text(document_index)

        return TopSimilarDocumentsViewer._extract_words_frequencies(vw_text)

    def _get_vw_text(self, document_index):
        """Returns value of 'vw_text' column for the row with given position"""
        dataset = self._dataset.get_dataset()

        # NOTE(review): assumes rows of the dataset go in the same order
        #  as columns of theta -- confirm
        return dataset.iloc[document_index, dataset.columns.get_loc('vw_text')]
def view(self, document_id, metric='jensenshannon', num_top_similar=5, keep_similar_by_words=True)
Shows documents similar to given one by distribution of topics
- ID of the document in `dataset`
- Distance measure which is to be used to measure how documents differ from each other If str – should be one of 'jensenshannon', 'euclidean', 'cosine', 'correlation' – as in scipy.spatial.distance.cdist If callable – should map two vectors to numeric value
- How many top similar documents' IDs to show
- Whether or not to keep in the output those documents that are similar to the given one by their constituent words and words' frequencies
tuple(list, list)
- Top similar documents' IDs, and corresponding distances to given document
Expand source code
def view(self, document_id, metric='jensenshannon', num_top_similar=5, keep_similar_by_words=True):
    """Shows documents similar to given one by distribution of topics

    Parameters
    ----------
    document_id
        ID of the document in `dataset`
    metric : str or callable
        Distance measure which is to be used to measure how documents differ from each other
        If str -- should be one of 'jensenshannon', 'euclidean', 'cosine', 'correlation' --
            as in scipy.spatial.distance.cdist
        If callable -- should map two vectors to numeric value
    num_top_similar : int
        How many top similar documents' IDs to show
    keep_similar_by_words : bool
        Whether or not to keep in the output those documents
        that are similar to the given one by their constituent words and words' frequencies

    Returns
    -------
    tuple(list, list)
        Top similar documents' IDs, and corresponding distances to given document

    """
    # Validation goes first: it raises on invalid arguments
    self._check_view_parameters_valid(
        document_id=document_id,
        metric=metric,
        num_top_similar=num_top_similar,
        keep_similar_by_words=keep_similar_by_words,
    )

    known_ids = self._documents_ids

    # Can not show more documents than there are known IDs
    capped_num_top_similar = min(num_top_similar, len(known_ids))
    query_position = known_ids.index(document_id)

    found = self._view(
        document_index=query_position,
        metric=metric,
        num_top_similar=capped_num_top_similar,
        keep_similar_by_words=keep_similar_by_words,
    )
    neighbour_positions, distances = found

    # Private methods work with positions, callers expect documents' IDs
    neighbour_ids = [known_ids[position] for position in neighbour_positions]

    return neighbour_ids, distances
def view_from_jupyter(self, document_id: str, metric: str = 'jensenshannon', num_top_similar: int = 5, num_digits: int = 3, keep_similar_by_words: bool = True, display_output: bool = True, give_html: bool = False)
Method for viewing documents similar to requested one from jupyter notebook. Provides document titles and snippets of first few sentences.
- ID of the document in `dataset`
- Distance measure which is to be used to measure how documents differ from each other If str – should be one of 'jensenshannon', 'euclidean', 'cosine', 'correlation' – as in scipy.spatial.distance.cdist If callable – should map two vectors to numeric value
- How many top similar documents' IDs to show
- Whether or not to keep in the output those documents that are similar to the given one by their constituent words and words' frequencies
- whether to display the output at the end of the method run
- return html string generated by the method
- html string of the generated output
Expand source code
def view_from_jupyter(
        self,
        document_id: str,
        metric: str or Callable = 'jensenshannon',
        num_top_similar: int = 5,
        num_digits: int = 3,
        keep_similar_by_words: bool = True,
        display_output: bool = True,
        give_html: bool = False,):
    """
    Method for viewing documents similar to requested one
    from jupyter notebook. Provides document titles and snippets of
    first few sentences.

    Parameters
    ----------
    document_id
        ID of the document in `dataset`
    metric
        Distance measure which is to be used to measure how documents differ from each other
        If str -- should be one of 'jensenshannon', 'euclidean', 'cosine', 'correlation' --
            as in scipy.spatial.distance.cdist
        If callable -- should map two vectors to numeric value
    num_top_similar
        How many top similar documents' IDs to show
    num_digits
        How many digits of the similarity value to show for each found document
    keep_similar_by_words
        Whether or not to keep in the output those documents
        that are similar to the given one by their constituent words and words' frequencies
    display_output
        whether to display the output at the end of the method run
    give_html
        whether to return the html string generated by the method

    Returns
    -------
    topic_html
        html string of the generated output (only if `give_html` is True)

    """
    # Imported here so that IPython is needed only when the method is called
    from IPython.display import display_html
    from topicnet.cooking_machine.pretty_output import make_notebook_pretty

    make_notebook_pretty()

    search_ids, search_distances = self.view(
        document_id=document_id,
        metric=metric,
        num_top_similar=num_top_similar,
        keep_similar_by_words=keep_similar_by_words,
    )

    # The requested document goes first, with zero distance to itself
    search_ids = [document_id] + list(search_ids)
    search_distances = [0] + list(search_distances)

    html_parts = []

    for doc_id, distance in zip(search_ids, search_distances):
        document = self._dataset.get_source_document(doc_id)
        # num_digits used to be accepted but silently ignored
        html_parts.append(
            prepare_doc_html_with_similarity(document, distance, num_digits=num_digits)
        )

    topic_html = ''.join(html_parts)

    if display_output:
        display_html(topic_html, raw=True)

    if give_html:
        return topic_html