Module topicnet.viewers.top_similar_documents_viewer

Expand source code
import numpy as np
import warnings

from collections import defaultdict
from scipy.spatial.distance import cdist as sp_cdist
from typing import Callable

from .base_viewer import BaseViewer
from ..cooking_machine.dataset import BaseDataset


# Names of string metrics accepted by TopSimilarDocumentsViewer.view().
# If this list is changed, the docstrings of view() and view_from_jupyter()
# that enumerate the same names must be updated as well.
METRICS_NAMES = [
    'jensenshannon', 'euclidean', 'cosine', 'correlation'
]


# Raised from TopSimilarDocumentsViewer.__init__.
# {0} -- number of unique document IDs, {1} -- total number of documents.
ERROR_DUPLICATE_DOCUMENTS_IDS = """\
Some documents' IDs in dataset are the same: \
number of unique IDs and total number of documents not equal: "{0}" vs. "{1}". \
Need unique IDs in order to identify documents.\
"""

# {0} -- type of the rejected "metric" argument.
ERROR_TYPE_METRIC = """\
Parameter "metric" should be "str" or "callable". \
The argument given is of type "{0}"\
"""

# {0} -- type of the rejected "num_top_similar" argument.
ERROR_TYPE_NUM_TOP_SIMILAR = """\
Parameter "num_top_similar" should be "int". \
The argument given is of type "{0}"\
"""

# {0} -- type of the rejected "keep_similar_by_words" argument.
ERROR_TYPE_KEEP_SIMILAR_BY_WORDS = """\
Parameter "keep_similar_by_words" should be "bool". \
The argument given is of type "{0}"\
"""

# Emitted by _extract_words_frequencies when a token looks like "word:"
# (colon present, but nothing after it).
WARNING_UNDEFINED_FREQUENCY_IN_VW = """\
Some words in Vowpal Wabbit text were skipped \
because they didn\'t have frequency after colon sign ":"\
"""

# {0} -- number of documents available, {1} -- optional explanation suffix,
# {2} -- requested number of top similar documents.
WARNING_FEWER_THAN_REQUESTED = """\
Only "{0}" documents available{1}. \
This is smaller than the requested number of top similar documents "{2}". \
So display is going to contain all "{0}" documents, but sorted by distance\
"""

# {0} -- requested number of top similar documents,
# {1} -- total number of documents in the dataset.
WARNING_TOO_MANY_REQUESTED = """\
Requested number of top similar documents "{0}" \
is bigger than total number of documents in the dataset "{1}"\
"""


def prepare_doc_html_with_similarity(
    document,
    distance,
    num_digits: int = 3,
    num_sentences_in_snippet: int = 4,
    num_words: int = 15,
):
    """
    Prepares an HTML fragment for the search document
    or for one of the search results

    Parameters
    ----------
    document : pandas.DataFrame
        one-row frame: its first index value is used as the document title,
        and the first value of its column `raw_text` as the document text
    distance : float between 0 and 1
        measure of how close the found document is to the initial query.
        A positive value marks a search result (similarity `1 - distance`
        is shown), anything else marks the search document itself
    num_digits
        number of digits after the decimal point in the displayed similarity
    num_sentences_in_snippet
        how many sentences to use for the document snippet
        (sentences are separated by '. ')
    num_words
        number of document words before the line break in
        the document snippet

    Returns
    -------
    doc_html : str
        an html string with the title and the snippet of the document;
        for the search document (`distance == 0`) the header
        of the search results section is appended
    """
    document_id = document.index.values[0]

    if distance > 0:
        # Fixed-point formatting: slicing str(float) breaks on values
        # printed in exponent notation (e.g. "5e-05")
        similarity = f'{1 - distance:.{num_digits}f}'
        doc_title = f'<h3>{document_id} &emsp; similarity: {similarity}</h3>'
    else:
        doc_title = f'<h3>Search document: &emsp; {document_id}</h3>'

    sentences = document['raw_text'].values[0].split('. ')[:num_sentences_in_snippet]
    snippet_words = '. '.join(sentences).split(' ')

    # Splitting by '. ' strips the period of the last kept sentence,
    # unless the text itself ends right after that sentence
    if not snippet_words[-1].endswith('.'):
        snippet_words[-1] += '.'

    doc_snippet = ' '.join(
        word + '<br />' if i % num_words == num_words - 1 else word
        for i, word in enumerate(snippet_words)
    )

    # doc_title is already wrapped in <h3>, so it is not wrapped again
    doc_html = f'{doc_title}{doc_snippet}<br /><br />'

    if distance == 0:
        doc_html += '<h2>Search results:</h2>'

    return doc_html


class TopSimilarDocumentsViewer(BaseViewer):
    def __init__(self, model, dataset):
        """Viewer which uses topic model to find documents similar to given one

        Parameters
        ----------
        model : BaseModel
            Topic model
        dataset : BaseDataset
            Dataset with information about documents

        Raises
        ------
        TypeError
            If `dataset` is not an instance of BaseDataset
        ValueError
            If columns of the theta matrix (documents' IDs) are not unique
        """
        super().__init__(model=model)

        if not isinstance(dataset, BaseDataset):
            raise TypeError('Parameter "dataset" should derive from BaseDataset')

        self._dataset = dataset
        self._theta = self.model.get_theta(dataset=self._dataset)

        # Columns of theta serve as documents' IDs
        self._documents_ids = list(self._theta.columns)

        num_documents = len(self._documents_ids)
        num_unique_ids = len(set(self._documents_ids))

        if num_documents == 0:
            warnings.warn('No documents in given dataset', UserWarning)
            return

        if num_unique_ids != num_documents:
            raise ValueError(
                ERROR_DUPLICATE_DOCUMENTS_IDS.format(num_unique_ids, num_documents))

    def view(self,
             document_id,
             metric='jensenshannon',
             num_top_similar=5,
             keep_similar_by_words=True):
        """Shows documents similar to given one by distribution of topics

        Parameters
        ----------
        document_id
            ID of the document in `dataset`
        metric : str or callable
            Distance measure which is to be used to measure how documents differ from each other
            If str -- should be one of 'jensenshannon', 'euclidean', 'cosine', 'correlation' --
                as in scipy.spatial.distance.cdist
            If callable -- should map two vectors to numeric value
        num_top_similar : int
            How many top similar documents' IDs to show
        keep_similar_by_words : bool
            Whether or not to keep in the output those documents
            that are similar to the given one by their constituent words and words' frequencies

        Returns
        -------
        tuple(list, list)
            Top similar words, and corresponding distances to given document
        """
        self._check_view_parameters_valid(
            document_id=document_id,
            metric=metric,
            num_top_similar=num_top_similar,
            keep_similar_by_words=keep_similar_by_words)

        num_top_similar = min(num_top_similar, len(self._documents_ids))
        document_index = self._documents_ids.index(document_id)

        similar_documents_indices, distances = self._view(
            document_index=document_index,
            metric=metric,
            num_top_similar=num_top_similar,
            keep_similar_by_words=keep_similar_by_words)

        documents_ids = [self._documents_ids[doc_index] for doc_index in similar_documents_indices]

        return documents_ids, distances

    def view_from_jupyter(
            self,
            document_id: str,
            metric: str or Callable = 'jensenshannon',
            num_top_similar: int = 5,
            num_digits: int = 3,
            keep_similar_by_words: bool = True,
            display_output: bool = True,
            give_html: bool = False,):
        """
        Method for viewing documents similar to requested one
        from jupyter notebook. Provides document titles and snippets of
        first few sentences.

        Parameters
        ----------
        document_id
            ID of the document in `dataset`
        metric
            Distance measure which is to be used to measure how documents differ from each other
            If str -- should be one of 'jensenshannon', 'euclidean', 'cosine', 'correlation' --
                as in scipy.spatial.distance.cdist
            If callable -- should map two vectors to numeric value
        num_top_similar
            How many top similar documents' IDs to show
        num_digits
            How many digits to show in the similarity of each found document
        keep_similar_by_words
            Whether or not to keep in the output those documents
            that are similar to the given one by their constituent words and words' frequencies
        display_output
            Whether to display the generated html at the end of the method run
        give_html
            Whether to return the html string generated by the method

        Returns
        -------
        topic_html
            html string of the generated output if `give_html` is True, None otherwise
        """
        from IPython.display import display_html
        from topicnet.cooking_machine.pretty_output import make_notebook_pretty

        make_notebook_pretty()
        search_ids, search_distances = self.view(
            document_id=document_id,
            metric=metric,
            num_top_similar=num_top_similar,
            keep_similar_by_words=keep_similar_by_words,
        )

        # The requested document goes first, with zero distance:
        # this is how prepare_doc_html_with_similarity recognizes it
        documents_to_show = zip(
            [document_id, *search_ids],
            [0, *search_distances],
        )

        topic_html = ''.join(
            prepare_doc_html_with_similarity(
                self._dataset.get_source_document(doc_id),
                distance,
                num_digits=num_digits,
            )
            for doc_id, distance in documents_to_show
        )

        if display_output:
            display_html(topic_html, raw=True)

        if give_html:
            return topic_html

        return None

    def _view(self,
              document_index,
              metric,
              num_top_similar,
              keep_similar_by_words):

        documents_indices = [i for i, _ in enumerate(self._documents_ids) if i != document_index]
        distances = self._get_documents_distances(documents_indices, document_index, metric)

        documents_indices, distances = \
            TopSimilarDocumentsViewer._sort_elements_by_corresponding_values(
                documents_indices, distances)

        if keep_similar_by_words or len(documents_indices) == 0:
            documents_indices_to_exclude = []
        else:
            documents_indices_to_exclude = \
                self._get_documents_with_similar_words_frequencies_indices(
                    documents_indices, document_index, num_top_similar)

        if len(documents_indices) == len(documents_indices_to_exclude):
            return self._empty_view
        elif len(documents_indices) - len(documents_indices_to_exclude) < num_top_similar:
            warnings.warn(
                WARNING_FEWER_THAN_REQUESTED.format(
                    len(documents_indices_to_exclude),
                    (' after throwing out documents similar just by words'
                     if not keep_similar_by_words else ''),
                    num_top_similar),
                RuntimeWarning
            )

        documents_indices, distances =\
            TopSimilarDocumentsViewer._filter_elements_and_values(
                documents_indices, distances, documents_indices_to_exclude)

        similar_documents_indices = documents_indices[:num_top_similar]
        similar_documents_distances = distances[:num_top_similar]

        return similar_documents_indices, similar_documents_distances

    @staticmethod
    def _sort_elements_by_corresponding_values(elements, values, ascending=True):
        def unzip(zipped):
            # Transforms [(a, A), (b, B), ...] to [a, b, ...], [A, B, ...]
            return list(zip(*zipped))

        elements_values = sorted(zip(elements, values), key=lambda kv: kv[1])

        if not ascending:
            elements_values = elements_values[::-1]

        return unzip(elements_values)

    @staticmethod
    def _filter_elements_and_values(elements, values, elements_to_exclude):
        elements_to_exclude = set(elements_to_exclude)
        indices_to_exclude = set([i for i, e in enumerate(elements) if e in elements_to_exclude])

        result_elements = [e for i, e in enumerate(elements) if i not in indices_to_exclude]
        result_values = [v for i, v in enumerate(values) if i not in indices_to_exclude]

        assert len(result_elements) == len(result_values)

        return result_elements, result_values

    @staticmethod
    def _are_words_frequencies_similar(words_frequencies_a, words_frequencies_b):
        # TODO: method seems very ... heuristic
        # maybe need some research to find the best way to compare words frequencies
        word_frequency_pairs_a = sorted(words_frequencies_a.items(), key=lambda kv: kv[1])
        word_frequency_pairs_b = sorted(words_frequencies_b.items(), key=lambda kv: kv[1])

        num_top_words_to_consider = 100
        jaccard_coefficient = TopSimilarDocumentsViewer._get_jaccard_coefficient(
            word_frequency_pairs_a[:num_top_words_to_consider],
            word_frequency_pairs_b[:num_top_words_to_consider])
        jaccard_coefficient_threshold_to_be_similar = 0.6

        return jaccard_coefficient >= jaccard_coefficient_threshold_to_be_similar

    @staticmethod
    def _get_jaccard_coefficient(word_frequency_pairs_a, word_frequency_pairs_b):
        def get_values_sum(dictionary, default=0.0):
            return sum(dictionary.values() or [default])

        def get_normalized_values(key_value_pairs):
            tiny = 1e-7
            denominator = sum(kv[1] for kv in key_value_pairs) or tiny

            return {k: v / denominator for k, v in key_value_pairs}

        # May help in case documents differ in length significantly
        frequencies_a = get_normalized_values(word_frequency_pairs_a)
        frequencies_b = get_normalized_values(word_frequency_pairs_b)

        words_a, words_b = set(frequencies_a), set(frequencies_b)

        intersection = {
            e: min(frequencies_a[e], frequencies_b[e])
            for e in words_a & words_b
        }
        union = {
            e: max(frequencies_a.get(e, 0), frequencies_b.get(e, 0))
            for e in words_a | words_b
        }

        if len(union) == 0:
            return 0.0

        return get_values_sum(intersection) / get_values_sum(union)

    @staticmethod
    def _extract_words_frequencies(vw_text):
        # Just gather frequencies of words of all modalities
        # TODO: use Dataset for this?

        def is_modality_name(vw_word):
            return vw_word.startswith('|')

        words_frequencies = defaultdict(int)
        has_words_with_undefined_frequencies = False

        for vw_word in vw_text.split():
            if is_modality_name(vw_word):
                continue

            if ':' in vw_word:
                word, frequency = vw_word.split(':')

                if len(frequency) == 0:
                    has_words_with_undefined_frequencies = True
                    continue

                # to allow frequencies as float's but assure that now all are int-s
                frequency = int(round(float(frequency)))
            else:
                word = vw_word
                frequency = 1

            words_frequencies[word] += frequency

        if has_words_with_undefined_frequencies:
            warnings.warn(WARNING_UNDEFINED_FREQUENCY_IN_VW, UserWarning)

        return words_frequencies

    @property
    def _empty_view(self):
        empty_top_similar_documents_list = list()
        empty_distances_list = list()

        return empty_top_similar_documents_list, empty_distances_list

    def _check_view_parameters_valid(
            self, document_id, metric, num_top_similar, keep_similar_by_words):

        if document_id not in self._documents_ids:
            raise ValueError('No document with such id "{}" in dataset'.format(document_id))

        if isinstance(metric, str):
            TopSimilarDocumentsViewer._check_str_metric_valid(metric)
        elif callable(metric):
            TopSimilarDocumentsViewer._check_callable_metric_valid(metric)
        else:
            raise TypeError(ERROR_TYPE_METRIC.format(type(metric)))

        if not isinstance(num_top_similar, int):
            raise TypeError(ERROR_TYPE_NUM_TOP_SIMILAR.format(type(num_top_similar)))
        elif num_top_similar < 0:
            raise ValueError('Parameter "num_top_similar" should be greater than zero')
        elif num_top_similar == 0:
            return self._empty_view
        elif num_top_similar > len(self._documents_ids):
            warnings.warn(
                WARNING_TOO_MANY_REQUESTED.format(
                    num_top_similar, len(self._documents_ids)),
                UserWarning
            )

        if not isinstance(keep_similar_by_words, bool):
            raise TypeError(ERROR_TYPE_KEEP_SIMILAR_BY_WORDS.format(type(keep_similar_by_words)))

    @staticmethod
    def _check_str_metric_valid(metric):
        """Raises ValueError if `metric` is not one of METRICS_NAMES"""
        if metric in METRICS_NAMES:
            return

        known_names = ' '.join(METRICS_NAMES)

        raise ValueError(
            'Unknown metric name "{}", expected one of "{}"'.format(metric, known_names))

    @staticmethod
    def _check_callable_metric_valid(metric):
        try:
            metric(np.array([0]), np.array([0]))
        except TypeError:  # more or less arguments or they are of wrong type for operation
            raise ValueError('Invalid "callable" metric')

    def _get_documents_distances(
            self,
            documents_indices_to_measure_distance_from,
            document_index_to_measure_distance_to,
            metric):

        theta_submatrix = self._theta.iloc[:, documents_indices_to_measure_distance_from]
        documents_vectors = theta_submatrix.T.values

        assert documents_vectors.ndim == 2

        theta_column = self._theta.iloc[:, document_index_to_measure_distance_to]
        document_vector = theta_column.T.values

        assert document_vector.ndim == 1

        document_vector = document_vector.reshape(1, -1)

        assert document_vector.ndim == 2
        assert document_vector.shape[0] == 1
        assert document_vector.shape[1] == documents_vectors.shape[1]

        answer = sp_cdist(documents_vectors, document_vector, metric)

        return answer.flatten()

    def _get_documents_with_similar_words_frequencies_indices(
            self, documents_indices, document_index_to_compare_with,
            num_dissimilar_documents_to_stop_searching):

        # Method is not going to find all similar documents
        # It terminates when enough dissimilar documents are encountered

        similar_documents_indices = []
        num_encountered_dissimilar_documents = 0
        words_frequencies_to_compare_with = \
            self._get_words_frequencies(document_index_to_compare_with)

        for i, doc_index in enumerate(documents_indices):
            if num_encountered_dissimilar_documents == num_dissimilar_documents_to_stop_searching:
                break

            if TopSimilarDocumentsViewer._are_words_frequencies_similar(
                    self._get_words_frequencies(i),
                    words_frequencies_to_compare_with):
                similar_documents_indices.append(doc_index)
            else:
                num_encountered_dissimilar_documents += 1

        return similar_documents_indices

    def _get_words_frequencies(self, document_index):
        vw_text = self._get_vw_text(document_index)

        return TopSimilarDocumentsViewer._extract_words_frequencies(vw_text)

    def _get_vw_text(self, document_index):
        dataset = self._dataset.get_dataset()

        return dataset.iloc[document_index, dataset.columns.get_loc('vw_text')]


def _run_view(viewer, document_id, keep_similar_by_words=True):
    """Calls viewer.view() for the demonstration: prints the call and its results

    Parameters
    ----------
    viewer
        Object with method view(document_id=..., keep_similar_by_words=...)
    document_id
        ID of the document to find similar documents for
    keep_similar_by_words : bool
        Passed to viewer.view() as is
    """
    # The printed line mirrors the real call: the keyword is spelled exactly
    # as the parameter of view(), and the ID is shown as a Python literal
    print(
        '> similar_documents, distances = viewer.view('
        'document_id={!r}{})'.format(
            document_id,
            ', keep_similar_by_words=False' if not keep_similar_by_words else ''))

    similar_documents, distances = viewer.view(
        document_id=document_id, keep_similar_by_words=keep_similar_by_words)

    print('similar_documents:', similar_documents)
    print('distances:', ['{:.4f}'.format(d) for d in distances])
    print()


def _main():
    """Console demonstration of TopSimilarDocumentsViewer.view():
    fits a small topic model on the test dataset and prints
    similar documents for several documents
    """
    print("Starting TopSimilarDocumentsViewer's view() demonstration!", end='\n\n')

    import artm
    import os

    from cooking_machine.dataset import Dataset
    from cooking_machine.models.topic_model import TopicModel
    from viewers.top_similar_documents_viewer import TopSimilarDocumentsViewer

    this_folder = os.path.dirname(os.path.abspath(__file__))
    demo_dataset = Dataset(os.path.join(this_folder, '../tests/test_data/test_dataset.csv'))

    topic_names = [f'topic_{i}' for i in range(3)]
    artm_model = artm.ARTM(
        topic_names=topic_names,
        theta_columns_naming='id',
        show_progress_bars=False,
        cache_theta=True)
    artm_model.initialize(demo_dataset.get_dictionary())

    topic_model = TopicModel(artm_model)
    topic_model._fit(
        dataset_trainable=demo_dataset.get_batch_vectorizer(),
        num_iterations=10)

    viewer = TopSimilarDocumentsViewer(
        model=topic_model,
        dataset=demo_dataset)

    # One may look if in notebook
    # artm_model.get_theta()
    # demo_dataset.get_dataset()

    print("Documents' ids:", viewer._documents_ids, end='\n\n')

    # (document ID, keep_similar_by_words)
    demo_calls = [
        ("doc_2", True),
        ("doc_5", True),
        ("doc_8", True),
        ("doc_5", False),
    ]

    for document_id, keep_similar_by_words in demo_calls:
        _run_view(
            viewer,
            document_id=document_id,
            keep_similar_by_words=keep_similar_by_words)

# Runs the demonstration when the module is executed as a script, e.g.:
#     python -m viewers.top_similar_documents_viewer
if __name__ == '__main__':
    _main()

Functions

def prepare_doc_html_with_similarity(document, distance, num_digits: int = 3, num_sentences_in_snippet: int = 4, num_words: int = 15)

Prepares initial document and search results HTML strings

Parameters

document : pandas.DataFrame row
a row that contains columns raw_text and index in string form
distance : float between 0 and 1
measure of how close the found document is to the initial query
num_digits
amount of digits to visualize as document similarity
num_sentences_in_snippet
how many sentences to use for document snippet
num_words
number of document words before the line break in the document snippet

Returns

doc_html : str
an html string with data about document plus additional info for the output clarification
Expand source code
def prepare_doc_html_with_similarity(
    document,
    distance,
    num_digits: int = 3,
    num_sentences_in_snippet: int = 4,
    num_words: int = 15,
):
    """
    Prepares an HTML fragment for the search document
    or for one of the search results

    Parameters
    ----------
    document : pandas.DataFrame
        one-row frame: its first index value is used as the document title,
        and the first value of its column `raw_text` as the document text
    distance : float between 0 and 1
        measure of how close the found document is to the initial query.
        A positive value marks a search result (similarity `1 - distance`
        is shown), anything else marks the search document itself
    num_digits
        number of digits after the decimal point in the displayed similarity
    num_sentences_in_snippet
        how many sentences to use for the document snippet
        (sentences are separated by '. ')
    num_words
        number of document words before the line break in
        the document snippet

    Returns
    -------
    doc_html : str
        an html string with the title and the snippet of the document;
        for the search document (`distance == 0`) the header
        of the search results section is appended
    """
    document_id = document.index.values[0]

    if distance > 0:
        # Fixed-point formatting: slicing str(float) breaks on values
        # printed in exponent notation (e.g. "5e-05")
        similarity = f'{1 - distance:.{num_digits}f}'
        doc_title = f'<h3>{document_id} &emsp; similarity: {similarity}</h3>'
    else:
        doc_title = f'<h3>Search document: &emsp; {document_id}</h3>'

    sentences = document['raw_text'].values[0].split('. ')[:num_sentences_in_snippet]
    snippet_words = '. '.join(sentences).split(' ')

    # Splitting by '. ' strips the period of the last kept sentence,
    # unless the text itself ends right after that sentence
    if not snippet_words[-1].endswith('.'):
        snippet_words[-1] += '.'

    doc_snippet = ' '.join(
        word + '<br />' if i % num_words == num_words - 1 else word
        for i, word in enumerate(snippet_words)
    )

    # doc_title is already wrapped in <h3>, so it is not wrapped again
    doc_html = f'{doc_title}{doc_snippet}<br /><br />'

    if distance == 0:
        doc_html += '<h2>Search results:</h2>'

    return doc_html

Classes

class TopSimilarDocumentsViewer (model, dataset)

Viewer which uses topic model to find documents similar to given one

Parameters

model : BaseModel
Topic model
dataset : BaseDataset
Dataset with information about documents
Expand source code
class TopSimilarDocumentsViewer(BaseViewer):
    def __init__(self, model, dataset):
        """Viewer which uses topic model to find documents similar to given one

        Parameters
        ----------
        model : BaseModel
            Topic model
        dataset : BaseDataset
            Dataset with information about documents
        """
        super().__init__(model=model)

        if not isinstance(dataset, BaseDataset):
            raise TypeError('Parameter "dataset" should derive from BaseDataset')

        self._dataset = dataset
        self._theta = self.model.get_theta(dataset=self._dataset)

        self._documents_ids = list(self._theta.columns)

        if len(self._documents_ids) == 0:
            warnings.warn('No documents in given dataset', UserWarning)
        elif len(set(self._documents_ids)) != len(self._documents_ids):
            raise ValueError(ERROR_DUPLICATE_DOCUMENTS_IDS.format(
                len(set(self._documents_ids)), len(self._documents_ids)))

    def view(self,
             document_id,
             metric='jensenshannon',
             num_top_similar=5,
             keep_similar_by_words=True):
        """Shows documents similar to given one by distribution of topics

        Parameters
        ----------
        document_id
            ID of the document in `dataset`
        metric : str or callable
            Distance measure which is to be used to measure how documents differ from each other
            If str -- should be one of 'jensenshannon', 'euclidean', 'cosine', 'correlation' --
                as in scipy.spatial.distance.cdist
            If callable -- should map two vectors to numeric value
        num_top_similar : int
            How many top similar documents' IDs to show
        keep_similar_by_words : bool
            Whether or not to keep in the output those documents
            that are similar to the given one by their constituent words and words' frequencies

        Returns
        -------
        tuple(list, list)
            Top similar words, and corresponding distances to given document
        """
        self._check_view_parameters_valid(
            document_id=document_id,
            metric=metric,
            num_top_similar=num_top_similar,
            keep_similar_by_words=keep_similar_by_words)

        num_top_similar = min(num_top_similar, len(self._documents_ids))
        document_index = self._documents_ids.index(document_id)

        similar_documents_indices, distances = self._view(
            document_index=document_index,
            metric=metric,
            num_top_similar=num_top_similar,
            keep_similar_by_words=keep_similar_by_words)

        documents_ids = [self._documents_ids[doc_index] for doc_index in similar_documents_indices]

        return documents_ids, distances

    def view_from_jupyter(
            self,
            document_id: str,
            metric: str or Callable = 'jensenshannon',
            num_top_similar: int = 5,
            num_digits: int = 3,
            keep_similar_by_words: bool = True,
            display_output: bool = True,
            give_html: bool = False,):
        """
        Method for viewing documents similar to requested one
        from jupyter notebook. Provides document titles and snippets of
        first few sentences.

        Parameters
        ----------
        document_id
            ID of the document in `dataset`
        metric
            Distance measure which is to be used to measure how documents differ from each other
            If str -- should be one of 'jensenshannon', 'euclidean', 'cosine', 'correlation' --
                as in scipy.spatial.distance.cdist
            If callable -- should map two vectors to numeric value
        num_top_similar
            How many top similar documents' IDs to show
        keep_similar_by_words
            Whether or not to keep in the output those documents
            that are similar to the given one by their constituent words and words' frequencies
        display_output
            if provide output at the end of method run
        give_html
            return html string generated by the method

        Returns
        -------
        topic_html
            html string of the generated output
        """
        from IPython.display import display_html
        from topicnet.cooking_machine.pretty_output import make_notebook_pretty

        make_notebook_pretty()
        search_ids, search_distances = self.view(
            document_id=document_id,
            metric=metric,
            num_top_similar=num_top_similar,
            keep_similar_by_words=keep_similar_by_words,
        )

        topic_html = ''
        search_ids = [document_id] + search_ids
        search_distances = [0] + search_distances
        for doc_id, distance in zip(search_ids, search_distances):
            document = self._dataset.get_source_document(doc_id)
            topic_html += prepare_doc_html_with_similarity(document, distance)
        if display_output:
            display_html(topic_html, raw=True)

        if give_html:
            return topic_html

    def _view(self,
              document_index,
              metric,
              num_top_similar,
              keep_similar_by_words):

        documents_indices = [i for i, _ in enumerate(self._documents_ids) if i != document_index]
        distances = self._get_documents_distances(documents_indices, document_index, metric)

        documents_indices, distances = \
            TopSimilarDocumentsViewer._sort_elements_by_corresponding_values(
                documents_indices, distances)

        if keep_similar_by_words or len(documents_indices) == 0:
            documents_indices_to_exclude = []
        else:
            documents_indices_to_exclude = \
                self._get_documents_with_similar_words_frequencies_indices(
                    documents_indices, document_index, num_top_similar)

        if len(documents_indices) == len(documents_indices_to_exclude):
            return self._empty_view
        elif len(documents_indices) - len(documents_indices_to_exclude) < num_top_similar:
            warnings.warn(
                WARNING_FEWER_THAN_REQUESTED.format(
                    len(documents_indices_to_exclude),
                    (' after throwing out documents similar just by words'
                     if not keep_similar_by_words else ''),
                    num_top_similar),
                RuntimeWarning
            )

        documents_indices, distances =\
            TopSimilarDocumentsViewer._filter_elements_and_values(
                documents_indices, distances, documents_indices_to_exclude)

        similar_documents_indices = documents_indices[:num_top_similar]
        similar_documents_distances = distances[:num_top_similar]

        return similar_documents_indices, similar_documents_distances

    @staticmethod
    def _sort_elements_by_corresponding_values(elements, values, ascending=True):
        def unzip(zipped):
            # Transforms [(a, A), (b, B), ...] to [a, b, ...], [A, B, ...]
            return list(zip(*zipped))

        elements_values = sorted(zip(elements, values), key=lambda kv: kv[1])

        if not ascending:
            elements_values = elements_values[::-1]

        return unzip(elements_values)

    @staticmethod
    def _filter_elements_and_values(elements, values, elements_to_exclude):
        elements_to_exclude = set(elements_to_exclude)
        indices_to_exclude = set([i for i, e in enumerate(elements) if e in elements_to_exclude])

        result_elements = [e for i, e in enumerate(elements) if i not in indices_to_exclude]
        result_values = [v for i, v in enumerate(values) if i not in indices_to_exclude]

        assert len(result_elements) == len(result_values)

        return result_elements, result_values

    @staticmethod
    def _are_words_frequencies_similar(words_frequencies_a, words_frequencies_b):
        # TODO: method seems very ... heuristic
        # maybe need some research to find the best way to compare words frequencies
        word_frequency_pairs_a = sorted(words_frequencies_a.items(), key=lambda kv: kv[1])
        word_frequency_pairs_b = sorted(words_frequencies_b.items(), key=lambda kv: kv[1])

        num_top_words_to_consider = 100
        jaccard_coefficient = TopSimilarDocumentsViewer._get_jaccard_coefficient(
            word_frequency_pairs_a[:num_top_words_to_consider],
            word_frequency_pairs_b[:num_top_words_to_consider])
        jaccard_coefficient_threshold_to_be_similar = 0.6

        return jaccard_coefficient >= jaccard_coefficient_threshold_to_be_similar

    @staticmethod
    def _get_jaccard_coefficient(word_frequency_pairs_a, word_frequency_pairs_b):
        def get_values_sum(dictionary, default=0.0):
            return sum(dictionary.values() or [default])

        def get_normalized_values(key_value_pairs):
            tiny = 1e-7
            denominator = sum(kv[1] for kv in key_value_pairs) or tiny

            return {k: v / denominator for k, v in key_value_pairs}

        # May help in case documents differ in length significantly
        frequencies_a = get_normalized_values(word_frequency_pairs_a)
        frequencies_b = get_normalized_values(word_frequency_pairs_b)

        words_a, words_b = set(frequencies_a), set(frequencies_b)

        intersection = {
            e: min(frequencies_a[e], frequencies_b[e])
            for e in words_a & words_b
        }
        union = {
            e: max(frequencies_a.get(e, 0), frequencies_b.get(e, 0))
            for e in words_a | words_b
        }

        if len(union) == 0:
            return 0.0

        return get_values_sum(intersection) / get_values_sum(union)

    @staticmethod
    def _extract_words_frequencies(vw_text):
        # Just gather frequencies of words of all modalities
        # TODO: use Dataset for this?

        def is_modality_name(vw_word):
            return vw_word.startswith('|')

        words_frequencies = defaultdict(int)
        has_words_with_undefined_frequencies = False

        for vw_word in vw_text.split():
            if is_modality_name(vw_word):
                continue

            if ':' in vw_word:
                word, frequency = vw_word.split(':')

                if len(frequency) == 0:
                    has_words_with_undefined_frequencies = True
                    continue

                # to allow frequencies as float's but assure that now all are int-s
                frequency = int(round(float(frequency)))
            else:
                word = vw_word
                frequency = 1

            words_frequencies[word] += frequency

        if has_words_with_undefined_frequencies:
            warnings.warn(WARNING_UNDEFINED_FREQUENCY_IN_VW, UserWarning)

        return words_frequencies

    @property
    def _empty_view(self):
        empty_top_similar_documents_list = list()
        empty_distances_list = list()

        return empty_top_similar_documents_list, empty_distances_list

    def _check_view_parameters_valid(
            self, document_id, metric, num_top_similar, keep_similar_by_words):

        if document_id not in self._documents_ids:
            raise ValueError('No document with such id "{}" in dataset'.format(document_id))

        if isinstance(metric, str):
            TopSimilarDocumentsViewer._check_str_metric_valid(metric)
        elif callable(metric):
            TopSimilarDocumentsViewer._check_callable_metric_valid(metric)
        else:
            raise TypeError(ERROR_TYPE_METRIC.format(type(metric)))

        if not isinstance(num_top_similar, int):
            raise TypeError(ERROR_TYPE_NUM_TOP_SIMILAR.format(type(num_top_similar)))
        elif num_top_similar < 0:
            raise ValueError('Parameter "num_top_similar" should be greater than zero')
        elif num_top_similar == 0:
            return self._empty_view
        elif num_top_similar > len(self._documents_ids):
            warnings.warn(
                WARNING_TOO_MANY_REQUESTED.format(
                    num_top_similar, len(self._documents_ids)),
                UserWarning
            )

        if not isinstance(keep_similar_by_words, bool):
            raise TypeError(ERROR_TYPE_KEEP_SIMILAR_BY_WORDS.format(type(keep_similar_by_words)))

    @staticmethod
    def _check_str_metric_valid(metric):
        """Make sure that metric name is one of the known ones (`METRICS_NAMES`).

        Raises
        ------
        ValueError
            If the name is unknown
        """
        if metric in METRICS_NAMES:
            return

        known_names = ' '.join(METRICS_NAMES)

        raise ValueError(
            'Unknown metric name "{}", expected one of "{}"'.format(metric, known_names))

    @staticmethod
    def _check_callable_metric_valid(metric):
        try:
            metric(np.array([0]), np.array([0]))
        except TypeError:  # more or less arguments or they are of wrong type for operation
            raise ValueError('Invalid "callable" metric')

    def _get_documents_distances(
            self,
            documents_indices_to_measure_distance_from,
            document_index_to_measure_distance_to,
            metric):

        theta_submatrix = self._theta.iloc[:, documents_indices_to_measure_distance_from]
        documents_vectors = theta_submatrix.T.values

        assert documents_vectors.ndim == 2

        theta_column = self._theta.iloc[:, document_index_to_measure_distance_to]
        document_vector = theta_column.T.values

        assert document_vector.ndim == 1

        document_vector = document_vector.reshape(1, -1)

        assert document_vector.ndim == 2
        assert document_vector.shape[0] == 1
        assert document_vector.shape[1] == documents_vectors.shape[1]

        answer = sp_cdist(documents_vectors, document_vector, metric)

        return answer.flatten()

    def _get_documents_with_similar_words_frequencies_indices(
            self, documents_indices, document_index_to_compare_with,
            num_dissimilar_documents_to_stop_searching):

        # Method is not going to find all similar documents
        # It terminates when enough dissimilar documents are encountered

        similar_documents_indices = []
        num_encountered_dissimilar_documents = 0
        words_frequencies_to_compare_with = \
            self._get_words_frequencies(document_index_to_compare_with)

        for i, doc_index in enumerate(documents_indices):
            if num_encountered_dissimilar_documents == num_dissimilar_documents_to_stop_searching:
                break

            if TopSimilarDocumentsViewer._are_words_frequencies_similar(
                    self._get_words_frequencies(i),
                    words_frequencies_to_compare_with):
                similar_documents_indices.append(doc_index)
            else:
                num_encountered_dissimilar_documents += 1

        return similar_documents_indices

    def _get_words_frequencies(self, document_index):
        vw_text = self._get_vw_text(document_index)

        return TopSimilarDocumentsViewer._extract_words_frequencies(vw_text)

    def _get_vw_text(self, document_index):
        dataset = self._dataset.get_dataset()

        return dataset.iloc[document_index, dataset.columns.get_loc('vw_text')]

Ancestors

Methods

def view(self, document_id, metric='jensenshannon', num_top_similar=5, keep_similar_by_words=True)

Shows documents similar to given one by distribution of topics

Parameters

document_id
ID of the document in dataset
metric : str or callable
Distance measure which is used to measure how documents differ from each other. If str – should be one of 'jensenshannon', 'euclidean', 'cosine', 'correlation' – as in scipy.spatial.distance.cdist. If callable – should map two vectors to a numeric value.
num_top_similar : int
How many top similar documents' IDs to show
keep_similar_by_words : bool
Whether or not to keep in the output those documents that are similar to the given one by their constituent words and words' frequencies

Returns

tuple(list, list)
Top similar documents' IDs, and corresponding distances to given document
Expand source code
def view(self,
         document_id,
         metric='jensenshannon',
         num_top_similar=5,
         keep_similar_by_words=True):
    """Shows documents similar to given one by distribution of topics

    Parameters
    ----------
    document_id
        ID of the document in `dataset`
    metric : str or callable
        Distance measure which is to be used to measure how documents differ from each other
        If str -- should be one of 'jensenshannon', 'euclidean', 'cosine', 'correlation' --
            as in scipy.spatial.distance.cdist
        If callable -- should map two vectors to numeric value
    num_top_similar : int
        How many top similar documents' IDs to show
    keep_similar_by_words : bool
        Whether or not to keep in the output those documents
        that are similar to the given one by their constituent words and words' frequencies

    Returns
    -------
    tuple(list, list)
        Top similar documents' IDs, and corresponding distances to given document.
        Both lists are empty if `num_top_similar` is zero
    """
    self._check_view_parameters_valid(
        document_id=document_id,
        metric=metric,
        num_top_similar=num_top_similar,
        keep_similar_by_words=keep_similar_by_words)

    # Nothing is requested, so there is no need to measure any distances
    if num_top_similar == 0:
        return self._empty_view

    # There can't be more similar documents than documents in total
    num_top_similar = min(num_top_similar, len(self._documents_ids))
    document_index = self._documents_ids.index(document_id)

    similar_documents_indices, distances = self._view(
        document_index=document_index,
        metric=metric,
        num_top_similar=num_top_similar,
        keep_similar_by_words=keep_similar_by_words)

    # Positions in the list of documents are turned back into documents' IDs
    documents_ids = [self._documents_ids[doc_index] for doc_index in similar_documents_indices]

    return documents_ids, distances
def view_from_jupyter(self, document_id: str, metric: str = 'jensenshannon', num_top_similar: int = 5, num_digits: int = 3, keep_similar_by_words: bool = True, display_output: bool = True, give_html: bool = False)

Method for viewing documents similar to requested one from jupyter notebook. Provides document titles and snippets of first few sentences.

Parameters

document_id
ID of the document in dataset
metric
Distance measure which is used to measure how documents differ from each other. If str – should be one of 'jensenshannon', 'euclidean', 'cosine', 'correlation' – as in scipy.spatial.distance.cdist. If callable – should map two vectors to a numeric value.
num_top_similar
How many top similar documents' IDs to show
keep_similar_by_words
Whether or not to keep in the output those documents that are similar to the given one by their constituent words and words' frequencies
display_output
Whether to display the output at the end of the method run
give_html
Whether to return the html string generated by the method

Returns

topic_html
html string of the generated output
Expand source code
def view_from_jupyter(
        self,
        document_id: str,
        metric: str or Callable = 'jensenshannon',
        num_top_similar: int = 5,
        num_digits: int = 3,
        keep_similar_by_words: bool = True,
        display_output: bool = True,
        give_html: bool = False,):
    """
    Method for viewing documents similar to requested one
    from jupyter notebook. Provides document titles and snippets of
    first few sentences.

    Parameters
    ----------
    document_id
        ID of the document in `dataset`
    metric
        Distance measure which is to be used to measure how documents differ from each other
        If str -- should be one of 'jensenshannon', 'euclidean', 'cosine', 'correlation' --
            as in scipy.spatial.distance.cdist
        If callable -- should map two vectors to numeric value
    num_top_similar
        How many top similar documents' IDs to show
    num_digits
        Passed as `num_digits` to `prepare_doc_html_with_similarity`
        (presumably the number of digits the distance is displayed with)
    keep_similar_by_words
        Whether or not to keep in the output those documents
        that are similar to the given one by their constituent words and words' frequencies
    display_output
        if provide output at the end of method run
    give_html
        return html string generated by the method

    Returns
    -------
    topic_html
        html string of the generated output
        (returned only if `give_html` is True, otherwise None is returned)
    """
    # Imported here and not at module level,
    # so IPython is required only when this method is actually called
    from IPython.display import display_html
    from topicnet.cooking_machine.pretty_output import make_notebook_pretty

    make_notebook_pretty()
    search_ids, search_distances = self.view(
        document_id=document_id,
        metric=metric,
        num_top_similar=num_top_similar,
        keep_similar_by_words=keep_similar_by_words,
    )

    # The requested document goes first, at zero distance from itself.
    # list() guarantees that "+" concatenates rather than adds element-wise
    # in case an array-like is ever returned by view()
    search_ids = [document_id] + list(search_ids)
    search_distances = [0] + list(search_distances)

    topic_html = ''.join(
        prepare_doc_html_with_similarity(
            self._dataset.get_source_document(doc_id),
            distance,
            num_digits=num_digits,  # was accepted by the method but silently ignored
        )
        for doc_id, distance in zip(search_ids, search_distances)
    )

    if display_output:
        display_html(topic_html, raw=True)

    if give_html:
        return topic_html

    return None