Module topicnet.viewers.document_cluster

Expand source code
import numpy as np
import colorlover as cl
import plotly.graph_objs as go
import sklearn.manifold as clusterization

from plotly.offline import plot, iplot
from .base_viewer import BaseViewer
from functools import partial


class DocumentClusterViewer(BaseViewer):
    """
    This viewer performs dimensionality reduction over document embeddings
    and draws the reduced points as a plotly scatter plot.
    """
    def __init__(self, model):
        """
        Parameters
        ----------

        model: TopicModel

        """
        super().__init__(model=model)

    def view(
            self,
            dataset,
            save_path,
            method='TSNE',
            to_html=True,
    ):
        """
        Reduce the document embeddings to two dimensions and plot them.

        The embeddings are the transposed output of the model's `transform`
        called on the dataset's batch vectorizer. Each point is coloured by
        the index of the largest value in its row (colorbar title 'Topics')
        and carries the row's index value as hover text.

        Parameters
        ----------
        dataset: Dataset
            must be an instance of BaseDataset
        save_path: str
            save path for the plot; only used when to_html is True,
            pass None to skip writing the file
        method: string
            any of the methods in sklearn.manifold
        to_html: Bool
            if True the plot is rendered to an html string (and saved),
            otherwise it is drawn inline with plotly's `iplot`

        Returns
        -------
        html_div: string
            an html string containing the plotly graph,
            returned if to_html is True
        reduced_data: an np.array of (num_docs, 2) dimensions
            reduced dimensions of the original document embeddings,
            returned if to_html is False

        Raises
        ------
        TypeError
            if dataset is not derived from BaseDataset
        AttributeError
            if sklearn.manifold has no attribute named `method`

        """
        from ..cooking_machine.dataset import BaseDataset
        if not isinstance(dataset, BaseDataset):
            raise TypeError('Parameter "dataset" should derive from BaseDataset')

        handler = getattr(clusterization, method)
        batch_vectorizer = dataset.get_batch_vectorizer()
        model_data = self._model.transform(batch_vectorizer=batch_vectorizer).T

        reduced_data = handler(n_components=2).fit_transform(model_data)
        # colour of a point = position of the largest value in its row
        labels = np.argmax(model_data.values, axis=1)
        base_scheme = cl.scales['12']['qual']['Paired']

        if to_html:
            drawing_handle = partial(plot, show_link=False, output_type='div')
        else:
            drawing_handle = partial(iplot, show_link=False)
            # inline drawing produces no html string, so nothing can be saved
            save_path = None

        scatter = go.Scatter(
            x=reduced_data[:, 0],
            y=reduced_data[:, 1],
            mode='markers',
            marker=dict(
                colorscale=base_scheme,
                size=4,
                opacity=0.6,
                colorbar=dict(title='Topics'),
            ),
            marker_color=labels,
            text=model_data.index,
        )
        html_div = drawing_handle([scatter])

        if save_path is not None:
            with open(save_path, 'w', encoding='utf-8') as f:
                f.write(html_div)

        if to_html:
            return html_div

        return reduced_data

    def viev_from_jupyter(
        self,
        dataset,
        method: str = 'TSNE',
        save_path: str = 'DocumentCluster_view.html',
        width: int = 800,
        height: int = 600,
        display_output: bool = True,
        give_html: bool = False,
    ):
        """
        Build the plot with `view`, save it as html and show it in a notebook.

        `view` is always called with ``to_html=True``, so the plot is
        written to `save_path`; the notebook output is an IFrame that
        points at that file.

        Parameters
        ----------
        dataset: Dataset
        method: string
            any of the methods in sklearn.manifold
        save_path: str
            save path for the plot; the file is required to be able
            to create the visualisation in the notebook
        width
            width of the plot in pixels
        height
            height of the plot in pixels
        display_output
            show the plot in the notebook
        give_html
            whether to return the html string (with javascript) that
            performs the visualisation

        Returns
        -------
        out_html: string
            an html string containing the plotly graph
            returned only if give_html is True (None otherwise)

        """
        if display_output:
            # IPython is needed only for displaying; importing it lazily lets
            # the html be generated outside a notebook, while importing it
            # before `view` still fails fast ahead of the costly reduction.
            from IPython.display import IFrame, display_html

        out_html = self.view(
            dataset=dataset,
            save_path=save_path,
            method=method,
            to_html=True,
        )
        if display_output:
            display_html(IFrame(save_path, width=width, height=height))

        if give_html:
            return out_html

        return None

    # Correctly spelled alias; `viev_from_jupyter` is kept so that
    # existing callers of the misspelled name keep working.
    view_from_jupyter = viev_from_jupyter

Classes

class DocumentClusterViewer (model)

This viewer performs dimensionality reduction over document embeddings

Parameters

model : TopicModel
 
Expand source code
class DocumentClusterViewer(BaseViewer):
    """
    This viewer performs dimensionality reduction over document embeddings
    and draws the reduced points as a plotly scatter plot.
    """
    def __init__(self, model):
        """
        Parameters
        ----------

        model: TopicModel

        """
        super().__init__(model=model)

    def view(
            self,
            dataset,
            save_path,
            method='TSNE',
            to_html=True,
    ):
        """
        Reduce the document embeddings to two dimensions and plot them.

        The embeddings are the transposed output of the model's `transform`
        called on the dataset's batch vectorizer. Each point is coloured by
        the index of the largest value in its row (colorbar title 'Topics')
        and carries the row's index value as hover text.

        Parameters
        ----------
        dataset: Dataset
            must be an instance of BaseDataset
        save_path: str
            save path for the plot; only used when to_html is True,
            pass None to skip writing the file
        method: string
            any of the methods in sklearn.manifold
        to_html: Bool
            if True the plot is rendered to an html string (and saved),
            otherwise it is drawn inline with plotly's `iplot`

        Returns
        -------
        html_div: string
            an html string containing the plotly graph,
            returned if to_html is True
        reduced_data: an np.array of (num_docs, 2) dimensions
            reduced dimensions of the original document embeddings,
            returned if to_html is False

        Raises
        ------
        TypeError
            if dataset is not derived from BaseDataset
        AttributeError
            if sklearn.manifold has no attribute named `method`

        """
        from ..cooking_machine.dataset import BaseDataset
        if not isinstance(dataset, BaseDataset):
            raise TypeError('Parameter "dataset" should derive from BaseDataset')

        handler = getattr(clusterization, method)
        batch_vectorizer = dataset.get_batch_vectorizer()
        model_data = self._model.transform(batch_vectorizer=batch_vectorizer).T

        reduced_data = handler(n_components=2).fit_transform(model_data)
        # colour of a point = position of the largest value in its row
        labels = np.argmax(model_data.values, axis=1)
        base_scheme = cl.scales['12']['qual']['Paired']

        if to_html:
            drawing_handle = partial(plot, show_link=False, output_type='div')
        else:
            drawing_handle = partial(iplot, show_link=False)
            # inline drawing produces no html string, so nothing can be saved
            save_path = None

        scatter = go.Scatter(
            x=reduced_data[:, 0],
            y=reduced_data[:, 1],
            mode='markers',
            marker=dict(
                colorscale=base_scheme,
                size=4,
                opacity=0.6,
                colorbar=dict(title='Topics'),
            ),
            marker_color=labels,
            text=model_data.index,
        )
        html_div = drawing_handle([scatter])

        if save_path is not None:
            with open(save_path, 'w', encoding='utf-8') as f:
                f.write(html_div)

        if to_html:
            return html_div

        return reduced_data

    def viev_from_jupyter(
        self,
        dataset,
        method: str = 'TSNE',
        save_path: str = 'DocumentCluster_view.html',
        width: int = 800,
        height: int = 600,
        display_output: bool = True,
        give_html: bool = False,
    ):
        """
        Build the plot with `view`, save it as html and show it in a notebook.

        `view` is always called with ``to_html=True``, so the plot is
        written to `save_path`; the notebook output is an IFrame that
        points at that file.

        Parameters
        ----------
        dataset: Dataset
        method: string
            any of the methods in sklearn.manifold
        save_path: str
            save path for the plot; the file is required to be able
            to create the visualisation in the notebook
        width
            width of the plot in pixels
        height
            height of the plot in pixels
        display_output
            show the plot in the notebook
        give_html
            whether to return the html string (with javascript) that
            performs the visualisation

        Returns
        -------
        out_html: string
            an html string containing the plotly graph
            returned only if give_html is True (None otherwise)

        """
        if display_output:
            # IPython is needed only for displaying; importing it lazily lets
            # the html be generated outside a notebook, while importing it
            # before `view` still fails fast ahead of the costly reduction.
            from IPython.display import IFrame, display_html

        out_html = self.view(
            dataset=dataset,
            save_path=save_path,
            method=method,
            to_html=True,
        )
        if display_output:
            display_html(IFrame(save_path, width=width, height=height))

        if give_html:
            return out_html

        return None

    # Correctly spelled alias; `viev_from_jupyter` is kept so that
    # existing callers of the misspelled name keep working.
    view_from_jupyter = viev_from_jupyter

Ancestors

Methods

def viev_from_jupyter(self, dataset, method: str = 'TSNE', save_path: str = 'DocumentCluster_view.html', width: int = 800, height: int = 600, display_output: bool = True, give_html: bool = False)

Parameters

dataset : Dataset
 
method : string
any of the methods in sklearn.manifold
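to_html : Bool
not accepted by this method (it is absent from the signature): the plot is always rendered and saved in html format, because view is called with to_html=True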
save_path : str
save path for the plot; the saved file is required to be able to create the visualisation
width
width of the plot in pixels
height
height of the plot in pixels
display_output
show the plot in the notebook
give_html
whether to return the html string (with javascript) that performs the visualisation

Returns

out_html : string
an html string containing the plotly graph returned only if give_html is True
Expand source code
def viev_from_jupyter(
    self,
    dataset,
    method: str = 'TSNE',
    save_path: str = 'DocumentCluster_view.html',
    width: int = 800,
    height: int = 600,
    display_output: bool = True,
    give_html: bool = False,
):
    """
    Build the plot with `view`, save it as html and show it in a notebook.

    `view` is always called with ``to_html=True``; the notebook output
    is an IFrame that points at `save_path`.

    Parameters
    ----------
    dataset: Dataset
    method: string
        any of the methods in sklearn.manifold
    save_path: str
        save path for the plot; the file is required to be able
        to create the visualisation in the notebook
    width
        width of the plot in pixels
    height
        height of the plot in pixels
    display_output
        show the plot in the notebook
    give_html
        whether to return the html string (with javascript) that
        performs the visualisation

    Returns
    -------
    out_html: string
        an html string containing the plotly graph
        returned only if give_html is True (None otherwise)

    """
    if display_output:
        # IPython is needed only for displaying; importing it lazily lets
        # the html be generated outside a notebook, while importing it
        # before `view` still fails fast ahead of the costly reduction.
        from IPython.display import IFrame, display_html

    out_html = self.view(
        dataset=dataset,
        save_path=save_path,
        method=method,
        to_html=True,
    )
    if display_output:
        display_html(IFrame(save_path, width=width, height=height))

    if give_html:
        return out_html

    return None
def view(self, dataset, save_path, method='TSNE', to_html=True)

Parameters

dataset : Dataset
 
save_path : str
save path for the plot
method : string
any of the methods in sklearn.manifold
to_html : Bool
if user wants the plot to be saved in html format

Returns

reduced_data : an np.array of (num_docs, dim) dimensions
reduced dimensions of the original document embeddings
html_div : string
an html string containing the plotly graph returned only if to_html is True
Expand source code
def view(
        self,
        dataset,
        save_path,
        method='TSNE',
        to_html=True,
):
    """
    Project the document embeddings onto a plane and draw a scatter plot.

    The embeddings are the transposed result of the model's `transform`
    on the dataset's batch vectorizer. Points are coloured by the
    position of the largest value in their row and labelled with the
    row's index value.

    Parameters
    ----------
    dataset: Dataset
        has to be a BaseDataset instance
    save_path: str
        where to store the html plot; ignored when to_html is False
    method: string
        name of any of the methods in sklearn.manifold
    to_html: Bool
        True to get (and save) the plot as an html string,
        False to draw it inline

    Returns
    -------
    html_div: string
        html string with the plotly graph, when to_html is True
    reduced_data: np.array
        two-column reduced document embeddings, when to_html is False

    """
    from ..cooking_machine.dataset import BaseDataset
    if not isinstance(dataset, BaseDataset):
        raise TypeError('Parameter "dataset" should derive from BaseDataset')

    reducer_cls = getattr(clusterization, method)
    embeddings = self._model.transform(
        batch_vectorizer=dataset.get_batch_vectorizer()
    ).T

    reducer = reducer_cls(n_components=2)
    reduced_data = reducer.fit_transform(embeddings)
    dominant = np.argmax(embeddings.values, axis=1)
    palette = cl.scales['12']['qual']['Paired']

    if to_html:
        render = partial(plot, show_link=False, output_type='div')
    else:
        # inline rendering gives no html, hence there is nothing to save
        render = partial(iplot, show_link=False)
        save_path = None

    marker_style = {
        'colorscale': palette,
        'size': 4,
        'opacity': 0.6,
        'colorbar': {'title': 'Topics'},
    }
    trace = go.Scatter(
        x=reduced_data[:, 0],
        y=reduced_data[:, 1],
        mode='markers',
        marker=marker_style,
        marker_color=dominant,
        text=embeddings.index,
    )
    html_div = render([trace])

    if save_path is not None:
        with open(save_path, 'w', encoding='utf-8') as out_file:
            out_file.write(html_div)

    return html_div if to_html else reduced_data