Module topicnet.viewers.document_cluster
Expand source code
import numpy as np
import colorlover as cl
import plotly.graph_objs as go
import sklearn.manifold as clusterization
from plotly.offline import plot, iplot
from .base_viewer import BaseViewer
from functools import partial
class DocumentClusterViewer(BaseViewer):
"""
This viewer performs dimesionality reduction over document embeddings
"""
def __init__(self, model):
"""
Parameters
----------
model: TopicModel
"""
super().__init__(model=model)
def view(
self,
dataset,
save_path,
method='TSNE',
to_html=True,
):
"""
Parameters
----------
dataset: Dataset
save_path: str
save path for the plot
method: string
any of the methods in sklearn.manifold
to_html: Bool
if user wants the plot to be saved in html format
Returns
-------
reduced_data: an np.array of (num_docs, dim) dimensions
reduced dumensions of the original document embeddings
html_div: string
an html string containing the plotly graph
returned only if to_html is True
"""
from ..cooking_machine.dataset import BaseDataset
if not isinstance(dataset, BaseDataset):
raise TypeError('Parameter "dataset" should derive from BaseDataset')
handler = getattr(clusterization, method,)
bv = dataset.get_batch_vectorizer()
model_data = self._model.transform(batch_vectorizer=bv).T
reduced_data = handler(n_components=2).fit_transform(model_data)
data_dict = {}
data_dict['x'] = reduced_data[:, 0]
data_dict['y'] = reduced_data[:, 1]
data_dict['label'] = np.argmax(model_data.values, axis=1)
data_dict['text'] = model_data.index
base_scheme = cl.scales['12']['qual']['Paired']
if not to_html:
drawing_handle = partial(iplot, show_link=False,)
save_path = None
else:
drawing_handle = partial(plot, show_link=False, output_type='div')
html_div = drawing_handle(
[go.Scatter(
x=data_dict['x'],
y=data_dict['y'],
mode='markers',
marker=dict(colorscale=base_scheme,
size=4,
opacity=0.6,
colorbar=dict(title='Topics')),
marker_color=data_dict['label'],
text=data_dict['text'],)],
)
if save_path is not None:
with open(save_path, 'w', encoding='utf-8') as f:
f.write(html_div)
if to_html:
return html_div
return reduced_data
def viev_from_jupyter(
self,
dataset,
method: str = 'TSNE',
save_path: str = 'DocumentCluster_view.html',
width: int = 800,
height: int = 600,
display_output: bool = True,
give_html: bool = False,
):
"""
Parameters
----------
dataset: Dataset
method: string
any of the methods in sklearn.manifold
to_html: Bool
if user wants the plot to be saved in html format
save_path: str
save path for the plot requires to be able to create
the visualisation
width
width of the plot in pixels
height
height of the plot in pixels
display_output
show the plot in the notebook
give_html
if return the html string (with javascript) that
performs the visualisation
Returns
-------
out_html: string
an html string containing the plotly graph
returned only if give_html is True
"""
from IPython.display import IFrame, display_html
out_html = self.view(
dataset=dataset,
save_path=save_path,
method=method,
to_html=True,
)
if display_output:
display_html(IFrame(save_path, width=width, height=height))
if give_html:
return out_html
Classes
class DocumentClusterViewer (model)
-
This viewer performs dimesionality reduction over document embeddings
Parameters
model
:TopicModel
Expand source code
class DocumentClusterViewer(BaseViewer): """ This viewer performs dimesionality reduction over document embeddings """ def __init__(self, model): """ Parameters ---------- model: TopicModel """ super().__init__(model=model) def view( self, dataset, save_path, method='TSNE', to_html=True, ): """ Parameters ---------- dataset: Dataset save_path: str save path for the plot method: string any of the methods in sklearn.manifold to_html: Bool if user wants the plot to be saved in html format Returns ------- reduced_data: an np.array of (num_docs, dim) dimensions reduced dumensions of the original document embeddings html_div: string an html string containing the plotly graph returned only if to_html is True """ from ..cooking_machine.dataset import BaseDataset if not isinstance(dataset, BaseDataset): raise TypeError('Parameter "dataset" should derive from BaseDataset') handler = getattr(clusterization, method,) bv = dataset.get_batch_vectorizer() model_data = self._model.transform(batch_vectorizer=bv).T reduced_data = handler(n_components=2).fit_transform(model_data) data_dict = {} data_dict['x'] = reduced_data[:, 0] data_dict['y'] = reduced_data[:, 1] data_dict['label'] = np.argmax(model_data.values, axis=1) data_dict['text'] = model_data.index base_scheme = cl.scales['12']['qual']['Paired'] if not to_html: drawing_handle = partial(iplot, show_link=False,) save_path = None else: drawing_handle = partial(plot, show_link=False, output_type='div') html_div = drawing_handle( [go.Scatter( x=data_dict['x'], y=data_dict['y'], mode='markers', marker=dict(colorscale=base_scheme, size=4, opacity=0.6, colorbar=dict(title='Topics')), marker_color=data_dict['label'], text=data_dict['text'],)], ) if save_path is not None: with open(save_path, 'w', encoding='utf-8') as f: f.write(html_div) if to_html: return html_div return reduced_data def viev_from_jupyter( self, dataset, method: str = 'TSNE', save_path: str = 'DocumentCluster_view.html', width: int = 800, height: int = 600, display_output: bool = True, give_html: bool = False, ): """ Parameters ---------- dataset: Dataset method: string any of the methods in sklearn.manifold to_html: Bool if user wants the plot to be saved in html format save_path: str save path for the plot requires to be able to create the visualisation width width of the plot in pixels height height of the plot in pixels display_output show the plot in the notebook give_html if return the html string (with javascript) that performs the visualisation Returns ------- out_html: string an html string containing the plotly graph returned only if give_html is True """ from IPython.display import IFrame, display_html out_html = self.view( dataset=dataset, save_path=save_path, method=method, to_html=True, ) if display_output: display_html(IFrame(save_path, width=width, height=height)) if give_html: return out_html
Ancestors
Methods
def viev_from_jupyter(self, dataset, method: str = 'TSNE', save_path: str = 'DocumentCluster_view.html', width: int = 800, height: int = 600, display_output: bool = True, give_html: bool = False)
-
Parameters
dataset
:Dataset
method
:string
- any of the methods in sklearn.manifold
to_html
:Bool
- if user wants the plot to be saved in html format
save_path
:str
- save path for the plot requires to be able to create the visualisation
width
- width of the plot in pixels
height
- height of the plot in pixels
display_output
- show the plot in the notebook
give_html
- if return the html string (with javascript) that performs the visualisation
Returns
out_html
:string
- an html string containing the plotly graph returned only if give_html is True
Expand source code
def viev_from_jupyter( self, dataset, method: str = 'TSNE', save_path: str = 'DocumentCluster_view.html', width: int = 800, height: int = 600, display_output: bool = True, give_html: bool = False, ): """ Parameters ---------- dataset: Dataset method: string any of the methods in sklearn.manifold to_html: Bool if user wants the plot to be saved in html format save_path: str save path for the plot requires to be able to create the visualisation width width of the plot in pixels height height of the plot in pixels display_output show the plot in the notebook give_html if return the html string (with javascript) that performs the visualisation Returns ------- out_html: string an html string containing the plotly graph returned only if give_html is True """ from IPython.display import IFrame, display_html out_html = self.view( dataset=dataset, save_path=save_path, method=method, to_html=True, ) if display_output: display_html(IFrame(save_path, width=width, height=height)) if give_html: return out_html
def view(self, dataset, save_path, method='TSNE', to_html=True)
-
Parameters
dataset
:Dataset
save_path
:str
- save path for the plot
method
:string
- any of the methods in sklearn.manifold
to_html
:Bool
- if user wants the plot to be saved in html format
Returns
reduced_data
:an np.array
of(num_docs, dim) dimensions
- reduced dumensions of the original document embeddings
html_div
:string
- an html string containing the plotly graph returned only if to_html is True
Expand source code
def view( self, dataset, save_path, method='TSNE', to_html=True, ): """ Parameters ---------- dataset: Dataset save_path: str save path for the plot method: string any of the methods in sklearn.manifold to_html: Bool if user wants the plot to be saved in html format Returns ------- reduced_data: an np.array of (num_docs, dim) dimensions reduced dumensions of the original document embeddings html_div: string an html string containing the plotly graph returned only if to_html is True """ from ..cooking_machine.dataset import BaseDataset if not isinstance(dataset, BaseDataset): raise TypeError('Parameter "dataset" should derive from BaseDataset') handler = getattr(clusterization, method,) bv = dataset.get_batch_vectorizer() model_data = self._model.transform(batch_vectorizer=bv).T reduced_data = handler(n_components=2).fit_transform(model_data) data_dict = {} data_dict['x'] = reduced_data[:, 0] data_dict['y'] = reduced_data[:, 1] data_dict['label'] = np.argmax(model_data.values, axis=1) data_dict['text'] = model_data.index base_scheme = cl.scales['12']['qual']['Paired'] if not to_html: drawing_handle = partial(iplot, show_link=False,) save_path = None else: drawing_handle = partial(plot, show_link=False, output_type='div') html_div = drawing_handle( [go.Scatter( x=data_dict['x'], y=data_dict['y'], mode='markers', marker=dict(colorscale=base_scheme, size=4, opacity=0.6, colorbar=dict(title='Topics')), marker_color=data_dict['label'], text=data_dict['text'],)], ) if save_path is not None: with open(save_path, 'w', encoding='utf-8') as f: f.write(html_div) if to_html: return html_div return reduced_data