Skip to content
Snippets Groups Projects
Representation Learning.ipynb 336 KiB
Newer Older
       "    ws.close = function () {\n",
       "        comm.close();\n",
       "    };\n",
       "    ws.send = function (m) {\n",
       "        //console.log('sending', m);\n",
       "        comm.send(m);\n",
       "    };\n",
       "    // Register the callback with on_msg.\n",
       "    comm.on_msg(function (msg) {\n",
       "        //console.log('receiving', msg['content']['data'], msg);\n",
       "        // Pass the mpl event to the overridden (by mpl) onmessage function.\n",
       "        ws.onmessage(msg['content']['data']);\n",
       "    });\n",
       "    return ws;\n",
       "};\n",
       "\n",
       "mpl.mpl_figure_comm = function (comm, msg) {\n",
       "    // This is the function which gets called when the mpl process\n",
       "    // starts-up an IPython Comm through the \"matplotlib\" channel.\n",
       "\n",
       "    var id = msg.content.data.id;\n",
       "    // Get hold of the div created by the display call when the Comm\n",
       "    // socket was opened in Python.\n",
       "    var element = document.getElementById(id);\n",
       "    var ws_proxy = comm_websocket_adapter(comm);\n",
       "\n",
       "    function ondownload(figure, _format) {\n",
       "        window.open(figure.canvas.toDataURL());\n",
       "    }\n",
       "\n",
       "    var fig = new mpl.figure(id, ws_proxy, ondownload, element);\n",
       "\n",
       "    // Call onopen now - mpl needs it, as it is assuming we've passed it a real\n",
       "    // web socket which is closed, not our websocket->open comm proxy.\n",
       "    ws_proxy.onopen();\n",
       "\n",
       "    fig.parent_element = element;\n",
       "    fig.cell_info = mpl.find_output_cell(\"<div id='\" + id + \"'></div>\");\n",
       "    if (!fig.cell_info) {\n",
       "        console.error('Failed to find cell for figure', id, fig);\n",
       "        return;\n",
       "    }\n",
       "    fig.cell_info[0].output_area.element.on(\n",
       "        'cleared',\n",
       "        { fig: fig },\n",
       "        fig._remove_fig_handler\n",
       "    );\n",
       "};\n",
       "\n",
       "mpl.figure.prototype.handle_close = function (fig, msg) {\n",
       "    var width = fig.canvas.width / fig.ratio;\n",
       "    fig.cell_info[0].output_area.element.off(\n",
       "        'cleared',\n",
       "        fig._remove_fig_handler\n",
       "    );\n",
       "    fig.resizeObserverInstance.unobserve(fig.canvas_div);\n",
       "\n",
       "    // Update the output cell to use the data from the current canvas.\n",
       "    fig.push_to_output();\n",
       "    var dataURL = fig.canvas.toDataURL();\n",
       "    // Re-enable the keyboard manager in IPython - without this line, in FF,\n",
       "    // the notebook keyboard shortcuts fail.\n",
       "    IPython.keyboard_manager.enable();\n",
       "    fig.parent_element.innerHTML =\n",
       "        '<img src=\"' + dataURL + '\" width=\"' + width + '\">';\n",
       "    fig.close_ws(fig, msg);\n",
       "};\n",
       "\n",
       "mpl.figure.prototype.close_ws = function (fig, msg) {\n",
       "    fig.send_message('closing', msg);\n",
       "    // fig.ws.close()\n",
       "};\n",
       "\n",
       "mpl.figure.prototype.push_to_output = function (_remove_interactive) {\n",
       "    // Turn the data on the canvas into data in the output cell.\n",
       "    var width = this.canvas.width / this.ratio;\n",
       "    var dataURL = this.canvas.toDataURL();\n",
       "    this.cell_info[1]['text/html'] =\n",
       "        '<img src=\"' + dataURL + '\" width=\"' + width + '\">';\n",
       "};\n",
       "\n",
       "mpl.figure.prototype.updated_canvas_event = function () {\n",
       "    // Tell IPython that the notebook contents must change.\n",
       "    IPython.notebook.set_dirty(true);\n",
       "    this.send_message('ack', {});\n",
       "    var fig = this;\n",
       "    // Wait a second, then push the new image to the DOM so\n",
       "    // that it is saved nicely (might be nice to debounce this).\n",
       "    setTimeout(function () {\n",
       "        fig.push_to_output();\n",
       "    }, 1000);\n",
       "};\n",
       "\n",
       "mpl.figure.prototype._init_toolbar = function () {\n",
       "    var fig = this;\n",
       "\n",
       "    var toolbar = document.createElement('div');\n",
       "    toolbar.classList = 'btn-toolbar';\n",
       "    this.root.appendChild(toolbar);\n",
       "\n",
       "    function on_click_closure(name) {\n",
       "        return function (_event) {\n",
       "            return fig.toolbar_button_onclick(name);\n",
       "        };\n",
       "    }\n",
       "\n",
       "    function on_mouseover_closure(tooltip) {\n",
       "        return function (event) {\n",
       "            if (!event.currentTarget.disabled) {\n",
       "                return fig.toolbar_button_onmouseover(tooltip);\n",
       "            }\n",
       "        };\n",
       "    }\n",
       "\n",
       "    fig.buttons = {};\n",
       "    var buttonGroup = document.createElement('div');\n",
       "    buttonGroup.classList = 'btn-group';\n",
       "    var button;\n",
       "    for (var toolbar_ind in mpl.toolbar_items) {\n",
       "        var name = mpl.toolbar_items[toolbar_ind][0];\n",
       "        var tooltip = mpl.toolbar_items[toolbar_ind][1];\n",
       "        var image = mpl.toolbar_items[toolbar_ind][2];\n",
       "        var method_name = mpl.toolbar_items[toolbar_ind][3];\n",
       "\n",
       "        if (!name) {\n",
       "            /* Instead of a spacer, we start a new button group. */\n",
       "            if (buttonGroup.hasChildNodes()) {\n",
       "                toolbar.appendChild(buttonGroup);\n",
       "            }\n",
       "            buttonGroup = document.createElement('div');\n",
       "            buttonGroup.classList = 'btn-group';\n",
       "            continue;\n",
       "        }\n",
       "\n",
       "        button = fig.buttons[name] = document.createElement('button');\n",
       "        button.classList = 'btn btn-default';\n",
       "        button.href = '#';\n",
       "        button.title = name;\n",
       "        button.innerHTML = '<i class=\"fa ' + image + ' fa-lg\"></i>';\n",
       "        button.addEventListener('click', on_click_closure(method_name));\n",
       "        button.addEventListener('mouseover', on_mouseover_closure(tooltip));\n",
       "        buttonGroup.appendChild(button);\n",
       "    }\n",
       "\n",
       "    if (buttonGroup.hasChildNodes()) {\n",
       "        toolbar.appendChild(buttonGroup);\n",
       "    }\n",
       "\n",
       "    // Add the status bar.\n",
       "    var status_bar = document.createElement('span');\n",
       "    status_bar.classList = 'mpl-message pull-right';\n",
       "    toolbar.appendChild(status_bar);\n",
       "    this.message = status_bar;\n",
       "\n",
       "    // Add the close button to the window.\n",
       "    var buttongrp = document.createElement('div');\n",
       "    buttongrp.classList = 'btn-group inline pull-right';\n",
       "    button = document.createElement('button');\n",
       "    button.classList = 'btn btn-mini btn-primary';\n",
       "    button.href = '#';\n",
       "    button.title = 'Stop Interaction';\n",
       "    button.innerHTML = '<i class=\"fa fa-power-off icon-remove icon-large\"></i>';\n",
       "    button.addEventListener('click', function (_evt) {\n",
       "        fig.handle_close(fig, {});\n",
       "    });\n",
       "    button.addEventListener(\n",
       "        'mouseover',\n",
       "        on_mouseover_closure('Stop Interaction')\n",
       "    );\n",
       "    buttongrp.appendChild(button);\n",
       "    var titlebar = this.root.querySelector('.ui-dialog-titlebar');\n",
       "    titlebar.insertBefore(buttongrp, titlebar.firstChild);\n",
       "};\n",
       "\n",
       "mpl.figure.prototype._remove_fig_handler = function (event) {\n",
       "    var fig = event.data.fig;\n",
       "    if (event.target !== this) {\n",
       "        // Ignore bubbled events from children.\n",
       "        return;\n",
       "    }\n",
       "    fig.close_ws(fig, {});\n",
       "};\n",
       "\n",
       "mpl.figure.prototype._root_extra_style = function (el) {\n",
       "    el.style.boxSizing = 'content-box'; // override notebook setting of border-box.\n",
       "};\n",
       "\n",
       "mpl.figure.prototype._canvas_extra_style = function (el) {\n",
       "    // this is important to make the div 'focusable\n",
       "    el.setAttribute('tabindex', 0);\n",
       "    // reach out to IPython and tell the keyboard manager to turn it's self\n",
       "    // off when our div gets focus\n",
       "\n",
       "    // location in version 3\n",
       "    if (IPython.notebook.keyboard_manager) {\n",
       "        IPython.notebook.keyboard_manager.register_events(el);\n",
       "    } else {\n",
       "        // location in version 2\n",
       "        IPython.keyboard_manager.register_events(el);\n",
       "    }\n",
       "};\n",
       "\n",
       "mpl.figure.prototype._key_event_extra = function (event, _name) {\n",
       "    var manager = IPython.notebook.keyboard_manager;\n",
       "    if (!manager) {\n",
       "        manager = IPython.keyboard_manager;\n",
       "    }\n",
       "\n",
       "    // Check for shift+enter\n",
       "    if (event.shiftKey && event.which === 13) {\n",
       "        this.canvas_div.blur();\n",
       "        // select the cell after this one\n",
       "        var index = IPython.notebook.find_cell_index(this.cell_info[0]);\n",
       "        IPython.notebook.select(index + 1);\n",
       "    }\n",
       "};\n",
       "\n",
       "mpl.figure.prototype.handle_save = function (fig, _msg) {\n",
       "    fig.ondownload(fig, null);\n",
       "};\n",
       "\n",
       "mpl.find_output_cell = function (html_output) {\n",
       "    // Return the cell and output element which can be found *uniquely* in the notebook.\n",
       "    // Note - this is a bit hacky, but it is done because the \"notebook_saving.Notebook\"\n",
       "    // IPython event is triggered only after the cells have been serialised, which for\n",
       "    // our purposes (turning an active figure into a static one), is too late.\n",
       "    var cells = IPython.notebook.get_cells();\n",
       "    var ncells = cells.length;\n",
       "    for (var i = 0; i < ncells; i++) {\n",
       "        var cell = cells[i];\n",
       "        if (cell.cell_type === 'code') {\n",
       "            for (var j = 0; j < cell.output_area.outputs.length; j++) {\n",
       "                var data = cell.output_area.outputs[j];\n",
       "                if (data.data) {\n",
       "                    // IPython >= 3 moved mimebundle to data attribute of output\n",
       "                    data = data.data;\n",
       "                }\n",
       "                if (data['text/html'] === html_output) {\n",
       "                    return [cell, data, j];\n",
       "                }\n",
       "            }\n",
       "        }\n",
       "    }\n",
       "};\n",
       "\n",
       "// Register the function which deals with the matplotlib target/channel.\n",
       "// The kernel may be null if the page has been refreshed.\n",
       "if (IPython.notebook.kernel !== null) {\n",
       "    IPython.notebook.kernel.comm_manager.register_target(\n",
       "        'matplotlib',\n",
       "        mpl.mpl_figure_comm\n",
       "    );\n",
       "}\n"
      ],
      "text/plain": [
       "<IPython.core.display.Javascript object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<img src=\"\" width=\"1000\">"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.barh([\"PCA 1\", \"PCA 2\"], pca.explained_variance_)\n",
    "plt.gca().set(xlabel=\"Variance\", ylabel=\"Component\", title=\"Explained variance\")\n",
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Clearly the zeroth PCA component holds most of the variance and we could therefore use this component to closely determine where a data sample point should be if we dropped the other component. This is equivalent to fitting a line and using the projection of the point in the line to characterize the data, instead of using the two coordinates.\n",
    "\n",
    "While this may seem superfluous in this simple case, if one has hundreds of variables, PCA provides a simple and almost automatic way to reduce the amount of features being examined, by concentrating most of the variance in a few variables. Which variables to choose, can be decided from the `explained_variance_` attribute."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7bcb1e2c",
   "metadata": {},
   "source": [
    "## Other representation learning methods\n",
    "\n",
    "While PCA is a great method for both data visualization and choosing a latent space to condense the information contained in the data, there are many other available, which differ on the assumptions made. Here are some of them and easy-to-read and practical references on how to use them and which assumptions they entail.\n",
    "\n",
    "  * Kernel PCA: While PCA does a great job at finding the directions where most of the variance is, it focuses on making only *linear* combinations of the features to find the new representation directions. A straightforward method to generalise this idea to non-linear transformations is to apply a non-linear transformation to the data features and after that apply PCA. This is similar to what Kernel PCA does, generalizing the PCA idea. Of course, a non-linearity transformation needs to be chosen for this and this adds extra assumptions in the model. Some more on this can be read here: https://scikit-learn.org/stable/modules/decomposition.html#kernel-pca\n",
    "  \n",
    "  * Independent Component Analysis: PCA  looks for the direction containing most of the variance, but this is not always the optimal representation of the data for all goals. In some cases, independence is much more important than decorrelation and this is where the Independent Component Analysis comes in. The ICA also assumes that the new representation can be built from a linear combination of the existing data, but it assumes additionally each observed feature comes from a linear combination of independent latent representations we want to discover. Since independence is a very strong requirement, it is imposed through various different proxy methods. One set of methods focuses on reducing the mutual information (as defined in statistics) between the new features, while an alternative imposes non-Gaussianity requirements on the underlying latent features to be discovered. Since Gaussians do not have statistical moments above the second-order moment (covariance), one can require for example, the fourth order statistical moment (kurtosis) to be maximized in the new representation. More on the scikit-learn implementation can be found here: https://scikit-learn.org/stable/modules/decomposition.html#ica .\n",
    "  \n",
    "  * t-SNE embedding: If the objective is only to visualize the data, there are many alternative solutions which focus on reducing the dimensionality of the data into a 2D representation. The t-SNE method assumes that a Gaussian probability can be used to model similarity between data points in N dimensions and that one should maintain that similarity measure when projecting the data in two dimensions, assuming however that the 2D data points' similarity can be represented with a t-Student distribution. Full details on the method can be seen here: https://jmlr.org/papers/volume9/vandermaaten08a/vandermaaten08a.pdf It can be easily tried in scikit-learn following the procedure here: https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "18c90048",
   "metadata": {},
   "source": [
    "### Contact us at the EuXFEL Data Analysis group at any time if you need help analysing your data!\n",
    "\n",
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",