Commit 7b9a301d authored by Philipp Schmidt

Add original Timepix centroiding notebook for reference

parent 130a3247
Merge request !645: [TIMEPIX] Add notebook to compute centroids of pixel events
%% Cell type:markdown id:4d04f210-fc4a-4b1d-9dfe-9c85b94b884f tags:
## Imports
%% Cell type:code id:524fe654-e112-4abe-813c-a0be9b3a3034 tags:
``` python
from time import time
import numpy as np
from extra_data import open_run, RunDirectory
```
%% Cell type:code id:392aa413-0035-4861-9f1f-03abb51d0613 tags:
``` python
%matplotlib notebook
```
%% Cell type:markdown id:3911b4e1-11af-4a51-be59-103ca9f6f261 tags:
## Centroiding functions
%% Cell type:code id:e36f997c-4b66-4b11-99a8-5887e3572f56 tags:
``` python
# centroiding
import numpy as np
import scipy.ndimage as nd
from sklearn.cluster import DBSCAN
from warnings import warn
import traceback as tb
error_msgs = {
-1: "tpx_data has an invalid structure - ignore provided data",
-2: "tpx_data arrays are of invalid lengths - ignore provided data",
-3: "tpx_data arrays are empty"
}
def check_data(tpx_data):
required_keys = ["x", "y", "tof", "tot"]
for key in required_keys:
if key not in tpx_data.keys():
warn("tpx data must contain the keys %s, but key %s not in tpx data keys (%s)" % (required_keys, key, list(tpx_data.keys())),
category=UserWarning)
return -1
reference_n_samples_key = "x"
n_samples = len(tpx_data[reference_n_samples_key])
for key in tpx_data.keys():
if n_samples != len(tpx_data[key]):
warn("arrays in tpx data must be of same length ( len(tpx_data[%s])=%i!=%i=(len(tpx_data[%s]) )" % (reference_n_samples_key, n_samples, len(tpx_data[key]), key),
category=UserWarning)
return -2
if n_samples == 0:
warn("no samples were provides with tpx data", category=UserWarning)
return -3
return 0
def apply_single_filter(tpx_data, _filter):
"""
Simple function to apply a selecting or sorting filter to a dictionary of equally sized arrays
    Note: no copy of the dictionary is made; as dictionaries are mutable, the input data is modified in place!
Parameters
----------
tpx_data: dictionary with timepix data, all arrays behind each key must be of same length
_filter: 1d array or list of integers or booleans or np.s_ to select or sort data like a = a[_filter]
Returns
-------
tpx_data: like input tpx_data but with applied filter
"""
try:
for key in tpx_data.keys():
tpx_data[key] = np.array(tpx_data[key])[_filter]
except Exception as e:
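        # debug output: dump the filter and array shapes before re-raising, to help diagnose invalid filters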
# print(tpx_data)
print(_filter)
print(_filter.dtype)
print(_filter.shape)
print(tpx_data[key].shape)
raise e
return tpx_data
def pre_clustering_filter(tpx_data, tot_threshold=None):
"""
Collection of filters directly applied before clustering.
    Note: no copy of the dictionary is made; as dictionaries are mutable, the input data is modified in place!
Parameters
----------
tpx_data: dictionary with timepix data, all arrays behind each key must be of same length
    tot_threshold: minimum time-over-threshold (ToT) value; samples below this threshold are discarded before clustering
Returns
-------
tpx_data: like input tpx_data but with applied filters
"""
if tot_threshold is not None:
tpx_data = apply_single_filter(tpx_data, tpx_data["tot"] >= tot_threshold)
return tpx_data
def post_clustering_filter(tpx_data):
"""
Collection of filters directly applied after clustering.
    Note: no copy of the dictionary is made; as dictionaries are mutable, the input data is modified in place!
Parameters
----------
    tpx_data: dictionary with timepix data, all arrays behind each key must be of same length, now including the key "labels"
Returns
-------
tpx_data: like input tpx_data but with applied filters
"""
if tpx_data["labels"] is not None:
tpx_data = apply_single_filter(tpx_data, tpx_data["labels"] != 0)
return tpx_data
def clustering(tpx_data, epsilon=2, tof_scale=1e7, min_samples=3, n_jobs=1):
"""
Parameters
----------
    tpx_data: dictionary with timepix data; requires the keys "x", "y" and "tof"
    epsilon: maximum distance between two samples for DBSCAN to consider them neighbours
    tof_scale: scaling factor applied to the "tof" axis so that it becomes comparable to the pixel axes
    min_samples: minimum number of samples in a neighbourhood for a point to be considered a DBSCAN core point
    n_jobs: number of parallel jobs used by DBSCAN
Returns
-------
"""
coords = np.column_stack((tpx_data["x"], tpx_data["y"], tpx_data["tof"]*tof_scale))
dist = DBSCAN(eps=epsilon, min_samples=min_samples, metric="euclidean", n_jobs=n_jobs).fit(coords)
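    # DBSCAN labels noise samples with -1; shifting by +1 turns noise into label 0,
    # which post_clustering_filter() removes, while real clusters start at 1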
return dist.labels_ + 1
def empty_centroid_data():
return {
"x": np.array([]),
"y": np.array([]),
"tof": np.array([]),
"tot_avg": np.array([]),
"tot_max": np.array([]),
"size": np.array([]),
}
def get_centroids(tpx_data, timewalk_lut=None):
centroid_data = empty_centroid_data()
cluster_labels, cluster_size = np.unique(tpx_data["labels"], return_counts=True)
cluster_tot_peaks = np.array(nd.maximum_position(tpx_data["tot"], labels=tpx_data["labels"], index=cluster_labels)).ravel()
cluster_tot_integrals = nd.sum(tpx_data["tot"], labels=tpx_data["labels"], index=cluster_labels)
# compute centroid center through weighted average
centroid_data["x"] = np.array(nd.sum(tpx_data["x"] * tpx_data["tot"], labels=tpx_data["labels"], index=cluster_labels) / cluster_tot_integrals).ravel()
centroid_data["y"] = np.array(nd.sum(tpx_data["y"] * tpx_data["tot"], labels=tpx_data["labels"], index=cluster_labels) / cluster_tot_integrals).ravel()
centroid_data["tof"] = np.array(nd.sum(tpx_data["tof"] * tpx_data["tot"], labels=tpx_data["labels"], index=cluster_labels) / cluster_tot_integrals).ravel()
# intensity & size information
centroid_data["tot_avg"] = np.array(nd.mean(tpx_data["tot"], labels=tpx_data["labels"], index=cluster_labels))
centroid_data["tot_max"] = tpx_data["tot"][cluster_tot_peaks]
centroid_data["tot"] = np.array(cluster_tot_integrals)
centroid_data["size"] = cluster_size
# train ID information
# ~ centroid_data["tid"] = tpx_data["tid"][cluster_tot_peaks]
# correct for timewalk if provided
if timewalk_lut is not None:
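        # look up the correction by ToT in bins of 25 (first bin -> index 0) and scale
        # by 1e3, presumably to convert the LUT units into the ToF units used here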
centroid_data["tof"] -= timewalk_lut[np.int_(centroid_data["tot_max"] // 25) - 1] * 1e3
return centroid_data
def compute_centroids(x, y, tof, tot,
threshold_tot=None,
clustering_epsilon=2,
clustering_tof_scale=1e7,
clustering_min_samples=3,
clustering_n_jobs=1,
centroiding_timewalk_lut=None):
# format input data
_tpx_data = {
"x": x,
"y": y,
"tof": tof,
"tot": tot
}
# ensure that valid data is available
data_validation = check_data(_tpx_data)
if data_validation < 0:
if data_validation in error_msgs.keys():
print("Data validation failed with message: %s" % error_msgs[data_validation])
else:
print("Data validation failed: unknown reason")
return None
    # clustering (identify clusters in the 3d data (x, y, tof) that belong to a single hit;
    # each sample belonging to a cluster is labeled with an integer cluster id)
_tpx_data = pre_clustering_filter(_tpx_data, tot_threshold=threshold_tot)
_tpx_data["labels"] = clustering(_tpx_data, epsilon=clustering_epsilon, tof_scale=clustering_tof_scale, min_samples=clustering_min_samples, n_jobs=clustering_n_jobs)
_tpx_data = post_clustering_filter(_tpx_data)
# compute centroid data (reduce cluster of samples to a single point with properties)
if _tpx_data["labels"] is None or _tpx_data["labels"].size == 0:
# handle case of no identified clusters, return empty dictionary with expected keys
return empty_centroid_data()
_centroids = get_centroids(_tpx_data, timewalk_lut=centroiding_timewalk_lut)
return _centroids
```
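%% Cell type:markdown tags:
A short sanity check, added here for illustration (not part of the original analysis): `compute_centroids` applied to synthetic data with two artificial clusters and one isolated sample. With the default parameters (`clustering_epsilon=2`, `clustering_min_samples=3`), the isolated sample is rejected as noise and two centroids remain.
%% Cell type:code tags:
``` python
# Synthetic sanity check (illustration only): two tight clusters and one stray sample.
rng = np.random.default_rng(seed=0)
x = np.concatenate([10 + rng.normal(0, 0.3, 20), 40 + rng.normal(0, 0.3, 20), [100.0]])
y = np.concatenate([10 + rng.normal(0, 0.3, 20), 40 + rng.normal(0, 0.3, 20), [100.0]])
tof = np.concatenate([np.full(20, 1e-7), np.full(20, 2e-7), [5e-7]])  # scaled by tof_scale=1e7 inside clustering()
tot = np.full(41, 100.0)
demo_centroids = compute_centroids(x, y, tof, tot)
print(len(demo_centroids["x"]), "centroids found")  # expect 2; the stray sample is dropped as noise
```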
%% Cell type:markdown id:4f54e558-d862-48af-86b9-7d9e3d53fd3d tags:
## Output function (for illustration) - instead, data should be written to an output file train-ID-wise
%% Cell type:code id:83acf826-e351-4a39-a870-6c1e0298e711 tags:
``` python
def output(train_id, centroids, fixed_len=None):
"""
    In this version the resulting vectors within a run vary in length;
    if fixed_len is provided, fixed-length vectors are generated.
"""
if fixed_len is None:
out = {
"%s[data.trainId]"%event_output_key : train_id,
"%s[data.x]"%event_output_key : centroids["x"],
"%s[data.y]"%event_output_key : centroids["y"],
"%s[data.toa]"%event_output_key : centroids["tof"],
"%s[data.tot_avg]"%event_output_key : centroids["tot_avg"],
"%s[data.tot_max]"%event_output_key : centroids["tot_max"],
"%s[data.size]"%event_output_key : centroids["size"],
}
else:
if len(centroids["x"])>fixed_len:
sel = np.s_[:fixed_len]
out = {
"%s[data.trainId]"%event_output_key : train_id,
"%s[data.x]"%event_output_key : centroids["x"][sel],
"%s[data.y]"%event_output_key : centroids["y"][sel],
"%s[data.toa]"%event_output_key : centroids["tof"][sel],
"%s[data.tot_avg]"%event_output_key : centroids["tot_avg"][sel],
"%s[data.tot_max]"%event_output_key : centroids["tot_max"][sel],
"%s[data.centroid_size]"%event_output_key : centroids["size"][sel],
"%s[data.size]"%event_output_key : len(centroids["x"]),
}
else:
pad = fixed_len - len(centroids["x"])
out = {
"%s[data.trainId]"%event_output_key : train_id, # int 0d
"%s[data.x]"%event_output_key : np.pad(centroids["x"], (0,pad), constant_values=1), # float 1d
"%s[data.y]"%event_output_key : np.pad(centroids["y"], (0,pad), constant_values=1), # float 1d
"%s[data.toa]"%event_output_key : np.pad(centroids["tof"], (0,pad), constant_values=1), # float 1d
"%s[data.tot_avg]"%event_output_key : np.pad(centroids["tot_avg"], (0,pad), constant_values=1), # float 1d
"%s[data.tot_max]"%event_output_key : np.pad(centroids["tot_max"], (0,pad), constant_values=1), # float 1d
"%s[data.centroid_size]"%event_output_key : np.pad(centroids["size"], (0,pad), constant_values=1), # int 1d
"%s[data.size]"%event_output_key : len(centroids["x"]), # int 0d
}
    # print(out)
    return out
```
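%% Cell type:markdown tags:
A quick illustration of the fixed-length branch, added for reference (not part of the original notebook). It assumes the same `event_output_key` that is set in the Centroiding section below and uses an empty centroid set, so all vectors consist of padding:
%% Cell type:code tags:
``` python
# Illustration only: exercise the padding branch of output() with zero centroids.
event_output_key = 'SQS_AQS_CAM/CAM/TIMEPIX3:daqEventOutput'  # same key as set below
demo_out = output(train_id=0, centroids=empty_centroid_data(), fixed_len=4)
print(demo_out["%s[data.size]" % event_output_key])  # 0 - no centroids in this train
print(demo_out["%s[data.x]" % event_output_key])     # four padding values (constant 1)
```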
%% Cell type:markdown id:a39ce0e4-31b3-40c7-9c80-72b89d037714 tags:
## Settings
%% Cell type:markdown id:8318bc3f-01be-4144-a4de-ebd736563396 tags:
### Data selection (for dev)
%% Cell type:code id:693c72cd-9c67-46d6-a0bc-438126e62642 tags:
``` python
# Data selection parameters.
run_id = 420
proposal = 900256
```
%% Cell type:markdown id:13956e56-d818-42e4-aa73-7f9c4a897732 tags:
### Parameters to be exposed with calibration pipeline
%% Cell type:code id:a9adf469-32de-43b4-91fa-2508a952d0dc tags:
``` python
# Centroiding and output parameters that should be easily adjustable
max_n_centroids = 10000
clustering_epsilon = 2
clustering_tof_scale = 1e7
clustering_min_samples = 3
clustering_n_jobs = 1
threshold_tot = None
raw_timewalk_lut_filepath = None  # path to look-up table for timewalk correction or None
centroiding_timewalk_lut_filepath = None  # path to look-up table for timewalk correction or None
# raw_timewalk_lut_filepath = "timewalk_raw.npy"  # path to look-up table for timewalk correction or None
# centroiding_timewalk_lut_filepath = "timewalk_cent.npy"  # path to look-up table for timewalk correction or None
```
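%% Cell type:markdown tags:
The timewalk look-up tables are loaded with `np.load` below and indexed as `lut[np.int_(tot // 25) - 1]`, so each is expected to be a 1d numpy array with one entry per 25-unit ToT bin. A minimal sketch (hypothetical bin count) for creating a placeholder table:
%% Cell type:code tags:
``` python
# Hypothetical sketch: write a placeholder (all-zero, i.e. no-op) timewalk LUT
# compatible with the indexing lut[np.int_(tot // 25) - 1] used in this notebook.
n_tot_bins = 400  # assumed number of 25-unit ToT bins; adjust to the detector's ToT range
np.save("timewalk_raw.npy", np.zeros(n_tot_bins))
```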
%% Cell type:markdown id:c6dff850-18c8-4d5a-9362-e581cce18d8e tags:
## Centroiding
%% Cell type:code id:56306c15-513e-4e7f-9c47-c52ca61b27a8 tags:
``` python
run = open_run(run=run_id, proposal=proposal)
event_output_key = 'SQS_AQS_CAM/CAM/TIMEPIX3:daqEventOutput'
if raw_timewalk_lut_filepath is not None:
raw_timewalk_lut = np.load(raw_timewalk_lut_filepath)
else:
raw_timewalk_lut = None
if centroiding_timewalk_lut_filepath is not None:
centroiding_timewalk_lut = np.load(centroiding_timewalk_lut_filepath)
else:
centroiding_timewalk_lut = None
```
%% Cell type:code id:8c28a4c8-961c-496b-80da-7fd867e5b0d3 tags:
``` python
t00 = time()
for train_id, data in run.select(event_output_key, "data.*").trains():
t0 = time()
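    # the raw event arrays have fixed length; data.size gives the number of valid samples in this train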
sel = np.s_[:data[event_output_key]["data.size"]]
tot = np.array(data[event_output_key]["data.tot"][sel])
toa = np.array(data[event_output_key]["data.toa"][sel])
if raw_timewalk_lut is not None:
toa -= raw_timewalk_lut[np.int_(tot // 25) - 1] * 1e3
centroids = compute_centroids(data[event_output_key]["data.x"][sel],
data[event_output_key]["data.y"][sel],
toa,
tot,
threshold_tot=threshold_tot,
clustering_epsilon=clustering_epsilon,
clustering_tof_scale=clustering_tof_scale,
clustering_min_samples=clustering_min_samples,
clustering_n_jobs=clustering_n_jobs,
centroiding_timewalk_lut=centroiding_timewalk_lut)
# print(time()-t0)
output(train_id, centroids, fixed_len=max_n_centroids)
# if time()-t00 > 0:
# break
print(time()-t00)
```