From ca7b2df90031b804ae73325c2440fdb8a645e5fa Mon Sep 17 00:00:00 2001
From: Mads Jakobsen <mads.jakobsen@xfel.eu>
Date: Thu, 25 Apr 2024 14:34:46 +0200
Subject: [PATCH] added myxpcs test package

---
 myxpcs/include/data.h                         |  185 ++
 myxpcs/include/set_integer.h                  |    1 +
 .../include/taskflow_/algorithm/critical.hpp  |   78 +
 .../taskflow_/algorithm/data_pipeline.hpp     |  637 +++++
 myxpcs/include/taskflow_/algorithm/find.hpp   |  551 ++++
 .../include/taskflow_/algorithm/for_each.hpp  |  171 ++
 myxpcs/include/taskflow_/algorithm/launch.hpp |   58 +
 .../taskflow_/algorithm/partitioner.hpp       |  543 ++++
 .../include/taskflow_/algorithm/pipeline.hpp  | 1663 ++++++++++++
 myxpcs/include/taskflow_/algorithm/reduce.hpp |  443 +++
 myxpcs/include/taskflow_/algorithm/scan.hpp   |  617 +++++
 myxpcs/include/taskflow_/algorithm/sort.hpp   |  661 +++++
 .../include/taskflow_/algorithm/transform.hpp |  199 ++
 myxpcs/include/taskflow_/core/async.hpp       |  330 +++
 myxpcs/include/taskflow_/core/async_task.hpp  |  209 ++
 .../include/taskflow_/core/declarations.hpp   |   60 +
 myxpcs/include/taskflow_/core/environment.hpp |    8 +
 myxpcs/include/taskflow_/core/error.hpp       |   26 +
 .../taskflow_/core/executor-module-opt.hpp    | 2025 ++++++++++++++
 myxpcs/include/taskflow_/core/executor.hpp    | 2385 +++++++++++++++++
 .../include/taskflow_/core/flow_builder.hpp   | 1399 ++++++++++
 myxpcs/include/taskflow_/core/graph.hpp       | 1017 +++++++
 myxpcs/include/taskflow_/core/notifier.hpp    |  295 ++
 myxpcs/include/taskflow_/core/observer.hpp    | 1046 ++++++++
 myxpcs/include/taskflow_/core/semaphore.hpp   |  132 +
 myxpcs/include/taskflow_/core/task.hpp        |  776 ++++++
 myxpcs/include/taskflow_/core/taskflow.hpp    |  643 +++++
 myxpcs/include/taskflow_/core/topology.hpp    |   62 +
 myxpcs/include/taskflow_/core/tsq.hpp         |  441 +++
 myxpcs/include/taskflow_/core/worker.hpp      |  172 ++
 .../include/taskflow_/cuda/algorithm/find.hpp |  294 ++
 .../taskflow_/cuda/algorithm/for_each.hpp     |  315 +++
 .../taskflow_/cuda/algorithm/matmul.hpp       |   57 +
 .../taskflow_/cuda/algorithm/merge.hpp        |  585 ++++
 .../taskflow_/cuda/algorithm/reduce.hpp       |  460 ++++
 .../include/taskflow_/cuda/algorithm/scan.hpp |  488 ++++
 .../include/taskflow_/cuda/algorithm/sort.hpp |  506 ++++
 .../taskflow_/cuda/algorithm/transform.hpp    |  282 ++
 .../taskflow_/cuda/algorithm/transpose.hpp    |   41 +
 .../include/taskflow_/cuda/cuda_capturer.hpp  |  724 +++++
 myxpcs/include/taskflow_/cuda/cuda_device.hpp |  342 +++
 myxpcs/include/taskflow_/cuda/cuda_error.hpp  |   26 +
 .../taskflow_/cuda/cuda_execution_policy.hpp  |  155 ++
 myxpcs/include/taskflow_/cuda/cuda_graph.hpp  |  805 ++++++
 myxpcs/include/taskflow_/cuda/cuda_memory.hpp |  855 ++++++
 myxpcs/include/taskflow_/cuda/cuda_meta.hpp   |  452 ++++
 myxpcs/include/taskflow_/cuda/cuda_object.hpp |  287 ++
 .../include/taskflow_/cuda/cuda_optimizer.hpp |  404 +++
 myxpcs/include/taskflow_/cuda/cuda_stream.hpp |  226 ++
 myxpcs/include/taskflow_/cuda/cuda_task.hpp   |  274 ++
 myxpcs/include/taskflow_/cuda/cudaflow.hpp    | 1024 +++++++
 myxpcs/include/taskflow_/dsl/connection.hpp   |   53 +
 myxpcs/include/taskflow_/dsl/dsl.hpp          |   13 +
 myxpcs/include/taskflow_/dsl/meta_macro.hpp   |   72 +
 .../include/taskflow_/dsl/task_analyzer.hpp   |   40 +
 myxpcs/include/taskflow_/dsl/task_dsl.hpp     |  104 +
 myxpcs/include/taskflow_/dsl/task_trait.hpp   |   46 +
 myxpcs/include/taskflow_/dsl/tuple_utils.hpp  |   43 +
 myxpcs/include/taskflow_/dsl/type_list.hpp    |  136 +
 .../taskflow_/sycl/algorithm/reduce.hpp       |  487 ++++
 .../sycl/algorithm/sycl_for_each.hpp          |   88 +
 .../sycl/algorithm/sycl_transform.hpp         |   46 +
 .../taskflow_/sycl/sycl_execution_policy.hpp  |   70 +
 myxpcs/include/taskflow_/sycl/sycl_graph.hpp  |  255 ++
 myxpcs/include/taskflow_/sycl/sycl_meta.hpp   |  517 ++++
 myxpcs/include/taskflow_/sycl/sycl_task.hpp   |  209 ++
 myxpcs/include/taskflow_/sycl/syclflow.hpp    |  684 +++++
 myxpcs/include/taskflow_/taskflow.hpp         |   69 +
 myxpcs/include/taskflow_/utility/iterator.hpp |   22 +
 myxpcs/include/taskflow_/utility/macros.hpp   |   17 +
 myxpcs/include/taskflow_/utility/math.hpp     |  151 ++
 .../include/taskflow_/utility/object_pool.hpp |  778 ++++++
 myxpcs/include/taskflow_/utility/os.hpp       |  196 ++
 .../include/taskflow_/utility/serializer.hpp  | 1135 ++++++++
 .../include/taskflow_/utility/singleton.hpp   |   33 +
 .../taskflow_/utility/small_vector.hpp        | 1048 ++++++++
 myxpcs/include/taskflow_/utility/stream.hpp   |   32 +
 myxpcs/include/taskflow_/utility/traits.hpp   |  303 +++
 myxpcs/include/taskflow_/utility/uuid.hpp     |  235 ++
 myxpcs/source/function_call.pyx               |   69 +
 myxpcs/source/set_integer.cpp                 |   30 +
 81 files changed, 32416 insertions(+)
 create mode 100644 myxpcs/include/data.h
 create mode 100644 myxpcs/include/set_integer.h
 create mode 100644 myxpcs/include/taskflow_/algorithm/critical.hpp
 create mode 100644 myxpcs/include/taskflow_/algorithm/data_pipeline.hpp
 create mode 100644 myxpcs/include/taskflow_/algorithm/find.hpp
 create mode 100644 myxpcs/include/taskflow_/algorithm/for_each.hpp
 create mode 100644 myxpcs/include/taskflow_/algorithm/launch.hpp
 create mode 100644 myxpcs/include/taskflow_/algorithm/partitioner.hpp
 create mode 100644 myxpcs/include/taskflow_/algorithm/pipeline.hpp
 create mode 100644 myxpcs/include/taskflow_/algorithm/reduce.hpp
 create mode 100644 myxpcs/include/taskflow_/algorithm/scan.hpp
 create mode 100644 myxpcs/include/taskflow_/algorithm/sort.hpp
 create mode 100644 myxpcs/include/taskflow_/algorithm/transform.hpp
 create mode 100644 myxpcs/include/taskflow_/core/async.hpp
 create mode 100644 myxpcs/include/taskflow_/core/async_task.hpp
 create mode 100644 myxpcs/include/taskflow_/core/declarations.hpp
 create mode 100644 myxpcs/include/taskflow_/core/environment.hpp
 create mode 100644 myxpcs/include/taskflow_/core/error.hpp
 create mode 100644 myxpcs/include/taskflow_/core/executor-module-opt.hpp
 create mode 100644 myxpcs/include/taskflow_/core/executor.hpp
 create mode 100644 myxpcs/include/taskflow_/core/flow_builder.hpp
 create mode 100644 myxpcs/include/taskflow_/core/graph.hpp
 create mode 100644 myxpcs/include/taskflow_/core/notifier.hpp
 create mode 100644 myxpcs/include/taskflow_/core/observer.hpp
 create mode 100644 myxpcs/include/taskflow_/core/semaphore.hpp
 create mode 100644 myxpcs/include/taskflow_/core/task.hpp
 create mode 100644 myxpcs/include/taskflow_/core/taskflow.hpp
 create mode 100644 myxpcs/include/taskflow_/core/topology.hpp
 create mode 100644 myxpcs/include/taskflow_/core/tsq.hpp
 create mode 100644 myxpcs/include/taskflow_/core/worker.hpp
 create mode 100644 myxpcs/include/taskflow_/cuda/algorithm/find.hpp
 create mode 100644 myxpcs/include/taskflow_/cuda/algorithm/for_each.hpp
 create mode 100644 myxpcs/include/taskflow_/cuda/algorithm/matmul.hpp
 create mode 100644 myxpcs/include/taskflow_/cuda/algorithm/merge.hpp
 create mode 100644 myxpcs/include/taskflow_/cuda/algorithm/reduce.hpp
 create mode 100644 myxpcs/include/taskflow_/cuda/algorithm/scan.hpp
 create mode 100644 myxpcs/include/taskflow_/cuda/algorithm/sort.hpp
 create mode 100644 myxpcs/include/taskflow_/cuda/algorithm/transform.hpp
 create mode 100644 myxpcs/include/taskflow_/cuda/algorithm/transpose.hpp
 create mode 100644 myxpcs/include/taskflow_/cuda/cuda_capturer.hpp
 create mode 100644 myxpcs/include/taskflow_/cuda/cuda_device.hpp
 create mode 100644 myxpcs/include/taskflow_/cuda/cuda_error.hpp
 create mode 100644 myxpcs/include/taskflow_/cuda/cuda_execution_policy.hpp
 create mode 100644 myxpcs/include/taskflow_/cuda/cuda_graph.hpp
 create mode 100644 myxpcs/include/taskflow_/cuda/cuda_memory.hpp
 create mode 100644 myxpcs/include/taskflow_/cuda/cuda_meta.hpp
 create mode 100644 myxpcs/include/taskflow_/cuda/cuda_object.hpp
 create mode 100644 myxpcs/include/taskflow_/cuda/cuda_optimizer.hpp
 create mode 100644 myxpcs/include/taskflow_/cuda/cuda_stream.hpp
 create mode 100644 myxpcs/include/taskflow_/cuda/cuda_task.hpp
 create mode 100644 myxpcs/include/taskflow_/cuda/cudaflow.hpp
 create mode 100644 myxpcs/include/taskflow_/dsl/connection.hpp
 create mode 100644 myxpcs/include/taskflow_/dsl/dsl.hpp
 create mode 100644 myxpcs/include/taskflow_/dsl/meta_macro.hpp
 create mode 100644 myxpcs/include/taskflow_/dsl/task_analyzer.hpp
 create mode 100644 myxpcs/include/taskflow_/dsl/task_dsl.hpp
 create mode 100644 myxpcs/include/taskflow_/dsl/task_trait.hpp
 create mode 100644 myxpcs/include/taskflow_/dsl/tuple_utils.hpp
 create mode 100644 myxpcs/include/taskflow_/dsl/type_list.hpp
 create mode 100644 myxpcs/include/taskflow_/sycl/algorithm/reduce.hpp
 create mode 100644 myxpcs/include/taskflow_/sycl/algorithm/sycl_for_each.hpp
 create mode 100644 myxpcs/include/taskflow_/sycl/algorithm/sycl_transform.hpp
 create mode 100644 myxpcs/include/taskflow_/sycl/sycl_execution_policy.hpp
 create mode 100644 myxpcs/include/taskflow_/sycl/sycl_graph.hpp
 create mode 100644 myxpcs/include/taskflow_/sycl/sycl_meta.hpp
 create mode 100644 myxpcs/include/taskflow_/sycl/sycl_task.hpp
 create mode 100644 myxpcs/include/taskflow_/sycl/syclflow.hpp
 create mode 100644 myxpcs/include/taskflow_/taskflow.hpp
 create mode 100644 myxpcs/include/taskflow_/utility/iterator.hpp
 create mode 100644 myxpcs/include/taskflow_/utility/macros.hpp
 create mode 100644 myxpcs/include/taskflow_/utility/math.hpp
 create mode 100644 myxpcs/include/taskflow_/utility/object_pool.hpp
 create mode 100644 myxpcs/include/taskflow_/utility/os.hpp
 create mode 100644 myxpcs/include/taskflow_/utility/serializer.hpp
 create mode 100644 myxpcs/include/taskflow_/utility/singleton.hpp
 create mode 100644 myxpcs/include/taskflow_/utility/small_vector.hpp
 create mode 100644 myxpcs/include/taskflow_/utility/stream.hpp
 create mode 100644 myxpcs/include/taskflow_/utility/traits.hpp
 create mode 100644 myxpcs/include/taskflow_/utility/uuid.hpp
 create mode 100644 myxpcs/source/function_call.pyx
 create mode 100644 myxpcs/source/set_integer.cpp

diff --git a/myxpcs/include/data.h b/myxpcs/include/data.h
new file mode 100644
index 0000000..d85392b
--- /dev/null
+++ b/myxpcs/include/data.h
@@ -0,0 +1,185 @@
+#include <vector>
+#include <cstdint>
+#include <cstdlib>
+#include <unistd.h>   // sysconf(_SC_PAGESIZE), used below for page-aligned allocation
+#include <memory>
+#include <iostream>
+
+#include <taskflow_/taskflow.hpp>
+#include <taskflow_/algorithm/for_each.hpp>
+
+
+template <typename T>
+struct Storage
+{
+    std::vector<std::size_t> shape{};
+    T *ptr{nullptr};
+
+    Storage(const std::vector<std::size_t> &shape)
+        : shape{shape}
+    {
+        std::size_t numElements = 1;
+        for (auto element : shape)
+        {
+            numElements *= element;
+        }
+
+        // std::aligned_alloc requires the allocation size to be a multiple of
+        // the alignment, so round the byte count up to the page size.
+        const std::size_t alignment = static_cast<std::size_t>(sysconf(_SC_PAGESIZE));
+        const std::size_t bytes = ((sizeof(T) * numElements + alignment - 1) / alignment) * alignment;
+        ptr = static_cast<T *>(std::aligned_alloc(alignment, bytes));
+    }
+
+    ~Storage()
+    {
+        // std::cout << "storage freed";
+        free(ptr);
+    }
+
+    void printStats()
+    {
+        std::cout << "Storage Stats: " << getSize() << "\n";
+        std::cout << "dim: " << shape.size() << " : ";
+        for (auto len : shape)
+        {
+            std::cout << len << ", ";
+        }
+        std::cout << std::endl;
+    }
+
+    std::int32_t getSize() const
+    {
+        std::int32_t size = 1;
+        for (auto len : shape)
+        {
+            size *= len;
+        }
+        return size;
+    }
+};
+
+template <typename T>
+using Memory = std::shared_ptr<Storage<T>>;
+
+template <typename T>
+Memory<T> TranposeFromImageToTime_v3_block_tf_no_struct_one_taskflow(const Memory<T> in, std::size_t fastBlockSizeDim, std::size_t slowBlockSizeDim, tf::Executor &executor)
+{
+    std::cout << "bluib";
+    // CDCS::Utility::ScopedConsoleMicrosecondTimer timer("transposing data block   fast_write   tf  one taskflow< " + std::to_string(fastBlockSizeDim) + " , " + std::to_string(slowBlockSizeDim) + " >");
+
+    const auto dims = (*in).shape;
+    const std::size_t X = dims[0];
+    const std::size_t Y = dims[1];
+    const std::size_t Z = dims[2];
+
+    auto out = std::make_shared<Storage<T>>(std::vector<std::size_t>{Z, X, Y});
+
+    const std::size_t imagesize = X * Y;
+
+    const std::size_t fastDim = imagesize;
+    const std::size_t slowDim = Z;
+
+    T *in_ptr = in->ptr;
+    T *out_ptr = out->ptr;
+
+    // add regular patches
+    std::size_t fastBlockPos = 0;
+    std::size_t slowBlockPos = 0;
+
+    tf::Taskflow taskflow;
+
+    if (slowDim >= slowBlockSizeDim && fastDim >= fastBlockSizeDim)
+    {
+        while (slowBlockPos + slowBlockSizeDim <= slowDim)
+        {
+
+            fastBlockPos = 0;
+            while (fastBlockPos + fastBlockSizeDim <= fastDim)
+            {
+
+                taskflow.emplace(
+                    [in_ptr, out_ptr, fastBlockSizeDim, slowBlockSizeDim, fastBlockPos, slowBlockPos, fastDim, slowDim]()
+                    {
+                        for (std::size_t fast = 0; fast < fastBlockSizeDim; fast++)
+                        {
+                            for (std::size_t slow = 0; slow < slowBlockSizeDim; slow++)
+                            {
+                                out_ptr[slowBlockPos + fastBlockPos * slowDim + slow + fast * slowDim] = in_ptr[fastBlockPos + slowBlockPos * fastDim + fast + slow * fastDim];
+                            }
+                        }
+                    });
+                fastBlockPos += fastBlockSizeDim;
+            }
+
+            slowBlockPos += slowBlockSizeDim;
+        }
+    }
+
+    std::size_t fastEnd = fastBlockPos;
+    std::size_t slowEnd = slowBlockPos;
+
+    std::size_t fastLeftover = fastDim - fastEnd;
+    std::size_t slowLeftover = slowDim - slowEnd;
+
+    // check for leftovers
+    if (fastLeftover != 0 && slowDim >= slowBlockSizeDim)
+    {
+        slowBlockPos = 0;
+
+        while (slowBlockPos + slowBlockSizeDim <= slowDim)
+        {
+            taskflow.emplace(
+                [in_ptr, out_ptr, fastLeftover, slowBlockSizeDim, fastEnd, slowBlockPos, fastDim, slowDim]()
+                {
+                    for (std::size_t fast = 0; fast < fastLeftover; fast++)
+                    {
+                        for (std::size_t slow = 0; slow < slowBlockSizeDim; slow++)
+                        {
+                            out_ptr[slowBlockPos + fastEnd * slowDim + slow + fast * slowDim] = in_ptr[fastEnd + slowBlockPos * fastDim + fast + slow * fastDim];
+                        }
+                    }
+                });
+            slowBlockPos += slowBlockSizeDim;
+        }
+
+        slowBlockPos += slowBlockSizeDim;
+    }
+
+    // check for leftovers
+    if (slowLeftover != 0 && fastDim >= fastBlockSizeDim)
+    {
+        fastBlockPos = 0;
+
+        // <= (not <) so the last full block is still covered when fastDim is an
+        // exact multiple of fastBlockSizeDim, matching the main loop above
+        while (fastBlockPos + fastBlockSizeDim <= fastDim)
+        {
+            taskflow.emplace(
+                [in_ptr, out_ptr, fastBlockSizeDim, slowLeftover, fastBlockPos, slowEnd, fastDim, slowDim]()
+                {
+                    for (std::size_t fast = 0; fast < fastBlockSizeDim; fast++)
+                    {
+                        for (std::size_t slow = 0; slow < slowLeftover; slow++)
+                        {
+                            out_ptr[slowEnd + fastBlockPos * slowDim + slow + fast * slowDim] = in_ptr[fastBlockPos + slowEnd * fastDim + fast + slow * fastDim];
+                        }
+                    }
+                });
+            fastBlockPos += fastBlockSizeDim;
+        }
+
+        fastBlockPos += fastBlockSizeDim;
+    }
+
+    if (slowLeftover != 0 && fastLeftover != 0)
+    {
+
+        taskflow.emplace(
+            [in_ptr, out_ptr, fastLeftover, slowLeftover, fastEnd, slowEnd, fastDim, slowDim]()
+            {
+                for (std::size_t fast = 0; fast < fastLeftover; fast++)
+                {
+                    for (std::size_t slow = 0; slow < slowLeftover; slow++)
+                    {
+                        out_ptr[slowEnd + fastEnd * slowDim + slow + fast * slowDim] = in_ptr[fastEnd + slowEnd * fastDim + fast + slow * fastDim];
+                    }
+                }
+            });
+    }
+    executor.run(taskflow).wait();
+    return out;
+}
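+
+// Usage sketch (illustrative only; the shapes, block sizes, and variable names
+// below are hypothetical, not part of this package):
+//
+//   tf::Executor executor;                                  // shared worker pool
+//   auto in = std::make_shared<Storage<std::uint16_t>>(
+//       std::vector<std::size_t>{512, 512, 1000});          // X=512, Y=512, Z=1000 frames
+//   auto out = TranposeFromImageToTime_v3_block_tf_no_struct_one_taskflow(
+//       in, /*fastBlockSizeDim=*/64, /*slowBlockSizeDim=*/64, executor);
+//   out->printStats();                                      // shape becomes {Z, X, Y}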
diff --git a/myxpcs/include/set_integer.h b/myxpcs/include/set_integer.h
new file mode 100644
index 0000000..f73f460
--- /dev/null
+++ b/myxpcs/include/set_integer.h
@@ -0,0 +1 @@
+void computeXPCS(float* in, float* out);
\ No newline at end of file
diff --git a/myxpcs/include/taskflow_/algorithm/critical.hpp b/myxpcs/include/taskflow_/algorithm/critical.hpp
new file mode 100644
index 0000000..c781d28
--- /dev/null
+++ b/myxpcs/include/taskflow_/algorithm/critical.hpp
@@ -0,0 +1,78 @@
+#pragma once
+
+#include "../core/task.hpp"
+
+/**
+@file critical.hpp
+@brief critical include file
+*/
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// CriticalSection
+// ----------------------------------------------------------------------------
+
+/**
+@class CriticalSection
+
+@brief class to create a critical region of limited workers to run tasks
+
+tf::CriticalSection is a wrapper over tf::Semaphore and is specialized for
+limiting the maximum concurrency over a set of tasks.
+A critical section starts with an initial count representing that limit.
+When a task is added to the critical section,
+the task acquires and releases the semaphore internal to the critical section.
+This design avoids explicit call of tf::Task::acquire and tf::Task::release.
+The following example creates a critical section of one worker and adds
+the five tasks to the critical section.
+
+@code{.cpp}
+tf::Executor executor(8);   // create an executor of 8 workers
+tf::Taskflow taskflow;
+
+// create a critical section of 1 worker
+tf::CriticalSection critical_section(1);
+
+tf::Task A = taskflow.emplace([](){ std::cout << "A" << std::endl; });
+tf::Task B = taskflow.emplace([](){ std::cout << "B" << std::endl; });
+tf::Task C = taskflow.emplace([](){ std::cout << "C" << std::endl; });
+tf::Task D = taskflow.emplace([](){ std::cout << "D" << std::endl; });
+tf::Task E = taskflow.emplace([](){ std::cout << "E" << std::endl; });
+
+critical_section.add(A, B, C, D, E);
+
+executor.run(taskflow).wait();
+@endcode
+
+*/
+class CriticalSection : public Semaphore {
+
+  public:
+
+    /**
+    @brief constructs a critical region of a limited number of workers
+    */
+    explicit CriticalSection(size_t max_workers = 1);
+
+    /**
+    @brief adds a task into the critical region
+    */
+    template <typename... Tasks>
+    void add(Tasks...tasks);
+};
+
+inline CriticalSection::CriticalSection(size_t max_workers) :
+  Semaphore {max_workers} {
+}
+
+template <typename... Tasks>
+void CriticalSection::add(Tasks... tasks) {
+  (tasks.acquire(*this), ...);
+  (tasks.release(*this), ...);
+}
+
+
+}  // end of namespace tf. ---------------------------------------------------
+
+
diff --git a/myxpcs/include/taskflow_/algorithm/data_pipeline.hpp b/myxpcs/include/taskflow_/algorithm/data_pipeline.hpp
new file mode 100644
index 0000000..0393548
--- /dev/null
+++ b/myxpcs/include/taskflow_/algorithm/data_pipeline.hpp
@@ -0,0 +1,637 @@
+#pragma once
+
+#include "pipeline.hpp"
+
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// Class Definition: DataPipe
+// ----------------------------------------------------------------------------
+
+/**
+@class DataPipe
+
+@brief class to create a stage in a data-parallel pipeline 
+
+A data pipe represents a stage of a data-parallel pipeline. 
+A data pipe can be either @em parallel direction or @em serial direction 
+(specified by tf::PipeType) and is associated with a callable to invoke 
+by the pipeline scheduler.
+
+You need to use the template function, tf::make_data_pipe, to create 
+a data pipe. The input and output types of a tf::DataPipe should be decayed types 
+(though the library will always decay them for you using `std::decay`)
+to allow internal storage to work.
+The data will be passed by reference to your callable, at which you can take 
+it by copy or reference.
+
+@code{.cpp}
+tf::make_data_pipe<int, std::string>(
+  tf::PipeType::SERIAL, 
+  [](int& input) {return std::to_string(input + 100);}
+);
+@endcode
+
+In addition to the data, your callable can take an additional reference 
+of tf::Pipeflow in the second argument to probe the runtime information
+for a stage task, such as its line number and token number:
+
+@code{.cpp}
+tf::make_data_pipe<int, std::string>(
+  tf::PipeType::SERIAL, 
+  [](int& input, tf::Pipeflow& pf) {
+    printf("token=%lu, line=%lu\n", pf.token(), pf.line());
+    return std::to_string(input + 100);
+  }
+);
+@endcode
+
+*/
+template <typename Input, typename Output, typename C>
+class DataPipe {
+
+  template <typename... Ps>
+  friend class DataPipeline;
+
+  public:
+
+  /**
+  @brief callable type of the data pipe
+  */
+  using callable_t = C;
+
+  /**
+  @brief input type of the data pipe
+  */
+  using input_t = Input;
+
+  /**
+  @brief output type of the data pipe
+  */
+  using output_t = Output;
+
+  /**
+  @brief default constructor
+  */
+  DataPipe() = default;
+
+  /**
+  @brief constructs a data pipe
+
+  You should use the helper function, tf::make_data_pipe, 
+  to create a DataPipe object, especially when you need tf::DataPipe
+  to automatically deduce the lambda type.
+  */
+  DataPipe(PipeType d, callable_t&& callable) :
+    _type{d}, _callable{std::forward<callable_t>(callable)} {
+  }
+
+  /**
+  @brief queries the type of the data pipe
+
+  A data pipe can be either parallel (tf::PipeType::PARALLEL) or serial
+  (tf::PipeType::SERIAL).
+  */
+  PipeType type() const {
+    return _type;
+  }
+
+  /**
+  @brief assigns a new type to the data pipe
+  */
+  void type(PipeType type) {
+    _type = type;
+  }
+
+  /**
+  @brief assigns a new callable to the data pipe
+
+  @tparam U callable type
+  @param callable a callable object constructible from the callable type
+                  of this data pipe
+
+  Assigns a new callable to the pipe using universal forwarding.
+  */
+  template <typename U>
+  void callable(U&& callable) {
+    _callable = std::forward<U>(callable);
+  }
+
+  private:
+
+  PipeType _type;
+
+  callable_t _callable;
+};
+
+/**
+@brief function to construct a data pipe (tf::DataPipe)
+
+@tparam Input input data type
+@tparam Output output data type
+@tparam C callable type
+
+tf::make_data_pipe is a helper function to create a data pipe (tf::DataPipe)
+in a data-parallel pipeline (tf::DataPipeline).
+The first argument specifies the direction of the data pipe,
+either tf::PipeType::SERIAL or tf::PipeType::PARALLEL,
+and the second argument is a callable to invoke by the pipeline scheduler.
+Input and output data types are specified via template parameters,
+which the library always decays (via std::decay) for internal storage.
+The callable must take the input data type in its first argument
+and return a value of the output data type.
+
+@code{.cpp}
+tf::make_data_pipe<int, std::string>(
+  tf::PipeType::SERIAL, 
+  [](int& input) {
+    return std::to_string(input + 100);
+  }
+);
+@endcode
+
+The callable can additionally take a reference of tf::Pipeflow, 
+which allows you to query the runtime information of a stage task,
+such as its line number and token number.
+
+@code{.cpp}
+tf::make_data_pipe<int, std::string>(
+  tf::PipeType::SERIAL, 
+  [](int& input, tf::Pipeflow& pf) {
+    printf("token=%lu, line=%lu\n", pf.token(), pf.line());
+    return std::to_string(input + 100);
+  }
+);
+@endcode
+
+*/
+template <typename Input, typename Output, typename C>
+auto make_data_pipe(PipeType d, C&& callable) {
+  return DataPipe<Input, Output, C>(d, std::forward<C>(callable));
+}
+
+// ----------------------------------------------------------------------------
+// Class Definition: DataPipeline
+// ----------------------------------------------------------------------------
+
+/**
+@class DataPipeline
+
+@brief class to create a data-parallel pipeline scheduling framework
+
+@tparam Ps data pipe types
+
+Similar to tf::Pipeline, a tf::DataPipeline is a composable graph object
+for users to create a <i>data-parallel pipeline scheduling framework</i> 
+using a module task in a taskflow.
+The only difference is that tf::DataPipeline provides a data abstraction
+for users to quickly express dataflow in a pipeline.
+The following example creates a data-parallel pipeline of three stages
+that generate dataflow from `void` to `int`, `std::string`, and back to `void`.
+
+@code{.cpp}
+#include <taskflow/taskflow.hpp>
+#include <taskflow/algorithm/data_pipeline.hpp>
+
+int main() {
+
+  // data flow => void -> int -> std::string -> void
+  tf::Taskflow taskflow("pipeline");
+  tf::Executor executor;
+
+  const size_t num_lines = 4;
+
+  tf::DataPipeline pl(num_lines,
+    tf::make_data_pipe<void, int>(tf::PipeType::SERIAL, [&](tf::Pipeflow& pf) -> int{
+      if(pf.token() == 5) {
+        pf.stop();
+        return 0;
+      }
+      else {
+        return pf.token();
+      }
+    }),
+    tf::make_data_pipe<int, std::string>(tf::PipeType::SERIAL, [](int& input) {
+      return std::to_string(input + 100);
+    }),
+    tf::make_data_pipe<std::string, void>(tf::PipeType::SERIAL, [](std::string& input) {
+      std::cout << input << std::endl;
+    })
+  );
+
+  // build the pipeline graph using composition
+  taskflow.composed_of(pl).name("pipeline");
+
+  // dump the pipeline graph structure (with composition)
+  taskflow.dump(std::cout);
+
+  // run the pipeline
+  executor.run(taskflow).wait();
+
+  return 0;
+}
+@endcode
+
+The pipeline schedules five tokens over four parallel lines in a circular fashion, 
+as depicted below:
+
+@code{.shell-session}
+o -> o -> o
+|    |    |
+v    v    v
+o -> o -> o
+|    |    |
+v    v    v
+o -> o -> o
+|    |    |
+v    v    v
+o -> o -> o
+@endcode
+*/
+template <typename... Ps>
+class DataPipeline {
+
+  static_assert(sizeof...(Ps)>0, "must have at least one pipe");
+
+  /**
+  @private
+  */
+  struct Line {
+    std::atomic<size_t> join_counter;
+  };
+
+  /**
+  @private
+  */
+  struct PipeMeta {
+    PipeType type;
+  };
+
+
+  public:
+  
+  /**
+  @brief internal storage type for each data token (default std::variant)
+  */
+  using data_t = unique_variant_t<std::variant<std::conditional_t<
+    std::is_void_v<typename Ps::output_t>, 
+    std::monostate, 
+    std::decay_t<typename Ps::output_t>>...
+  >>;
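+
+  // For illustration: with the three pipes from the class-level example above
+  // (void -> int, int -> std::string, std::string -> void), this resolves to
+  //   std::variant<int, std::string, std::monostate>
+  // since void outputs map to std::monostate and duplicate types are removed.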
+
+  /**
+  @brief constructs a data-parallel pipeline object
+
+  @param num_lines the number of parallel lines
+  @param ps a list of pipes
+
+  Constructs a data-parallel pipeline of up to @c num_lines parallel lines to schedule
+  tokens through the given linear chain of pipes.
+  The first pipe must define a serial direction (tf::PipeType::SERIAL)
+  or an exception will be thrown.
+  */
+  DataPipeline(size_t num_lines, Ps&&... ps);
+
+  /**
+  @brief constructs a data-parallel pipeline object
+
+  @param num_lines the number of parallel lines
+  @param ps a tuple of pipes
+
+  Constructs a data-parallel pipeline of up to @c num_lines parallel lines to schedule
+  tokens through the given linear chain of pipes stored in a std::tuple.
+  The first pipe must define a serial direction (tf::PipeType::SERIAL)
+  or an exception will be thrown.
+  */
+  DataPipeline(size_t num_lines, std::tuple<Ps...>&& ps);
+
+  /**
+  @brief queries the number of parallel lines
+
+  The function returns the number of parallel lines given by the user
+  upon the construction of the pipeline.
+  The number of lines represents the maximum parallelism this pipeline
+  can achieve.
+  */
+  size_t num_lines() const noexcept;
+
+  /**
+  @brief queries the number of pipes
+
+  The function returns the number of pipes given by the user
+  upon the construction of the pipeline.
+  */
+  constexpr size_t num_pipes() const noexcept;
+
+  /**
+  @brief resets the pipeline
+
+  Resets the pipeline to its initial state. After a reset, the token
+  identifier starts from zero, as if the pipeline had just been
+  constructed.
+  */
+  void reset();
+
+  /**
+  @brief queries the number of generated tokens in the pipeline
+
+  The number represents the total number of scheduling tokens that have
+  been generated by the pipeline so far.
+  */
+  size_t num_tokens() const noexcept;
+
+  /**
+  @brief obtains the graph object associated with the pipeline construct
+
+  This method is primarily used as an opaque data structure for creating
+  a module task of this pipeline.
+  */
+  Graph& graph();
+
+  private:
+
+  Graph _graph;
+
+  size_t _num_tokens;
+
+  std::tuple<Ps...> _pipes;
+  std::array<PipeMeta, sizeof...(Ps)> _meta;
+  std::vector<std::array<Line, sizeof...(Ps)>> _lines;
+  std::vector<Task> _tasks;
+  std::vector<Pipeflow> _pipeflows;
+  std::vector<CachelineAligned<data_t>> _buffer;
+
+  template <size_t... I>
+  auto _gen_meta(std::tuple<Ps...>&&, std::index_sequence<I...>);
+
+  void _on_pipe(Pipeflow&, Runtime&);
+  void _build();
+};
+
+// constructor
+template <typename... Ps>
+DataPipeline<Ps...>::DataPipeline(size_t num_lines, Ps&&... ps) :
+  _pipes     {std::make_tuple(std::forward<Ps>(ps)...)},
+  _meta      {PipeMeta{ps.type()}...},
+  _lines     (num_lines),
+  _tasks     (num_lines + 1),
+  _pipeflows (num_lines),
+  _buffer    (num_lines) {
+
+  if(num_lines == 0) {
+    TF_THROW("must have at least one line");
+  }
+
+  if(std::get<0>(_pipes).type() != PipeType::SERIAL) {
+    TF_THROW("first pipe must be serial");
+  }
+
+  reset();
+  _build();
+}
+
+// constructor
+template <typename... Ps>
+DataPipeline<Ps...>::DataPipeline(size_t num_lines, std::tuple<Ps...>&& ps) :
+  _pipes     {std::forward<std::tuple<Ps...>>(ps)},
+  _meta      {_gen_meta(
+    std::forward<std::tuple<Ps...>>(ps), std::make_index_sequence<sizeof...(Ps)>{}
+  )},
+  _lines     (num_lines),
+  _tasks     (num_lines + 1),
+  _pipeflows (num_lines),
+  _buffer    (num_lines) {
+
+  if(num_lines == 0) {
+    TF_THROW("must have at least one line");
+  }
+
+  if(std::get<0>(_pipes).type() != PipeType::SERIAL) {
+    TF_THROW("first pipe must be serial");
+  }
+
+  reset();
+  _build();
+}
+
+// Function: _get_meta
+template <typename... Ps>
+template <size_t... I>
+auto DataPipeline<Ps...>::_gen_meta(std::tuple<Ps...>&& ps, std::index_sequence<I...>) {
+  return std::array{PipeMeta{std::get<I>(ps).type()}...};
+}
+
+// Function: num_lines
+template <typename... Ps>
+size_t DataPipeline<Ps...>::num_lines() const noexcept {
+  return _pipeflows.size();
+}
+
+// Function: num_pipes
+template <typename... Ps>
+constexpr size_t DataPipeline<Ps...>::num_pipes() const noexcept {
+  return sizeof...(Ps);
+}
+
+// Function: num_tokens
+template <typename... Ps>
+size_t DataPipeline<Ps...>::num_tokens() const noexcept {
+  return _num_tokens;
+}
+
+// Function: graph
+template <typename... Ps>
+Graph& DataPipeline<Ps...>::graph() {
+  return _graph;
+}
+
+// Function: reset
+template <typename... Ps>
+void DataPipeline<Ps...>::reset() {
+
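+  // Join-counter scheme (explanatory note; assumes PipeType::SERIAL == 2 and
+  // PipeType::PARALLEL == 1, as in upstream Taskflow):
+  //   line 0, pipe 0         : 0 -> the first token can start immediately
+  //   line 0, pipe f > 0     : 1 -> waits only on the previous pipe of the same line
+  //   line l > 0, pipe 0     : SERIAL - 1 = 1 -> waits on pipe 0 of the previous line
+  //   line l > 0, pipe f > 0 : the pipe's type value, 2 (serial) or 1 (parallel)
+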
+  _num_tokens = 0;
+
+  for(size_t l = 0; l<num_lines(); l++) {
+    _pipeflows[l]._pipe = 0;
+    _pipeflows[l]._line = l;
+  }
+
+  _lines[0][0].join_counter.store(0, std::memory_order_relaxed);
+
+  for(size_t l=1; l<num_lines(); l++) {
+    for(size_t f=1; f<num_pipes(); f++) {
+      _lines[l][f].join_counter.store(
+        static_cast<size_t>(_meta[f].type), std::memory_order_relaxed
+      );
+    }
+  }
+
+  for(size_t f=1; f<num_pipes(); f++) {
+    _lines[0][f].join_counter.store(1, std::memory_order_relaxed);
+  }
+
+  for(size_t l=1; l<num_lines(); l++) {
+    _lines[l][0].join_counter.store(
+      static_cast<size_t>(_meta[0].type) - 1, std::memory_order_relaxed
+    );
+  }
+}
+
+// Procedure: _on_pipe
+template <typename... Ps>
+void DataPipeline<Ps...>::_on_pipe(Pipeflow& pf, Runtime&) {
+
+  visit_tuple([&](auto&& pipe){
+
+    using data_pipe_t = std::decay_t<decltype(pipe)>;
+    using callable_t  = typename data_pipe_t::callable_t;
+    using input_t     = std::decay_t<typename data_pipe_t::input_t>;
+    using output_t    = std::decay_t<typename data_pipe_t::output_t>;
+    
+    // first pipe
+    if constexpr (std::is_invocable_v<callable_t, Pipeflow&>) {
+      // [](tf::Pipeflow&) -> void {}, i.e., we only have one pipe
+      if constexpr (std::is_void_v<output_t>) {
+        pipe._callable(pf);
+      // [](tf::Pipeflow&) -> output_t {}
+      } else {
+        _buffer[pf._line].data = pipe._callable(pf);
+      }
+    }
+    // other pipes without pipeflow in the second argument
+    else if constexpr (std::is_invocable_v<callable_t, std::add_lvalue_reference_t<input_t> >) {
+      // [](input_t&) -> void {}, i.e., the last pipe
+      if constexpr (std::is_void_v<output_t>) {
+        pipe._callable(std::get<input_t>(_buffer[pf._line].data));
+      // [](input_t&) -> output_t {}
+      } else {
+        _buffer[pf._line].data = pipe._callable(
+          std::get<input_t>(_buffer[pf._line].data)
+        );
+      }
+    }
+    // other pipes with pipeflow in the second argument
+    else if constexpr (std::is_invocable_v<callable_t, input_t&, Pipeflow&>) {
+      // [](input_t&, tf::Pipeflow&) -> void {}
+      if constexpr (std::is_void_v<output_t>) {
+        pipe._callable(std::get<input_t>(_buffer[pf._line].data), pf);
+      // [](input_t&, tf::Pipeflow&) -> output_t {}
+      } else {
+        _buffer[pf._line].data = pipe._callable(
+          std::get<input_t>(_buffer[pf._line].data), pf
+        );
+      }
+    }
+    //else if constexpr(std::is_invocable_v<callable_t, Pipeflow&, Runtime&>) {
+    //  pipe._callable(pf, rt);
+    //}
+    else {
+      static_assert(dependent_false_v<callable_t>, "un-supported pipe callable type");
+    }
+  }, _pipes, pf._pipe);
+}
+
+// Procedure: _build
+template <typename... Ps>
+void DataPipeline<Ps...>::_build() {
+
+  using namespace std::literals::string_literals;
+
+  FlowBuilder fb(_graph);
+
+  // init task
+  _tasks[0] = fb.emplace([this]() {
+    return static_cast<int>(_num_tokens % num_lines());
+  }).name("cond");
+
+  // line task
+  for(size_t l = 0; l < num_lines(); l++) {
+
+    _tasks[l + 1] = fb.emplace([this, l] (tf::Runtime& rt) mutable {
+
+      auto pf = &_pipeflows[l];
+
+      pipeline:
+
+      _lines[pf->_line][pf->_pipe].join_counter.store(
+        static_cast<size_t>(_meta[pf->_pipe].type), std::memory_order_relaxed
+      );
+
+      if (pf->_pipe == 0) {
+        pf->_token = _num_tokens;
+        if (pf->_stop = false, _on_pipe(*pf, rt); pf->_stop == true) {
+          // here, the pipeline is not stopped yet because other
+          // lines of tasks may still be running their last stages
+          return;
+        }
+        ++_num_tokens;
+      }
+      else {
+        _on_pipe(*pf, rt);
+      }
+
+      size_t c_f = pf->_pipe;
+      size_t n_f = (pf->_pipe + 1) % num_pipes();
+      size_t n_l = (pf->_line + 1) % num_lines();
+
+      pf->_pipe = n_f;
+
+      // ---- scheduling starts here ----
+      // Notice that the shared variable f must not be changed after this
+      // point because it can result in data race due to the following
+      // condition:
+      //
+      // a -> b
+      // |    |
+      // v    v
+      // c -> d
+      //
+      // d will be spawned by either c or b, so if c changes f but b spawns d
+      // then data race on f will happen
+
+      std::array<int, 2> retval;
+      size_t n = 0;
+
+      // downward dependency
+      if(_meta[c_f].type == PipeType::SERIAL &&
+         _lines[n_l][c_f].join_counter.fetch_sub(
+           1, std::memory_order_acq_rel) == 1
+        ) {
+        retval[n++] = 1;
+      }
+
+      // forward dependency
+      if(_lines[pf->_line][n_f].join_counter.fetch_sub(
+          1, std::memory_order_acq_rel) == 1
+        ) {
+        retval[n++] = 0;
+      }
+
+      // notice that the task index starts from 1
+      switch(n) {
+        case 2: {
+          rt.schedule(_tasks[n_l+1]);
+          goto pipeline;
+        }
+        case 1: {
+          if (retval[0] == 1) {
+            pf = &_pipeflows[n_l];
+          }
+          goto pipeline;
+        }
+      }
+    }).name("rt-"s + std::to_string(l));
+
+    _tasks[0].precede(_tasks[l+1]);
+  }
+}
+
+
+}  // end of namespace tf -----------------------------------------------------
+
+
+
+
+
diff --git a/myxpcs/include/taskflow_/algorithm/find.hpp b/myxpcs/include/taskflow_/algorithm/find.hpp
new file mode 100644
index 0000000..5a52876
--- /dev/null
+++ b/myxpcs/include/taskflow_/algorithm/find.hpp
@@ -0,0 +1,551 @@
+#pragma once
+
+#include "launch.hpp"
+
+namespace tf {
+
+namespace detail {
+
+// Function: find_if_loop
+template <typename Iterator, typename Predicate>
+TF_FORCE_INLINE bool find_if_loop(
+  std::atomic<size_t>& offset, 
+  Iterator& beg,
+  size_t& prev_e,
+  size_t  curr_b, 
+  size_t  curr_e,
+  Predicate&& predicate
+) {
+  // early prune
+  if(offset.load(std::memory_order_relaxed) < curr_b) {
+    return true;
+  }
+  std::advance(beg, curr_b - prev_e);
+  for(size_t x = curr_b; x<curr_e; x++) {
+    if(predicate(*beg++)) {
+      atomic_min(offset, x);
+      return true;
+    }
+  }
+  prev_e = curr_e;
+  return false;
+}
+
+// Function: find_if_not_loop
+template <typename Iterator, typename Predicate>
+TF_FORCE_INLINE bool find_if_not_loop(
+  std::atomic<size_t>& offset, 
+  Iterator& beg,
+  size_t& prev_e,
+  size_t  curr_b, 
+  size_t  curr_e,
+  Predicate&& predicate
+) {
+
+  // early prune
+  if(offset.load(std::memory_order_relaxed) < curr_b) {
+    return true;
+  }
+  std::advance(beg, curr_b - prev_e);
+  for(size_t x = curr_b; x<curr_e; x++) {
+    if(!predicate(*beg++)) {
+      atomic_min(offset, x);
+      return true;
+    }
+  }
+  prev_e = curr_e;
+  return false;
+}
+
+}  // namespace detail --------------------------------------------------------
+
+// Function: make_find_if_task
+template <typename B, typename E, typename T, typename UOP, typename P = GuidedPartitioner>
+TF_FORCE_INLINE auto make_find_if_task(
+  B first, E last, T& result, UOP predicate, P&& part = P()
+) {
+  
+  using B_t = std::decay_t<unwrap_ref_decay_t<B>>;
+  using E_t = std::decay_t<unwrap_ref_decay_t<E>>;
+  using namespace std::string_literals;
+
+  return 
+  [b=first, e=last, predicate, &result, part=std::forward<P>(part)] 
+  (Runtime& rt) mutable {
+
+    // fetch the stateful values
+    B_t beg = b;
+    E_t end = e;
+
+    size_t W = rt.executor().num_workers();
+    size_t N = std::distance(beg, end);
+
+    // only myself - no need to spawn another graph
+    if(W <= 1 || N <= part.chunk_size()) {
+      result = std::find_if(beg, end, predicate);
+      return;
+    }
+
+    if(N < W) {
+      W = N;
+    }
+    
+    std::atomic<size_t> offset(N);
+    
+    // static partitioner
+    if constexpr(std::is_same_v<std::decay_t<P>, StaticPartitioner>) {
+
+      size_t chunk_size;
+
+      for(size_t w=0, curr_b=0; w<W && curr_b < N; ++w, curr_b += chunk_size) {
+      
+        chunk_size = part.adjusted_chunk_size(N, W, w);
+
+        launch_loop(W, w, rt,
+          [N, W, curr_b, chunk_size, beg, &predicate, &offset, &part] 
+          () mutable {
+            part.loop_until(N, W, curr_b, chunk_size,
+              [&, prev_e=size_t{0}](size_t part_b, size_t part_e) mutable {
+                return detail::find_if_loop(
+                  offset, beg, prev_e, part_b, part_e, predicate
+                );
+              }
+            ); 
+          }
+        );
+      }
+
+      rt.corun_all();
+    }
+    // dynamic partitioner
+    else {
+      std::atomic<size_t> next(0);
+      launch_loop(N, W, rt, next, part, 
+        [N, W, beg, &predicate, &offset, &next, &part] () mutable {
+          part.loop_until(N, W, next, 
+            [&, prev_e=size_t{0}](size_t curr_b, size_t curr_e) mutable {
+              return detail::find_if_loop(
+                offset, beg, prev_e, curr_b, curr_e, predicate
+              );
+            }
+          ); 
+        }
+      );
+    }
+
+    // update the result iterator by the offset
+    result = std::next(beg, offset.load(std::memory_order_relaxed));
+  };
+}
+
+// Function: make_find_if_not_task
+template <typename B, typename E, typename T, typename UOP, typename P = GuidedPartitioner>
+TF_FORCE_INLINE auto make_find_if_not_task(
+  B first, E last, T& result, UOP predicate, P&& part = P()
+) {
+  
+  using B_t = std::decay_t<unwrap_ref_decay_t<B>>;
+  using E_t = std::decay_t<unwrap_ref_decay_t<E>>;
+  using namespace std::string_literals;
+
+  return
+  [b=first, e=last, predicate, &result, part=std::forward<P>(part)] 
+  (Runtime& rt) mutable {
+
+    // fetch the stateful values
+    B_t beg = b;
+    E_t end = e;
+
+    size_t W = rt.executor().num_workers();
+    size_t N = std::distance(beg, end);
+
+    // only myself - no need to spawn another graph
+    if(W <= 1 || N <= part.chunk_size()) {
+      result = std::find_if_not(beg, end, predicate);
+      return;
+    }
+
+    if(N < W) {
+      W = N;
+    }
+    
+    std::atomic<size_t> offset(N);
+    
+    // static partitioner
+    if constexpr(std::is_same_v<std::decay_t<P>, StaticPartitioner>) {
+
+      size_t chunk_size;
+
+      for(size_t w=0, curr_b=0; w<W && curr_b < N; ++w, curr_b += chunk_size) {
+      
+        chunk_size = part.adjusted_chunk_size(N, W, w);
+
+        launch_loop(W, w, rt,
+          [N, W, curr_b, chunk_size, beg, &predicate, &offset, &part] () mutable {
+            part.loop_until(N, W, curr_b, chunk_size,
+              [&, prev_e=size_t{0}](size_t part_b, size_t part_e) mutable {
+                return detail::find_if_not_loop(
+                  offset, beg, prev_e, part_b, part_e, predicate
+                );
+              }
+            ); 
+          }
+        );
+      }
+
+      rt.corun_all();
+    }
+    // dynamic partitioner
+    else {
+      std::atomic<size_t> next(0);
+      launch_loop(N, W, rt, next, part,
+        [N, W, beg, &predicate, &offset, &next, &part] () mutable {
+          part.loop_until(N, W, next, 
+            [&, prev_e=size_t{0}](size_t curr_b, size_t curr_e) mutable {
+              return detail::find_if_not_loop(
+                offset, beg, prev_e, curr_b, curr_e, predicate
+              );
+            }
+          ); 
+        }
+      );
+    }
+
+    // update the result iterator by the offset
+    result = std::next(beg, offset.load(std::memory_order_relaxed));
+  };
+}
+
+// Function: make_min_element_task
+template <typename B, typename E, typename T, typename C, typename P = GuidedPartitioner>
+TF_FORCE_INLINE auto make_min_element_task(
+  B first, E last, T& result, C comp, P&& part = P()
+) {
+
+  using B_t = std::decay_t<unwrap_ref_decay_t<B>>;
+  using E_t = std::decay_t<unwrap_ref_decay_t<E>>;
+  using namespace std::string_literals;
+
+  return 
+  [b=first, e=last, &result, comp, part=std::forward<P>(part)] 
+  (Runtime& rt) mutable {
+
+    // fetch the iterator values
+    B_t beg = b;
+    E_t end = e;
+
+    size_t W = rt.executor().num_workers();
+    size_t N = std::distance(beg, end);
+
+    // only myself - no need to spawn another graph
+    if(W <= 1 || N <= part.chunk_size()) {
+      result = std::min_element(beg, end, comp);
+      return;
+    }
+
+    if(N < W) {
+      W = N;
+    }
+
+    std::mutex mutex;
+    
+    // initialize the result to the first element
+    result = beg++;
+    N--;
+
+    // static partitioner
+    if constexpr(std::is_same_v<std::decay_t<P>, StaticPartitioner>) {
+      
+      size_t chunk_size;
+
+      for(size_t w=0, curr_b=0; w<W && curr_b < N; ++w, curr_b += chunk_size) {
+        
+        // we force the chunk size to be at least two so the local result
+        // (smallest) can be seeded from the first comparison without a copy
+        chunk_size = std::max(size_t{2}, part.adjusted_chunk_size(N, W, w));
+        
+        launch_loop(W, w, rt,
+        [beg, curr_b, N, W, chunk_size, &comp, &mutex, &result, &part] () mutable {
+
+          std::advance(beg, curr_b);
+
+          if(N - curr_b == 1) {
+            std::lock_guard<std::mutex> lock(mutex);
+            if(comp(*beg, *result)) {
+              result = beg;
+            }
+            return;
+          }
+
+          auto beg1 = beg++;
+          auto beg2 = beg++;
+          T smallest = comp(*beg1, *beg2) ? beg1 : beg2;
+        
+          // loop reduce
+          part.loop(N, W, curr_b, chunk_size,
+            [&, prev_e=curr_b+2](size_t part_b, size_t part_e) mutable {
+
+              if(part_b > prev_e) {
+                std::advance(beg, part_b - prev_e);
+              }
+              else {
+                part_b = prev_e;
+              }
+
+              for(size_t x=part_b; x<part_e; x++, beg++) {
+                if(comp(*beg, *smallest)) {
+                  smallest = beg;
+                }
+              }
+              prev_e = part_e;
+            }
+          ); 
+          
+          // final reduce
+          std::lock_guard<std::mutex> lock(mutex);
+          if(comp(*smallest, *result)) {
+            result = smallest;
+          }
+        });
+      }
+      rt.corun_all();
+    }
+    // dynamic partitioner
+    else {
+      std::atomic<size_t> next(0);
+      launch_loop(N, W, rt, next, part, 
+        [beg, N, W, &next, &comp, &mutex, &result, &part] () mutable {
+          // pre-reduce
+          size_t s0 = next.fetch_add(2, std::memory_order_relaxed);
+
+          if(s0 >= N) {
+            return;
+          }
+
+          std::advance(beg, s0);
+
+          if(N - s0 == 1) {
+            std::lock_guard<std::mutex> lock(mutex);
+            if(comp(*beg, *result)) {
+              result = beg;
+            }
+            return;
+          }
+
+          auto beg1 = beg++;
+          auto beg2 = beg++;
+
+          T smallest = comp(*beg1, *beg2) ? beg1 : beg2;
+          
+          // loop reduce
+          part.loop(N, W, next, 
+            [&, prev_e=s0+2](size_t part_b, size_t part_e) mutable {
+              std::advance(beg, part_b - prev_e);
+              for(size_t x=part_b; x<part_e; x++, beg++) {
+                if(comp(*beg, *smallest)) {
+                  smallest = beg;
+                }
+              }
+              prev_e = part_e;
+            }
+          ); 
+          
+          // final reduce
+          std::lock_guard<std::mutex> lock(mutex);
+          if(comp(*smallest, *result)) {
+            result = smallest;
+          }
+        }
+      );
+    }
+  };
+}
+
+// Function: make_max_element_task
+template <typename B, typename E, typename T, typename C, typename P = GuidedPartitioner>
+TF_FORCE_INLINE auto make_max_element_task(
+  B first, E last, T& result, C comp, P&& part = P()
+) {
+
+  using B_t = std::decay_t<unwrap_ref_decay_t<B>>;
+  using E_t = std::decay_t<unwrap_ref_decay_t<E>>;
+  using namespace std::string_literals;
+
+  return 
+  [b=first, e=last, &result, comp, part=std::forward<P>(part)] 
+  (Runtime& rt) mutable {
+
+    // fetch the iterator values
+    B_t beg = b;
+    E_t end = e;
+
+    size_t W = rt.executor().num_workers();
+    size_t N = std::distance(beg, end);
+
+    // only myself - no need to spawn another graph
+    if(W <= 1 || N <= part.chunk_size()) {
+      result = std::max_element(beg, end, comp);
+      return;
+    }
+
+    if(N < W) {
+      W = N;
+    }
+
+    std::mutex mutex;
+    
+    // initialize the result to the first element
+    result = beg++;
+    N--;
+
+    // static partitioner
+    if constexpr(std::is_same_v<std::decay_t<P>, StaticPartitioner>) {
+      
+      size_t chunk_size;
+
+      for(size_t w=0, curr_b=0; w<W && curr_b < N; ++w, curr_b += chunk_size) {
+        
+        // we force the chunk size to be at least two so the local result
+        // (largest) can be seeded from the first comparison without a copy
+        chunk_size = std::max(size_t{2}, part.adjusted_chunk_size(N, W, w));
+        
+        launch_loop(W, w, rt,
+        [beg, curr_b, N, W, chunk_size, &comp, &mutex, &result, &part] () mutable {
+
+          std::advance(beg, curr_b);
+
+          if(N - curr_b == 1) {
+            std::lock_guard<std::mutex> lock(mutex);
+            if(comp(*result, *beg)) {
+              result = beg;
+            }
+            return;
+          }
+
+          auto beg1 = beg++;
+          auto beg2 = beg++;
+          T largest = comp(*beg1, *beg2) ? beg2 : beg1;
+        
+          // loop reduce
+          part.loop(N, W, curr_b, chunk_size,
+            [&, prev_e=curr_b+2](size_t part_b, size_t part_e) mutable {
+
+              if(part_b > prev_e) {
+                std::advance(beg, part_b - prev_e);
+              }
+              else {
+                part_b = prev_e;
+              }
+
+              for(size_t x=part_b; x<part_e; x++, beg++) {
+                if(comp(*largest, *beg)) {
+                  largest = beg;
+                }
+              }
+              prev_e = part_e;
+            }
+          ); 
+          
+          // final reduce
+          std::lock_guard<std::mutex> lock(mutex);
+          if(comp(*result, *largest)) {
+            result = largest;
+          }
+        });
+      }
+      rt.corun_all();
+    }
+    // dynamic partitioner
+    else {
+      std::atomic<size_t> next(0);
+      launch_loop(N, W, rt, next, part,
+        [beg, N, W, &next, &comp, &mutex, &result, &part] () mutable {
+          // pre-reduce
+          size_t s0 = next.fetch_add(2, std::memory_order_relaxed);
+
+          if(s0 >= N) {
+            return;
+          }
+
+          std::advance(beg, s0);
+
+          if(N - s0 == 1) {
+            std::lock_guard<std::mutex> lock(mutex);
+            if(comp(*result, *beg)) {
+              result = beg;
+            }
+            return;
+          }
+
+          auto beg1 = beg++;
+          auto beg2 = beg++;
+
+          T largest = comp(*beg1, *beg2) ? beg2 : beg1;
+          
+          // loop reduce
+          part.loop(N, W, next, 
+            [&, prev_e=s0+2](size_t part_b, size_t part_e) mutable {
+              std::advance(beg, part_b - prev_e);
+              for(size_t x=part_b; x<part_e; x++, beg++) {
+                if(comp(*largest, *beg)) {
+                  largest = beg;
+                }
+              }
+              prev_e = part_e;
+            }
+          ); 
+          
+          // final reduce
+          std::lock_guard<std::mutex> lock(mutex);
+          if(comp(*result, *largest)) {
+            result = largest;
+          }
+        }
+      );
+    }
+  };
+}
+
+
+
+// Function: find_if
+template <typename B, typename E, typename T, typename UOP, typename P>
+Task tf::FlowBuilder::find_if(B first, E last, T& result, UOP predicate, P&& part) {
+  return emplace(make_find_if_task(
+    first, last, result, predicate, std::forward<P>(part)
+  ));
+}
+
+// Function: find_if_not
+template <typename B, typename E, typename T, typename UOP, typename P>
+Task tf::FlowBuilder::find_if_not(B first, E last, T& result, UOP predicate, P&& part) {
+  return emplace(make_find_if_not_task(
+    first, last, result, predicate, std::forward<P>(part)
+  ));
+}
+
+// ----------------------------------------------------------------------------
+// min_element
+// ----------------------------------------------------------------------------
+
+// Function: min_element
+template <typename B, typename E, typename T, typename C, typename P>
+Task FlowBuilder::min_element(B first, E last, T& result, C comp, P&& part) {
+  return emplace(make_min_element_task(
+    first, last, result, comp, std::forward<P>(part)
+  ));
+}
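+
+// Usage sketch (illustrative; `taskflow`, `executor`, and the vector are assumed
+// to be declared by the caller, with the default partitioner taken from the
+// declaration in flow_builder.hpp):
+//
+//   std::vector<int> v = {7, 1, 9, 4};
+//   std::vector<int>::iterator it;
+//   taskflow.min_element(v.begin(), v.end(), it, std::less<int>());
+//   executor.run(taskflow).wait();   // *it == 1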
+
+// ----------------------------------------------------------------------------
+// max_element
+// ----------------------------------------------------------------------------
+
+// Function: max_element
+template <typename B, typename E, typename T, typename C, typename P>
+Task FlowBuilder::max_element(B first, E last, T& result, C comp, P&& part) {
+  return emplace(make_max_element_task(
+    first, last, result, comp, std::forward<P>(part)
+  ));
+}
+
+}  // end of namespace tf -----------------------------------------------------
diff --git a/myxpcs/include/taskflow_/algorithm/for_each.hpp b/myxpcs/include/taskflow_/algorithm/for_each.hpp
new file mode 100644
index 0000000..10e0a78
--- /dev/null
+++ b/myxpcs/include/taskflow_/algorithm/for_each.hpp
@@ -0,0 +1,171 @@
+#pragma once
+
+#include "launch.hpp"
+
+namespace tf {
+
+// Function: make_for_each_task
+template <typename B, typename E, typename C, typename P = GuidedPartitioner>
+TF_FORCE_INLINE auto make_for_each_task(B b, E e, C c, P&& part = P()) {
+  
+  using B_t = std::decay_t<unwrap_ref_decay_t<B>>;
+  using E_t = std::decay_t<unwrap_ref_decay_t<E>>;
+  using namespace std::string_literals;
+
+  return [b, e, c, part=std::forward<P>(part)] (Runtime& rt) mutable {
+
+    // fetch the stateful values
+    B_t beg = b;
+    E_t end = e;
+
+    size_t W = rt.executor().num_workers();
+    size_t N = std::distance(beg, end);
+
+    // only myself - no need to spawn another graph
+    if(W <= 1 || N <= part.chunk_size()) {
+      std::for_each(beg, end, c);
+      return;
+    }
+
+    if(N < W) {
+      W = N;
+    }
+    
+    // static partitioner
+    if constexpr(std::is_same_v<std::decay_t<P>, StaticPartitioner>) {
+      size_t chunk_size;
+      for(size_t w=0, curr_b=0; w<W && curr_b < N; ++w, curr_b += chunk_size) {
+        chunk_size = part.adjusted_chunk_size(N, W, w);
+        launch_loop(W, w, rt, [=, &c, &part] () mutable {
+          part.loop(N, W, curr_b, chunk_size,
+            [&, prev_e=size_t{0}](size_t part_b, size_t part_e) mutable {
+              std::advance(beg, part_b - prev_e);
+              for(size_t x = part_b; x<part_e; x++) {
+                c(*beg++);
+              }
+              prev_e = part_e;
+            }
+          ); 
+        });
+      }
+
+      rt.corun_all();
+    }
+    // dynamic partitioner
+    else {
+      std::atomic<size_t> next(0);
+      launch_loop(N, W, rt, next, part, [=, &c, &next, &part] () mutable {
+        part.loop(N, W, next, 
+          [&, prev_e=size_t{0}](size_t part_b, size_t part_e) mutable {
+            std::advance(beg, part_b - prev_e);
+            for(size_t x = part_b; x<part_e; x++) {
+              c(*beg++);
+            }
+            prev_e = part_e;
+          }
+        ); 
+      });
+    }
+  };
+}
+
+// Function: make_for_each_index_task
+template <typename B, typename E, typename S, typename C, typename P = GuidedPartitioner>
+TF_FORCE_INLINE auto make_for_each_index_task(B b, E e, S s, C c, P&& part = P()) {
+
+  using namespace std::string_literals;
+
+  using B_t = std::decay_t<unwrap_ref_decay_t<B>>;
+  using E_t = std::decay_t<unwrap_ref_decay_t<E>>;
+  using S_t = std::decay_t<unwrap_ref_decay_t<S>>;
+
+  return [b, e, s, c, part=std::forward<P>(part)] (Runtime& rt) mutable {
+
+    // fetch the iterator values
+    B_t beg = b;
+    E_t end = e;
+    S_t inc = s;
+    
+    // nothing to be done if the range is invalid
+    if(is_range_invalid(beg, end, inc)) {
+      return;
+    }
+
+    size_t W = rt.executor().num_workers();
+    size_t N = distance(beg, end, inc);
+
+    // only myself - no need to spawn another graph
+    if(W <= 1 || N <= part.chunk_size()) {
+      for(size_t x=0; x<N; x++, beg+=inc) {
+        c(beg);
+      }
+      return;
+    }
+
+    if(N < W) {
+      W = N;
+    }
+    
+    // static partitioner
+    if constexpr(std::is_same_v<std::decay_t<P>, StaticPartitioner>) {
+      size_t chunk_size;
+      for(size_t w=0, curr_b=0; w<W && curr_b < N; ++w, curr_b += chunk_size) {
+        chunk_size = part.adjusted_chunk_size(N, W, w);
+        launch_loop(W, w, rt, [=, &c, &part] () mutable {
+          part.loop(N, W, curr_b, chunk_size,
+            [&](size_t part_b, size_t part_e) {
+              auto idx = static_cast<B_t>(part_b) * inc + beg;
+              for(size_t x=part_b; x<part_e; x++, idx += inc) {
+                c(idx);
+              }
+            }
+          ); 
+        });
+      }
+
+      rt.corun_all();
+    }
+    // dynamic partitioner
+    else {
+      std::atomic<size_t> next(0);
+      launch_loop(N, W, rt, next, part, [=, &c, &next, &part] () mutable {
+        part.loop(N, W, next, 
+          [&](size_t part_b, size_t part_e) {
+            auto idx = static_cast<B_t>(part_b) * inc + beg;
+            for(size_t x=part_b; x<part_e; x++, idx += inc) {
+              c(idx);
+            }
+          }
+        ); 
+      });
+    }
+  };
+}
+
+// ----------------------------------------------------------------------------
+// for_each
+// ----------------------------------------------------------------------------
+
+// Function: for_each
+template <typename B, typename E, typename C, typename P>
+Task FlowBuilder::for_each(B beg, E end, C c, P&& part) {
+  return emplace(
+    make_for_each_task(beg, end, c, std::forward<P>(part))
+  );
+}
+
+// ----------------------------------------------------------------------------
+// for_each_index
+// ----------------------------------------------------------------------------
+
+// Function: for_each_index
+template <typename B, typename E, typename S, typename C, typename P>
+Task FlowBuilder::for_each_index(B beg, E end, S inc, C c, P&& part) {
+  return emplace(
+    make_for_each_index_task(beg, end, inc, c, std::forward<P>(part))
+  );
+}
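+
+// A minimal usage sketch of the two algorithms above (a tf::Taskflow object
+// `taskflow` and a std::vector<int> `data` are assumed to exist elsewhere;
+// the partitioner argument may be omitted where default overloads apply):
+//
+//   // increment every element in parallel (guided scheduling, chunk size 2)
+//   taskflow.for_each(data.begin(), data.end(),
+//                     [](int& v){ ++v; }, tf::GuidedPartitioner(2));
+//
+//   // zero out every second element in parallel (static scheduling)
+//   taskflow.for_each_index(size_t{0}, data.size(), size_t{2},
+//                           [&](size_t i){ data[i] = 0; },
+//                           tf::StaticPartitioner());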
+
+
+}  // end of namespace tf -----------------------------------------------------
+
diff --git a/myxpcs/include/taskflow_/algorithm/launch.hpp b/myxpcs/include/taskflow_/algorithm/launch.hpp
new file mode 100644
index 0000000..3598fd5
--- /dev/null
+++ b/myxpcs/include/taskflow_/algorithm/launch.hpp
@@ -0,0 +1,58 @@
+#pragma once
+
+#include "../core/async.hpp"
+
+namespace tf {
+
+// Function: launch_loop
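+// Cooperatively runs `loop` across up to W workers that share the atomic
+// counter `next`: all but the last worker are offloaded as named silent-async
+// tasks, the tail (or last) chunk runs inline on the calling worker, and
+// corun_all() waits for every spawned task to finish.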
+template <typename P, typename Loop>
+TF_FORCE_INLINE void launch_loop(
+  size_t N, 
+  size_t W, 
+  Runtime& rt, 
+  std::atomic<size_t>& next, 
+  P&& part, 
+  Loop&& loop
+) {
+
+  //static_assert(std::is_lvalue_reference_v<Loop>, "");
+  
+  using namespace std::string_literals;
+
+  for(size_t w=0; w<W; w++) {
+    auto r = N - next.load(std::memory_order_relaxed);
+    // no more loop work to do - finished by previous async tasks
+    if(!r) {
+      break;
+    }
+    // tail optimization
+    if(r <= part.chunk_size() || w == W-1) {
+      loop();
+      break;
+    }
+    else {
+      rt.silent_async_unchecked("loop-"s + std::to_string(w), loop);
+    }
+  }
+      
+  rt.corun_all();
+}
+
+// Function: launch_loop
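+// Runs `loop` for worker index w: the last worker (w == W-1) executes it
+// inline on the calling worker, while every other worker is offloaded as a
+// named silent-async task on the runtime.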
+template <typename Loop>
+TF_FORCE_INLINE void launch_loop(
+  size_t W,
+  size_t w,
+  Runtime& rt, 
+  Loop&& loop 
+) {
+  using namespace std::string_literals;
+  if(w == W-1) {
+    loop();
+  }
+  else {
+    rt.silent_async_unchecked("loop-"s + std::to_string(w), loop);
+  }
+}
+
+}  // end of namespace tf -----------------------------------------------------
diff --git a/myxpcs/include/taskflow_/algorithm/partitioner.hpp b/myxpcs/include/taskflow_/algorithm/partitioner.hpp
new file mode 100644
index 0000000..4a253fa
--- /dev/null
+++ b/myxpcs/include/taskflow_/algorithm/partitioner.hpp
@@ -0,0 +1,543 @@
+// reference:
+// - gomp: https://github.com/gcc-mirror/gcc/blob/master/libgomp/iter.c
+// - komp: https://github.com/llvm-mirror/openmp/blob/master/runtime/src/kmp_dispatch.cpp
+
+#pragma once
+
+/**
+@file partitioner.hpp
+@brief partitioner include file
+*/
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// Partitioner Base
+// ----------------------------------------------------------------------------
+
+/**
+@class PartitionerBase
+
+@brief class to derive a partitioner for scheduling parallel algorithms
+
+The class provides base methods to derive a partitioner that can be used
+to schedule parallel iterations (e.g., tf::Taskflow::for_each).
+
+A partitioner defines the scheduling method for running parallel algorithms,
+such as tf::Taskflow::for_each, tf::Taskflow::reduce, and so on.
+By default, we provide the following partitioners:
+
++ tf::GuidedPartitioner to enable guided scheduling algorithm of adaptive chunk size
++ tf::DynamicPartitioner to enable dynamic scheduling algorithm of equal chunk size
++ tf::StaticPartitioner to enable static scheduling algorithm of static chunk size
++ tf::RandomPartitioner to enable random scheduling algorithm of random chunk size
+
+Depending on the application, the partitioning algorithm can have a significant
+impact on performance.
+For example, if a parallel-iteration workload contains a regular work unit per
+iteration, tf::StaticPartitioner can deliver the best performance.
+On the other hand, if the work unit per iteration is irregular and unbalanced,
+tf::GuidedPartitioner or tf::DynamicPartitioner can outperform tf::StaticPartitioner.
+In most situations, tf::GuidedPartitioner can deliver decent performance and
+is thus used as our default partitioner.
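+
+The following sketch (a hypothetical tf::Taskflow object @c taskflow and a
+std::vector<int> @c data are assumed to exist) passes different partitioners
+to tf::Taskflow::for_each:
+
+@code{.cpp}
+// guided scheduling with a chunk size of 2
+taskflow.for_each(data.begin(), data.end(), [](int i){}, tf::GuidedPartitioner(2));
+
+// static scheduling that evenly splits iterations among workers
+taskflow.for_each(data.begin(), data.end(), [](int i){}, tf::StaticPartitioner(0));
+@endcode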
+*/
+class PartitionerBase {
+
+  public:
+
+  /**
+  @brief default constructor
+  */
+  PartitionerBase() = default;
+
+  /**
+  @brief construct a partitioner with the given chunk size
+  */
+  explicit PartitionerBase(size_t chunk_size) : _chunk_size {chunk_size} {}
+
+  /**
+  @brief query the chunk size of this partitioner
+  */
+  size_t chunk_size() const { return _chunk_size; }
+  
+  /**
+  @brief update the chunk size of this partitioner
+  */
+  void chunk_size(size_t cz) { _chunk_size = cz; }
+
+  protected:
+  
+  /**
+  @brief chunk size 
+  */
+  size_t _chunk_size{0};
+};
+
+// ----------------------------------------------------------------------------
+// Guided Partitioner
+// ----------------------------------------------------------------------------
+  
+/**
+@class GuidedPartitioner
+
+@brief class to construct a guided partitioner for scheduling parallel algorithms
+
+The size of a partition is proportional to the number of unassigned iterations 
+divided by the number of workers, 
+and the size will gradually decrease to the given chunk size.
+The last partition may be smaller than the chunk size.
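+
+A sketch of selecting this partitioner for a parallel iteration
+(a tf::Taskflow object @c taskflow and a std::vector<int> @c data are assumed):
+
+@code{.cpp}
+// guided scheduling with a chunk size of 4
+taskflow.for_each(data.begin(), data.end(), [](int& i){ ++i; }, tf::GuidedPartitioner(4));
+@endcode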
+*/
+class GuidedPartitioner : public PartitionerBase {
+
+  public:
+  
+  /**
+  @brief default constructor
+  */
+  GuidedPartitioner() : PartitionerBase{1} {}
+
+  /**
+  @brief construct a guided partitioner with the given chunk size
+  */
+  explicit GuidedPartitioner(size_t sz) : PartitionerBase (sz) {}
+  
+  // --------------------------------------------------------------------------
+  // scheduling methods
+  // --------------------------------------------------------------------------
+  
+  /**
+  @private
+  */
+  template <typename F, 
+    std::enable_if_t<std::is_invocable_r_v<void, F, size_t, size_t>, void>* = nullptr
+  >
+  void loop(
+    size_t N, 
+    size_t W, 
+    std::atomic<size_t>& next, 
+    F&& func
+  ) const {
+
+    size_t chunk_size = (_chunk_size == 0) ? size_t{1} : _chunk_size;
+
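+    // heuristic: while at least p1 = 2*W*(chunk_size+1) iterations remain,
+    // each claim takes a fraction p2 = 1/(2W) of the remaining range
+    // (coarse-grained); below p1, fall back to fixed-size chunks (fine-grained)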
+    size_t p1 = 2 * W * (chunk_size + 1);
+    float  p2 = 0.5f / static_cast<float>(W);
+    size_t curr_b = next.load(std::memory_order_relaxed);
+
+    while(curr_b < N) {
+
+      size_t r = N - curr_b;
+
+      // fine-grained
+      if(r < p1) {
+        while(1) {
+          curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed);
+          if(curr_b >= N) {
+            return;
+          }
+          func(curr_b, std::min(curr_b + chunk_size, N));
+        }
+        break;
+      }
+      // coarse-grained
+      else {
+        size_t q = static_cast<size_t>(p2 * r);
+        if(q < chunk_size) {
+          q = chunk_size;
+        }
+        //size_t curr_e = (q <= r) ? curr_b + q : N;
+        size_t curr_e = std::min(curr_b + q, N);
+        if(next.compare_exchange_strong(curr_b, curr_e, std::memory_order_relaxed,
+                                                        std::memory_order_relaxed)) {
+          func(curr_b, curr_e);
+          curr_b = next.load(std::memory_order_relaxed);
+        }
+      }
+    }
+  }
+  
+  /**
+  @private
+  */
+  template <typename F, 
+    std::enable_if_t<std::is_invocable_r_v<bool, F, size_t, size_t>, void>* = nullptr
+  >
+  void loop_until(
+    size_t N, 
+    size_t W, 
+    std::atomic<size_t>& next, 
+    F&& func
+  ) const {
+
+    size_t chunk_size = (_chunk_size == 0) ? size_t{1} : _chunk_size;
+
+    size_t p1 = 2 * W * (chunk_size + 1);
+    float  p2 = 0.5f / static_cast<float>(W);
+    size_t curr_b = next.load(std::memory_order_relaxed);
+
+    while(curr_b < N) {
+
+      size_t r = N - curr_b;
+
+      // fine-grained
+      if(r < p1) {
+        while(1) {
+          curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed);
+          if(curr_b >= N) {
+            return;
+          }
+          if(func(curr_b, std::min(curr_b + chunk_size, N))) {
+            return;
+          }
+        }
+        break;
+      }
+      // coarse-grained
+      else {
+        size_t q = static_cast<size_t>(p2 * r);
+        if(q < chunk_size) {
+          q = chunk_size;
+        }
+        //size_t curr_e = (q <= r) ? curr_b + q : N;
+        size_t curr_e = std::min(curr_b + q, N);
+        if(next.compare_exchange_strong(curr_b, curr_e, std::memory_order_relaxed,
+                                                        std::memory_order_relaxed)) {
+          if(func(curr_b, curr_e)) {
+            return;
+          }
+          curr_b = next.load(std::memory_order_relaxed);
+        }
+      }
+    }
+  }
+};
+
+// ----------------------------------------------------------------------------
+// Dynamic Partitioner
+// ----------------------------------------------------------------------------
+
+/**
+@class DynamicPartitioner
+
+@brief class to construct a dynamic partitioner for scheduling parallel algorithms
+
+The partitioner splits iterations into many partitions, each of a size equal to
+the given chunk size.
+Different partitions are distributed dynamically to workers 
+without any specific order.
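+
+A sketch of using this partitioner with a fixed chunk size
+(a tf::Taskflow object @c taskflow and a std::vector<int> @c data are assumed):
+
+@code{.cpp}
+// chunks of 8 iterations are handed out to workers on a first-come basis
+taskflow.for_each(data.begin(), data.end(), [](int& i){ ++i; }, tf::DynamicPartitioner(8));
+@endcode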
+*/
+class DynamicPartitioner : public PartitionerBase {
+
+  public:
+
+  /**
+  @brief default constructor
+  */
+  DynamicPartitioner() : PartitionerBase{1} {};
+  
+  /**
+  @brief construct a dynamic partitioner with the given chunk size
+  */
+  explicit DynamicPartitioner(size_t sz) : PartitionerBase (sz) {}
+  
+  // --------------------------------------------------------------------------
+  // scheduling methods
+  // --------------------------------------------------------------------------
+  
+  /**
+  @private
+  */
+  template <typename F, 
+    std::enable_if_t<std::is_invocable_r_v<void, F, size_t, size_t>, void>* = nullptr
+  >
+  void loop(
+    size_t N, 
+    size_t, 
+    std::atomic<size_t>& next, 
+    F&& func
+  ) const {
+
+    size_t chunk_size = (_chunk_size == 0) ? size_t{1} : _chunk_size;
+    size_t curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed);
+
+    while(curr_b < N) {
+      func(curr_b, std::min(curr_b + chunk_size, N));
+      curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed);
+    }
+  }
+  
+  /**
+  @private
+  */
+  template <typename F, 
+    std::enable_if_t<std::is_invocable_r_v<bool, F, size_t, size_t>, void>* = nullptr
+  >
+  void loop_until(
+    size_t N, 
+    size_t, 
+    std::atomic<size_t>& next, 
+    F&& func
+  ) const {
+
+    size_t chunk_size = (_chunk_size == 0) ? size_t{1} : _chunk_size;
+    size_t curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed);
+
+    while(curr_b < N) {
+      if(func(curr_b, std::min(curr_b + chunk_size, N))) {
+        return;
+      }
+      curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed);
+    }
+  }
+};
+
+// ----------------------------------------------------------------------------
+// Static Partitioner
+// ----------------------------------------------------------------------------
+
+/**
+@class StaticPartitioner
+
+@brief class to construct a static partitioner for scheduling parallel algorithms
+
+The partitioner divides iterations into chunks and distributes chunks 
+to workers in order.
+If the chunk size is not specified (default @c 0), the partitioner falls back to
+a chunk size that evenly distributes iterations among workers.
+
+@code{.cpp}
+std::vector<int> data = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+taskflow.for_each(
+  data.begin(), data.end(), [](int i){}, StaticPartitioner(0)
+);
+executor.run(taskflow).wait();
+@endcode
+*/
+class StaticPartitioner : public PartitionerBase {
+
+  public:
+
+  /**
+  @brief default constructor
+  */
+  StaticPartitioner() : PartitionerBase{0} {};
+  
+  /**
+  @brief construct a static partitioner with the given chunk size
+  */
+  explicit StaticPartitioner(size_t sz) : PartitionerBase(sz) {}
+  
+  /**
+  @brief queries the adjusted chunk size
+  
+  Returns the given chunk size if it is not zero, or returns
+  <tt>N/W + (w < N%W)</tt>, where @c N is the number of iterations,
+  @c W is the number of workers, and @c w is the worker ID.
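+  For example, with <tt>N=10</tt>, <tt>W=4</tt>, and the default chunk size of
+  @c 0, workers 0 and 1 each receive 3 iterations while workers 2 and 3 each
+  receive 2.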
+  */
+  size_t adjusted_chunk_size(size_t N, size_t W, size_t w) const {
+    return _chunk_size ? _chunk_size : N/W + (w < N%W);
+  }
+  
+  // --------------------------------------------------------------------------
+  // scheduling methods
+  // --------------------------------------------------------------------------
+
+  /**
+  @private
+  */
+  template <typename F, 
+    std::enable_if_t<std::is_invocable_r_v<void, F, size_t, size_t>, void>* = nullptr
+  >
+  void loop(
+    size_t N, 
+    size_t W, 
+    size_t curr_b, 
+    size_t chunk_size,
+    F&& func
+  ) {
+    size_t stride = W * chunk_size;
+    while(curr_b < N) {
+      size_t curr_e = std::min(curr_b + chunk_size, N);
+      func(curr_b, curr_e);
+      curr_b += stride;
+    }
+  }
+  
+  /**
+  @private
+  */
+  template <typename F, 
+    std::enable_if_t<std::is_invocable_r_v<bool, F, size_t, size_t>, void>* = nullptr
+  >
+  void loop_until(
+    size_t N, 
+    size_t W, 
+    size_t curr_b, 
+    size_t chunk_size,
+    F&& func
+  ) {
+    size_t stride = W * chunk_size;
+    while(curr_b < N) {
+      size_t curr_e = std::min(curr_b + chunk_size, N);
+      if(func(curr_b, curr_e)) {
+        return;
+      }
+      curr_b += stride;
+    }
+  }
+};
+
+// ----------------------------------------------------------------------------
+// RandomPartitioner
+// ----------------------------------------------------------------------------
+
+/**
+@class RandomPartitioner
+
+@brief class to construct a random partitioner for scheduling parallel algorithms
+
+Similar to tf::DynamicPartitioner,
+the partitioner splits iterations into many partitions, each with a random
+chunk size in the range <tt>c = [alpha * N * W, beta * N * W]</tt>.
+By default, @c alpha is <tt>0.01</tt> and @c beta is <tt>0.5</tt>.
+
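+A sketch of constructing random partitioners
+(a tf::Taskflow object @c taskflow and a std::vector<int> @c data are assumed):
+
+@code{.cpp}
+// chunk sizes drawn from the default range [0.01*N*W, 0.5*N*W]
+taskflow.for_each(data.begin(), data.end(), [](int& i){ ++i; }, tf::RandomPartitioner());
+
+// chunk sizes drawn from [0.1*N*W, 0.2*N*W]
+taskflow.for_each(data.begin(), data.end(), [](int& i){ ++i; }, tf::RandomPartitioner(0.1f, 0.2f));
+@endcode
+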
+*/
+class RandomPartitioner : public PartitionerBase {
+
+  public:
+
+  /**
+  @brief default constructor
+  */
+  RandomPartitioner() = default;
+  
+  /**
+  @brief constructs a random partitioner 
+  */
+  RandomPartitioner(size_t cz) : PartitionerBase(cz) {}
+  
+  /**
+  @brief constructs a random partitioner with the given parameters
+  */
+  RandomPartitioner(float alpha, float beta) : _alpha {alpha}, _beta {beta} {}
+
+  /**
+  @brief queries the @c alpha value
+  */
+  float alpha() const { return _alpha; }
+  
+  /**
+  @brief queries the @c beta value
+  */
+  float beta() const { return _beta; }
+  
+  /**
+  @brief queries the range of chunk size
+  
+  @param N number of iterations
+  @param W number of workers
+  */
+  std::pair<size_t, size_t> chunk_size_range(size_t N, size_t W) const {
+    
+    size_t b1 = static_cast<size_t>(_alpha * N * W);
+    size_t b2 = static_cast<size_t>(_beta  * N * W);
+
+    if(b1 > b2) {
+      std::swap(b1, b2);
+    }
+
+    b1 = std::max(b1, size_t{1});
+    b2 = std::max(b2, b1 + 1);
+
+    return {b1, b2};
+  }
+
+  // --------------------------------------------------------------------------
+  // scheduling methods
+  // --------------------------------------------------------------------------
+  
+  /**
+  @private
+  */
+  template <typename F, 
+    std::enable_if_t<std::is_invocable_r_v<void, F, size_t, size_t>, void>* = nullptr
+  >
+  void loop(
+    size_t N, 
+    size_t W, 
+    std::atomic<size_t>& next, 
+    F&& func
+  ) const {
+
+    auto [b1, b2] = chunk_size_range(N, W); 
+    
+    std::default_random_engine engine {std::random_device{}()};
+    std::uniform_int_distribution<size_t> dist(b1, b2);
+    
+    size_t chunk_size = dist(engine);
+    size_t curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed);
+
+    while(curr_b < N) {
+      func(curr_b, std::min(curr_b + chunk_size, N));
+      chunk_size = dist(engine);
+      curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed);
+    }
+  }
+
+  /**
+  @private
+  */
+  template <typename F, 
+    std::enable_if_t<std::is_invocable_r_v<bool, F, size_t, size_t>, void>* = nullptr
+  >
+  void loop_until(
+    size_t N, 
+    size_t W, 
+    std::atomic<size_t>& next, 
+    F&& func
+  ) const {
+
+    auto [b1, b2] = chunk_size_range(N, W); 
+    
+    std::default_random_engine engine {std::random_device{}()};
+    std::uniform_int_distribution<size_t> dist(b1, b2);
+    
+    size_t chunk_size = dist(engine);
+    size_t curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed);
+
+    while(curr_b < N) {
+      if(func(curr_b, std::min(curr_b + chunk_size, N))){
+        return;
+      }
+      chunk_size = dist(engine);
+      curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed);
+    }
+  }
+
+  private:
+
+  float _alpha {0.01f};
+  float _beta  {0.5f};
+
+};
+
+/**
+@brief default partitioner set to tf::GuidedPartitioner
+
+Guided partitioner can achieve decent performance for most parallel algorithms,
+especially for those with an irregular and unbalanced workload per iteration.
+*/
+using DefaultPartitioner = GuidedPartitioner;
+
+/**
+@brief determines if a type is a partitioner 
+
+A partitioner is a derived type from tf::PartitionerBase.
+*/
+template <typename C>
+inline constexpr bool is_partitioner_v = std::is_base_of<PartitionerBase, C>::value;
+
+}  // end of namespace tf -----------------------------------------------------
+
+
+
diff --git a/myxpcs/include/taskflow_/algorithm/pipeline.hpp b/myxpcs/include/taskflow_/algorithm/pipeline.hpp
new file mode 100644
index 0000000..5442d56
--- /dev/null
+++ b/myxpcs/include/taskflow_/algorithm/pipeline.hpp
@@ -0,0 +1,1663 @@
+#pragma once
+
+#include "../taskflow.hpp"
+
+/**
+@file pipeline.hpp
+@brief pipeline include file
+*/
+
+namespace tf {
+
+
+// ----------------------------------------------------------------------------
+// Structure Definition: DeferredPipeflow
+// ----------------------------------------------------------------------------
+// For example: 
+// 12.defer(7); 12.defer(16);
+//        _____
+//       |     |
+//       v     |
+// 7    12    16
+// |     ^
+// |____ |
+//
+// DeferredPipeflow dpf of 12 :
+// dpf._token = 12;
+// dpf._num_deferrals = 1;
+// dpf._dependents = std::unordered_set<size_t>{7, 16};
+//
+/** @private */
+class DeferredPipeflow {
+
+  template <typename... Ps>
+  friend class Pipeline;
+  
+  template <typename P>
+  friend class ScalablePipeline;
+  
+  public:
+  
+    DeferredPipeflow() = default;
+    DeferredPipeflow(const DeferredPipeflow&) = delete;
+    DeferredPipeflow(DeferredPipeflow&&) = delete;
+  
+    DeferredPipeflow(size_t t, size_t n, std::unordered_set<size_t>&& dep) : 
+      _token{t}, _num_deferrals{n}, _dependents{std::move(dep)} {
+    }
+  
+    DeferredPipeflow& operator = (const DeferredPipeflow&) = delete;
+    DeferredPipeflow& operator = (DeferredPipeflow&&) = delete;
+  
+  private:
+  
+    // token id
+    size_t _token;
+  
+    // number of deferrals
+    size_t _num_deferrals;  
+  
+    // dependents
+    // For example,
+    // 12.defer(7); 12.defer(16)
+    // _dependents = {7, 16}
+    std::unordered_set<size_t> _dependents;
+};
+
+
+
+// ----------------------------------------------------------------------------
+// Class Definition: Pipeflow
+// ----------------------------------------------------------------------------
+
+/**
+@class Pipeflow
+
+@brief class to create a pipeflow object used by the pipe callable
+
+Pipeflow represents a <i>scheduling token</i> in the pipeline scheduling
+framework. A pipeflow is created by the pipeline scheduler at runtime to
+pass to the pipe callable. Users can query the present statistics
+of that scheduling token, including the line identifier, pipe identifier,
+and token identifier, and build their application algorithms based on
+these statistics.
+At the first stage, users can explicitly call the stop method
+to stop the pipeline scheduler.
+
+@code{.cpp}
+tf::Pipe{tf::PipeType::SERIAL, [](tf::Pipeflow& pf){
+  std::cout << "token id=" << pf.token()
+            << " at line=" << pf.line()
+            << " at pipe=" << pf.pipe()
+            << '\n';
+}};
+@endcode
+
+Pipeflow can only be created privately by the tf::Pipeline and
+be used through the pipe callable.
+*/
+class Pipeflow {
+
+  template <typename... Ps>
+  friend class Pipeline;
+
+  template <typename P>
+  friend class ScalablePipeline;
+
+  template <typename... Ps>
+  friend class DataPipeline;
+
+  public:
+
+  /**
+  @brief default constructor
+  */
+  Pipeflow() = default;
+
+  /**
+  @brief queries the line identifier of the present token
+  */
+  size_t line() const {
+    return _line;
+  }
+
+  /**
+  @brief queries the pipe identifier of the present token
+  */
+  size_t pipe() const {
+    return _pipe;
+  }
+
+  /**
+  @brief queries the token identifier
+  */
+  size_t token() const {
+    return _token;
+  }
+
+  /**
+  @brief stops the pipeline scheduling
+
+  Only the first pipe can call this method to stop the pipeline.
+  Calling stop from any other pipe will throw an exception.
+  */
+  void stop() {
+    if(_pipe != 0) {
+      TF_THROW("only the first pipe can stop the token");
+    }
+    _stop = true;
+  }
+
+  /**
+  @brief queries the number of deferrals
+  */
+  size_t num_deferrals() const {
+    return _num_deferrals;
+  }
+
+  /**
+  @brief defers the current scheduling token until the given token has finished
+
+  Only the first pipe can call this method to defer the current
+  scheduling token to the given token.
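+
+  For example, the following first pipe defers token 2 until token 5 has
+  finished (a sketch; the deferral is requested only on the first visit):
+
+  @code{.cpp}
+  tf::Pipe{tf::PipeType::SERIAL, [](tf::Pipeflow& pf){
+    if(pf.token() == 2 && pf.num_deferrals() == 0) {
+      pf.defer(5);  // token 2 resumes only after token 5 has completed
+    }
+  }};
+  @endcode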
+  */
+  void defer(size_t token) {
+    if(_pipe != 0) {
+      TF_THROW("only the first pipe can defer the current scheduling token");
+    }
+    _dependents.insert(token);
+  }
+  
+  private:
+
+  // Regular data
+  size_t _line;
+  size_t _pipe;
+  size_t _token;
+  bool   _stop;
+  
+  // Data field for token dependencies
+  size_t _num_deferrals; 
+  std::unordered_set<size_t> _dependents; 
+
+};
+
+// ----------------------------------------------------------------------------
+// Class Definition: PipeType
+// ----------------------------------------------------------------------------
+
+/**
+@enum PipeType
+
+@brief enumeration of all pipe types
+*/
+enum class PipeType : int {
+  /** @brief parallel type */
+  PARALLEL = 1,
+  /** @brief serial type */
+  SERIAL   = 2
+};
+
+// ----------------------------------------------------------------------------
+// Class Definition: Pipe
+// ----------------------------------------------------------------------------
+
+/**
+@class Pipe
+
+@brief class to create a pipe object for a pipeline stage
+
+@tparam C callable type
+
+A pipe represents a stage of a pipeline. A pipe can run in either
+a @em parallel or a @em serial direction (specified by tf::PipeType)
+and is coupled with a callable invoked by the pipeline scheduler.
+The callable must take a referenced tf::Pipeflow object in the first argument:
+
+@code{.cpp}
+Pipe{PipeType::SERIAL, [](tf::Pipeflow&){}}
+@endcode
+
+The pipeflow object is used to query the statistics of a scheduling token
+in the pipeline, such as pipe, line, and token numbers.
+*/
+template <typename C = std::function<void(tf::Pipeflow&)>>
+class Pipe {
+
+  template <typename... Ps>
+  friend class Pipeline;
+
+  template <typename P>
+  friend class ScalablePipeline;
+
+  public:
+
+  /**
+  @brief alias of the callable type
+  */
+  using callable_t = C;
+
+  /**
+  @brief default constructor
+  */
+  Pipe() = default;
+
+  /**
+  @brief constructs the pipe object
+
+  @param d pipe type (tf::PipeType)
+  @param callable callable type
+
+  The constructor constructs a pipe with the given direction
+  (tf::PipeType::SERIAL or tf::PipeType::PARALLEL) and the given callable. 
+  The callable must take a referenced tf::Pipeflow object in the first argument.
+
+  @code{.cpp}
+  Pipe{PipeType::SERIAL, [](tf::Pipeflow&){}}
+  @endcode
+
+  When creating a pipeline, the direction of the first pipe must be serial
+  (tf::PipeType::SERIAL).
+  */
+  Pipe(PipeType d, C&& callable) :
+    _type{d}, _callable{std::forward<C>(callable)} {
+  }
+
+  /**
+  @brief queries the type of the pipe
+
+  Returns the type of the callable.
+  */
+  PipeType type() const {
+    return _type;
+  }
+
+  /**
+  @brief assigns a new type to the pipe
+
+  @param type a tf::PipeType variable
+  */
+  void type(PipeType type) {
+    _type = type;
+  }
+
+  /**
+  @brief assigns a new callable to the pipe
+
+  @tparam U callable type
+  @param callable a callable object constructible from std::function<void(tf::Pipeflow&)>
+
+  Assigns a new callable to the pipe with universal forwarding.
+  */
+  template <typename U>
+  void callable(U&& callable) {
+    _callable = std::forward<U>(callable);
+  }
+
+  private:
+
+  PipeType _type;
+
+  C _callable;
+};
+
+// ----------------------------------------------------------------------------
+// Class Definition: Pipeline
+// ----------------------------------------------------------------------------
+
+/**
+@class Pipeline
+
+@brief class to create a pipeline scheduling framework
+
+@tparam Ps pipe types
+
+A pipeline is a composable graph object for users to create a
+<i>pipeline scheduling framework</i> using a module task in a taskflow.
+Unlike conventional pipeline programming frameworks (e.g., Intel TBB),
+%Taskflow's pipeline algorithm does not provide any data abstraction,
+which can otherwise restrict users from optimizing data layouts in their applications;
+instead, it provides a flexible framework for users to customize their application data
+atop our pipeline scheduling.
+The following code creates a pipeline of four parallel lines to schedule
+tokens through three serial pipes:
+
+@code{.cpp}
+tf::Taskflow taskflow;
+tf::Executor executor;
+
+const size_t num_lines = 4;
+const size_t num_pipes = 3;
+
+// create a custom data buffer
+std::array<std::array<int, num_pipes>, num_lines> buffer;
+
+// create a pipeline graph of four concurrent lines and three serial pipes
+tf::Pipeline pipeline(num_lines,
+  // first pipe must define a serial direction
+  tf::Pipe{tf::PipeType::SERIAL, [&buffer](tf::Pipeflow& pf) {
+    // generate only 5 scheduling tokens
+    if(pf.token() == 5) {
+      pf.stop();
+    }
+    // save the token id into the buffer
+    else {
+      buffer[pf.line()][pf.pipe()] = pf.token();
+    }
+  }},
+  tf::Pipe{tf::PipeType::SERIAL, [&buffer] (tf::Pipeflow& pf) {
+    // propagate the previous result to this pipe by adding one
+    buffer[pf.line()][pf.pipe()] = buffer[pf.line()][pf.pipe()-1] + 1;
+  }},
+  tf::Pipe{tf::PipeType::SERIAL, [&buffer](tf::Pipeflow& pf){
+    // propagate the previous result to this pipe by adding one
+    buffer[pf.line()][pf.pipe()] = buffer[pf.line()][pf.pipe()-1] + 1;
+  }}
+);
+
+// build the pipeline graph using composition
+tf::Task init = taskflow.emplace([](){ std::cout << "ready\n"; })
+                        .name("starting pipeline");
+tf::Task task = taskflow.composed_of(pipeline)
+                        .name("pipeline");
+tf::Task stop = taskflow.emplace([](){ std::cout << "stopped\n"; })
+                        .name("pipeline stopped");
+
+// create task dependency
+init.precede(task);
+task.precede(stop);
+
+// run the pipeline
+executor.run(taskflow).wait();
+@endcode
+
+The above example creates a pipeline graph that schedules five tokens over
+four parallel lines in a circular fashion, as depicted below:
+
+@code{.shell-session}
+o -> o -> o
+|    |    |
+v    v    v
+o -> o -> o
+|    |    |
+v    v    v
+o -> o -> o
+|    |    |
+v    v    v
+o -> o -> o
+@endcode
+
+At each pipe stage, the program propagates the result to the next pipe
+by adding one to the result stored in a custom data storage, @c buffer.
+The pipeline scheduler will generate five scheduling tokens and then stop.
+
+Internally, tf::Pipeline uses std::tuple to store the given sequence of pipes.
+The definition of each pipe can be different, completely decided by the compiler
+to optimize the object layout.
+After a pipeline is constructed, it is not possible to change its pipes.
+If applications need to change these pipes, please use tf::ScalablePipeline.
+*/
+template <typename... Ps>
+class Pipeline {
+
+  static_assert(sizeof...(Ps)>0, "must have at least one pipe");
+
+  /**
+  @private
+  */
+  struct Line {
+    std::atomic<size_t> join_counter;
+  };
+
+  /**
+  @private
+  */
+  struct PipeMeta {
+    PipeType type;
+  };
+
+  public:
+
+  /**
+  @brief constructs a pipeline object
+
+  @param num_lines the number of parallel lines
+  @param ps a list of pipes
+
+  Constructs a pipeline of up to @c num_lines parallel lines to schedule
+  tokens through the given linear chain of pipes.
+  The first pipe must define a serial direction (tf::PipeType::SERIAL)
+  or an exception will be thrown.
+  */
+  Pipeline(size_t num_lines, Ps&&... ps);
+
+  /**
+  @brief constructs a pipeline object
+
+  @param num_lines the number of parallel lines
+  @param ps a tuple of pipes
+
+  Constructs a pipeline of up to @c num_lines parallel lines to schedule
+  tokens through the given linear chain of pipes.
+  The first pipe must define a serial direction (tf::PipeType::SERIAL)
+  or an exception will be thrown.
+  */
+  Pipeline(size_t num_lines, std::tuple<Ps...>&& ps);
+
+  /**
+  @brief queries the number of parallel lines
+
+  The function returns the number of parallel lines given by the user
+  upon the construction of the pipeline.
+  The number of lines represents the maximum parallelism this pipeline
+  can achieve.
+  */
+  size_t num_lines() const noexcept;
+
+  /**
+  @brief queries the number of pipes
+
+  The function returns the number of pipes given by the user
+  upon the construction of the pipeline.
+  */
+  constexpr size_t num_pipes() const noexcept;
+
+  /**
+  @brief resets the pipeline
+
+  Resets the pipeline to the initial state. After resetting a pipeline,
+  its token identifier will start from zero as if the pipeline was just
+  constructed.
+  */
+  void reset();
+
+  /**
+  @brief queries the number of generated tokens in the pipeline
+
+  The number represents the total number of scheduling tokens that have been
+  generated by the pipeline so far.
+  */
+  size_t num_tokens() const noexcept;
+
+  /**
+  @brief obtains the graph object associated with the pipeline construct
+
+  This method is primarily used as an opaque data structure for creating
+  a module task of this pipeline.
+  */
+  Graph& graph();
+
+
+  private:
+
+  Graph _graph;
+
+  size_t _num_tokens;
+
+  std::tuple<Ps...> _pipes;
+  std::array<PipeMeta, sizeof...(Ps)> _meta;
+  std::vector<std::array<Line, sizeof...(Ps)>> _lines;
+  std::vector<Task> _tasks;
+  std::vector<Pipeflow> _pipeflows;
+  
+  // queue of ready tokens (paired with their deferral times)
+  // For example,
+  // when 12 does not have any dependents,
+  // we put 12 in _ready_tokens queue
+  // Assume num_deferrals of 12 is 1,
+  // we push pair{12, 1} in the queue 
+  std::queue<std::pair<size_t, size_t>> _ready_tokens;
+
+  // unordered_map of token dependencies
+  // For example,
+  // 12.defer(16); 13.defer(16);
+  // _token_dependencies has the following entry
+  // {key: 16, value: std::vector{12, 13}}.
+  std::unordered_map<size_t, std::vector<size_t>> _token_dependencies;
+  
+  // unordered_map of deferred tokens
+  // For example,
+  // 12.defer(16); 13.defer(16);
+  // _deferred_tokens has the following two entries
+  // {key: 12, DeferredPipeflow of 12} and
+  // {key: 13, DeferredPipeflow of 13}
+  std::unordered_map<size_t, DeferredPipeflow> _deferred_tokens;
+  
+  // variable to keep track of the longest deferred tokens
+  // For example,
+  // 2.defer(16)
+  // 5.defer(19)
+  // 5.defer(17),
+  // _longest_deferral will be 19 - after token 19 the pipeline
+  // has almost zero cost on handling deferred pipeflow
+  size_t _longest_deferral = 0;  
+  
+  template <size_t... I>
+  auto _gen_meta(std::tuple<Ps...>&&, std::index_sequence<I...>);
+
+  void _on_pipe(Pipeflow&, Runtime&);
+  void _build();
+  void _check_dependents(Pipeflow&);
+  void _construct_deferred_tokens(Pipeflow&);
+  void _resolve_token_dependencies(Pipeflow&); 
+};
+
+// constructor
+template <typename... Ps>
+Pipeline<Ps...>::Pipeline(size_t num_lines, Ps&&... ps) :
+  _pipes     {std::make_tuple(std::forward<Ps>(ps)...)},
+  _meta      {PipeMeta{ps.type()}...},
+  _lines     (num_lines),
+  _tasks     (num_lines + 1),
+  _pipeflows (num_lines) {
+
+  if(num_lines == 0) {
+    TF_THROW("must have at least one line");
+  }
+
+  if(std::get<0>(_pipes).type() != PipeType::SERIAL) {
+    TF_THROW("first pipe must be serial");
+  }
+
+  reset();
+  _build();
+}
+
+// constructor
+template <typename... Ps>
+Pipeline<Ps...>::Pipeline(size_t num_lines, std::tuple<Ps...>&& ps) :
+  _pipes     {std::forward<std::tuple<Ps...>>(ps)},
+  _meta      {_gen_meta(
+    std::forward<std::tuple<Ps...>>(ps), std::make_index_sequence<sizeof...(Ps)>{}
+  )},
+  _lines     (num_lines),
+  _tasks     (num_lines + 1),
+  _pipeflows (num_lines) {
+
+  if(num_lines == 0) {
+    TF_THROW("must have at least one line");
+  }
+
+  if(std::get<0>(_pipes).type() != PipeType::SERIAL) {
+    TF_THROW("first pipe must be serial");
+  }
+
+  reset();
+  _build();
+}
+
+// Function: _gen_meta
+template <typename... Ps>
+template <size_t... I>
+auto Pipeline<Ps...>::_gen_meta(std::tuple<Ps...>&& ps, std::index_sequence<I...>) {
+  return std::array{PipeMeta{std::get<I>(ps).type()}...};
+}
+
+// Function: num_lines
+template <typename... Ps>
+size_t Pipeline<Ps...>::num_lines() const noexcept {
+  return _pipeflows.size();
+}
+
+// Function: num_pipes
+template <typename... Ps>
+constexpr size_t Pipeline<Ps...>::num_pipes() const noexcept {
+  return sizeof...(Ps);
+}
+
+// Function: num_tokens
+template <typename... Ps>
+size_t Pipeline<Ps...>::num_tokens() const noexcept {
+  return _num_tokens;
+}
+
+// Function: graph
+template <typename... Ps>
+Graph& Pipeline<Ps...>::graph() {
+  return _graph;
+}
+
+// Function: reset
+template <typename... Ps>
+void Pipeline<Ps...>::reset() {
+
+  _num_tokens = 0;
+
+  for(size_t l = 0; l<num_lines(); l++) {
+    _pipeflows[l]._pipe = 0;
+    _pipeflows[l]._line = l;
+    
+    _pipeflows[l]._num_deferrals = 0;
+    _pipeflows[l]._dependents.clear();
+  }
+  
+  assert(_ready_tokens.empty() == true);
+  _token_dependencies.clear();
+  _deferred_tokens.clear();
+
+  _lines[0][0].join_counter.store(0, std::memory_order_relaxed);
+
+  for(size_t l=1; l<num_lines(); l++) {
+    for(size_t f=1; f<num_pipes(); f++) {
+      _lines[l][f].join_counter.store(
+        static_cast<size_t>(_meta[f].type), std::memory_order_relaxed
+      );
+    }
+  }
+
+  for(size_t f=1; f<num_pipes(); f++) {
+    _lines[0][f].join_counter.store(1, std::memory_order_relaxed);
+  }
+
+  for(size_t l=1; l<num_lines(); l++) {
+    _lines[l][0].join_counter.store(
+      static_cast<size_t>(_meta[0].type) - 1, std::memory_order_relaxed
+    );
+  }
+}
+
+// Procedure: _on_pipe
+template <typename... Ps>
+void Pipeline<Ps...>::_on_pipe(Pipeflow& pf, Runtime& rt) {
+  visit_tuple([&](auto&& pipe){
+    using callable_t = typename std::decay_t<decltype(pipe)>::callable_t;
+    if constexpr (std::is_invocable_v<callable_t, Pipeflow&>) {
+      pipe._callable(pf);
+    }
+    else if constexpr(std::is_invocable_v<callable_t, Pipeflow&, Runtime&>) {
+      pipe._callable(pf, rt);
+    }
+    else {
+      static_assert(dependent_false_v<callable_t>, "un-supported pipe callable type");
+    }
+  }, _pipes, pf._pipe);
+}
+
+// Procedure: _check_dependents
+// Check and remove invalid dependents after on_pipe
+// For example, users may defer a pipeflow to multiple tokens,
+// and we need to remove invalid tokens.
+//   12.defer(7);   // valid only if 7 is deferred, or invalid otherwise
+//   12.defer(16);  // 16 is valid 
+template <typename... Ps>
+void Pipeline<Ps...>::_check_dependents(Pipeflow& pf) {
+  //if (pf._dependents.size()) {
+  ++pf._num_deferrals;
+  
+  for (auto it = pf._dependents.begin(); it != pf._dependents.end();) {
+ 
+    // valid (e.g., 12.defer(16)) 
+    if (*it >= _num_tokens) {
+      _token_dependencies[*it].push_back(pf._token);
+      _longest_deferral = std::max(_longest_deferral, *it);
+      ++it;
+    }
+    // valid or invalid (e.g., 12.defer(7))
+    else {
+      auto pit = _deferred_tokens.find(*it);
+      
+      // valid (e.g., 7 is deferred)
+      if (pit != _deferred_tokens.end()) {
+        _token_dependencies[*it].push_back(pf._token);
+        ++it;
+      }
+
+      // invalid (e.g., 7 has already finished, so 12.defer(7) is a no-op)
+      else {
+        it = pf._dependents.erase(it);
+      }
+    }
+  }
+}
+
+// Procedure: _construct_deferred_tokens
+// Construct a data structure for a deferred token
+// 
+// For example, 
+// 12.defer(7); 12.defer(16);
+// After _check_dependents, 12 needs to be deferred,
+// so we will construct a data structure for 12 using hashmap:
+// {key: 12, value: DeferredPipeflow of 12}
+template <typename... Ps>
+void Pipeline<Ps...>::_construct_deferred_tokens(Pipeflow& pf) {
+  
+  //auto res = _deferred_tokens.emplace(
+  //  pf._token, DeferredPipeflow{pf._token, pf._num_deferrals, std::move(pf._dependents)}
+  //);
+  
+  // construct the deferred pipeflow with zero copy
+  //auto res = _deferred_tokens.emplace(
+  _deferred_tokens.emplace(
+    std::piecewise_construct,
+    std::forward_as_tuple(pf._token),
+    std::forward_as_tuple(
+      pf._token, pf._num_deferrals, std::move(pf._dependents)
+    )
+  );
+
+  //assert(res.second == true);
+}
+
+// Procedure: _resolve_token_dependencies
+// Resolve dependencies for tokens that defer to current token
+// 
+// For example,
+// 12.defer(16);
+// 13.defer(16);
+// _token_dependencies will have the entry
+// {key: 16, value: std::vector{12, 13}} 
+//
+// When 16 finishes, we need to remove 16 from 12's and 13's
+// individual _dependents
+template <typename... Ps>
+void Pipeline<Ps...>::_resolve_token_dependencies(Pipeflow& pf) {
+
+  if (auto it = _token_dependencies.find(pf._token);
+      it != _token_dependencies.end()) {
+    
+    // iterate tokens that defer to pf._token
+    // (e.g., 12 and 13)
+    for(size_t target : it->second) {
+
+      auto dpf = _deferred_tokens.find(target);
+
+      assert(dpf != _deferred_tokens.end());
+
+      // erase pf._token from target's _dependents
+      // (e.g., remove 16 from 12's dependents)
+      dpf->second._dependents.erase(pf._token);
+      //  dpf->second._dependent_satellites[pf._token]
+      //);
+
+      // target has no dependents
+      if (dpf->second._dependents.empty()) {
+
+        // push target into _ready_tokens queue
+        _ready_tokens.emplace(dpf->second._token, dpf->second._num_deferrals);
+        //_ready_tokens.push(
+        //  std::make_pair(dpf->second._token, dpf->second._num_deferrals)
+        //);
+        
+        // erase target from _deferred_tokens
+        _deferred_tokens.erase(dpf);
+      }
+    }
+
+    // remove pf._token from _token_dependencies
+    // (e.g., remove the entry
+    // {key: 16, value: std::vector{12, 13}} from _token_dependencies)
+    _token_dependencies.erase(it);
+  }
+}
+
+// Procedure: _build
+template <typename... Ps>
+void Pipeline<Ps...>::_build() {
+
+  using namespace std::literals::string_literals;
+
+  FlowBuilder fb(_graph);
+
+  // init task
+  _tasks[0] = fb.emplace([this]() {
+    return static_cast<int>(_num_tokens % num_lines());
+  }).name("cond");
+
+  // line task
+  for(size_t l = 0; l < num_lines(); l++) {
+
+    _tasks[l + 1] = fb.emplace([this, l] (tf::Runtime& rt) mutable {
+
+      auto pf = &_pipeflows[l];
+
+      pipeline:
+
+      _lines[pf->_line][pf->_pipe].join_counter.store(
+        static_cast<size_t>(_meta[pf->_pipe].type), std::memory_order_relaxed
+      );
+      
+      // First pipe does all jobs of initialization and token dependencies
+      if (pf->_pipe == 0) {
+        // _ready_tokens queue is not empty
+        // substitute pf with the token at the front of the queue
+        if (!_ready_tokens.empty()) {
+          pf->_token = _ready_tokens.front().first;
+          pf->_num_deferrals = _ready_tokens.front().second;
+          _ready_tokens.pop();
+        }
+        else {
+          pf->_token = _num_tokens;
+          pf->_num_deferrals = 0;
+        }
+      
+      handle_token_dependency: 
+
+        if (pf->_stop = false, _on_pipe(*pf, rt); pf->_stop == true) {
+          // here, the pipeline is not stopped yet because other
+          // lines of tasks may still be running their last stages
+          return;
+        }
+        
+        if (_num_tokens == pf->_token) {
+          ++_num_tokens;
+        }
+      
+        if (pf->_dependents.empty() == false){ 
+          // check if the pf->_dependents have valid dependents
+          _check_dependents(*pf); 
+          
+          // tokens in pf->_dependents are all valid dependents 
+          if (pf->_dependents.size()) {
+            
+            // construct a data structure for pf in _deferred_tokens 
+            _construct_deferred_tokens(*pf);
+            goto pipeline;
+          }
+
+          // tokens in pf->_dependents are invalid dependents
+          // directly goto on_pipe on the same line
+          else {
+            goto handle_token_dependency;
+          }
+        }
+        
+        // Every token within the deferral range needs to check
+        // if it can resolve dependencies on other tokens.
+        if (pf->_token <= _longest_deferral) {
+          _resolve_token_dependencies(*pf); 
+        }
+      }
+      else {
+        _on_pipe(*pf, rt);
+      }
+
+      size_t c_f = pf->_pipe;
+      size_t n_f = (pf->_pipe + 1) % num_pipes();
+      size_t n_l = (pf->_line + 1) % num_lines();
+
+      pf->_pipe = n_f;
+
+      // ---- scheduling starts here ----
+      // Notice that the shared variable f must not be changed after this
+      // point because it can result in data race due to the following
+      // condition:
+      //
+      // a -> b
+      // |    |
+      // v    v
+      // c -> d
+      //
+      // d will be spawned by either c or b, so if c changes f but b spawns d
+      // then data race on f will happen
+
+      std::array<int, 2> retval;
+      size_t n = 0;
+
+      // downward dependency
+      if(_meta[c_f].type == PipeType::SERIAL &&
+         _lines[n_l][c_f].join_counter.fetch_sub(
+           1, std::memory_order_acq_rel) == 1
+        ) {
+        retval[n++] = 1;
+      }
+
+      // forward dependency
+      if(_lines[pf->_line][n_f].join_counter.fetch_sub(
+          1, std::memory_order_acq_rel) == 1
+        ) {
+        retval[n++] = 0;
+      }
+      
+      // notice that the task index starts from 1
+      switch(n) {
+        case 2: {
+          rt.schedule(_tasks[n_l+1]);
+          goto pipeline;
+        }
+        case 1: {
+          // downward dependency 
+          if (retval[0] == 1) {
+            pf = &_pipeflows[n_l];
+          }
+          // forward dependency
+          goto pipeline;
+        }
+      }
+    }).name("rt-"s + std::to_string(l));
+
+    _tasks[0].precede(_tasks[l+1]);
+  }
+}
+
+// ----------------------------------------------------------------------------
+// Class Definition: ScalablePipeline
+// ----------------------------------------------------------------------------
+
+/**
+@class ScalablePipeline
+
+@brief class to create a scalable pipeline object
+
+@tparam P type of the iterator to a range of pipes
+
+A scalable pipeline is a composable graph object for users to create a
+<i>pipeline scheduling framework</i> using a module task in a taskflow.
+Unlike tf::Pipeline that instantiates all pipes upon the construction time,
+tf::ScalablePipeline allows variable assignments of pipes using range iterators.
+Users can also reset a scalable pipeline to a different range of pipes
+between runs. The following code creates a scalable pipeline of four
+parallel lines to schedule tokens through three serial pipes in a custom storage,
+then resetting the pipeline to a new range of five serial pipes:
+
+@code{.cpp}
+tf::Taskflow taskflow("pipeline");
+tf::Executor executor;
+
+const size_t num_lines = 4;
+
+// create data storage
+std::array<int, num_lines> buffer;
+
+// define the pipe callable
+auto pipe_callable = [&buffer] (tf::Pipeflow& pf) mutable {
+  switch(pf.pipe()) {
+    // first stage generates only 5 scheduling tokens and saves the
+    // token number into the buffer.
+    case 0: {
+      if(pf.token() == 5) {
+        pf.stop();
+      }
+      else {
+        printf("stage 1: input token = %zu\n", pf.token());
+        buffer[pf.line()] = pf.token();
+      }
+      return;
+    }
+    break;
+
+    // other stages propagate the previous result to this pipe and
+    // increment it by one
+    default: {
+      printf(
+        "stage %zu: input buffer[%zu] = %d\n", pf.pipe(), pf.line(), buffer[pf.line()]
+      );
+      buffer[pf.line()] = buffer[pf.line()] + 1;
+    }
+    break;
+  }
+};
+
+// create a vector of three pipes
+std::vector< tf::Pipe<std::function<void(tf::Pipeflow&)>> > pipes;
+
+for(size_t i=0; i<3; i++) {
+  pipes.emplace_back(tf::PipeType::SERIAL, pipe_callable);
+}
+
+// create a pipeline of four parallel lines based on the given vector of pipes
+tf::ScalablePipeline pl(num_lines, pipes.begin(), pipes.end());
+
+// build the pipeline graph using composition
+tf::Task init = taskflow.emplace([](){ std::cout << "ready\n"; })
+                        .name("starting pipeline");
+tf::Task task = taskflow.composed_of(pl)
+                        .name("pipeline");
+tf::Task stop = taskflow.emplace([](){ std::cout << "stopped\n"; })
+                        .name("pipeline stopped");
+
+// create task dependency
+init.precede(task);
+task.precede(stop);
+
+// dump the pipeline graph structure (with composition)
+taskflow.dump(std::cout);
+
+// run the pipeline
+executor.run(taskflow).wait();
+
+// reset the pipeline to a new range of five pipes and starts from
+// the initial state (i.e., token counts from zero)
+for(size_t i=0; i<2; i++) {
+  pipes.emplace_back(tf::PipeType::SERIAL, pipe_callable);
+}
+pl.reset(pipes.begin(), pipes.end());
+
+executor.run(taskflow).wait();
+@endcode
+
+The above example creates a pipeline graph that schedules five tokens over
+four parallel lines in a circular fashion, first going through three serial pipes
+and then five serial pipes:
+
+@code{.shell-session}
+# initial construction of three serial pipes
+o -> o -> o
+|    |    |
+v    v    v
+o -> o -> o
+|    |    |
+v    v    v
+o -> o -> o
+|    |    |
+v    v    v
+o -> o -> o
+
+# resetting to a new range of five serial pipes
+o -> o -> o -> o -> o
+|    |    |    |    |
+v    v    v    v    v
+o -> o -> o -> o -> o
+|    |    |    |    |
+v    v    v    v    v
+o -> o -> o -> o -> o
+|    |    |    |    |
+v    v    v    v    v
+o -> o -> o -> o -> o
+@endcode
+
+Each pipe has the same type of `%tf::Pipe<%std::function<void(%tf::Pipeflow&)>>`
+and is kept in a vector that is amenable to change.
+We construct the scalable pipeline using two range iterators pointing to the
+beginning and the end of the vector.
+At each pipe stage, the program propagates the result to the next pipe
+by adding one to the result stored in a custom data storage, @c buffer.
+The pipeline scheduler will generate five scheduling tokens and then stop.
+
+A scalable pipeline is move-only.
+*/
+template <typename P>
+class ScalablePipeline {
+
+  /**
+  @private
+  */
+  struct Line {
+    std::atomic<size_t> join_counter;
+  };
+
+  public:
+
+  /**
+  @brief pipe type
+  */
+  using pipe_t = typename std::iterator_traits<P>::value_type;
+
+  /**
+  @brief default constructor
+  */
+  ScalablePipeline() = default;
+
+  /**
+  @brief constructs an empty scalable pipeline object
+
+  @param num_lines the number of parallel lines
+
+  An empty scalable pipeline does not have any pipes.
+  The pipeline needs to be reset to a valid range of pipes
+  before running.
+  */
+  ScalablePipeline(size_t num_lines);
+
+  /**
+  @brief constructs a scalable pipeline object
+
+  @param num_lines the number of parallel lines
+  @param first iterator to the beginning of the range
+  @param last iterator to the end of the range
+
+  Constructs a pipeline from the given range of pipes specified in
+  <tt>[first, last)</tt> using @c num_lines parallel lines.
+  The first pipe must define a serial direction (tf::PipeType::SERIAL)
+  or an exception will be thrown.
+
+  Internally, the scalable pipeline copies the iterators
+  from the specified range. Those pipe callables pointed to by
+  these iterators must remain valid during the execution of the pipeline.
+  */
+  ScalablePipeline(size_t num_lines, P first, P last);
+
+  /**
+  @brief disabled copy constructor
+  */
+  ScalablePipeline(const ScalablePipeline&) = delete;
+
+  /**
+  @brief move constructor
+
+  Constructs a pipeline from the given @c rhs using move semantics
+  (i.e. the data in @c rhs is moved into this pipeline).
+  After the move, @c rhs is in a state as if it had just been constructed.
+  The behavior is undefined if @c rhs is running during the move.
+  */
+  ScalablePipeline(ScalablePipeline&& rhs);
+
+  /**
+  @brief disabled copy assignment operator
+  */
+  ScalablePipeline& operator = (const ScalablePipeline&) = delete;
+
+  /**
+  @brief move assignment operator
+
+  Replaces the contents with those of @c rhs using move semantics
+  (i.e. the data in @c rhs is moved into this pipeline).
+  After the move, @c rhs is in a state as if it had just been constructed.
+  The behavior is undefined if @c rhs is running during the move.
+  */
+  ScalablePipeline& operator = (ScalablePipeline&& rhs);
+
+  /**
+  @brief queries the number of parallel lines
+
+  The function returns the number of parallel lines given by the user
+  upon the construction of the pipeline.
+  The number of lines represents the maximum parallelism this pipeline
+  can achieve.
+  */
+  size_t num_lines() const noexcept;
+
+  /**
+  @brief queries the number of pipes
+
+  The function returns the number of pipes given by the user
+  upon the construction of the pipeline.
+  */
+  size_t num_pipes() const noexcept;
+
+  /**
+  @brief resets the pipeline
+
+  Resets the pipeline to the initial state. After resetting a pipeline,
+  its token identifier will start from zero.
+  */
+  void reset();
+
+  /**
+  @brief resets the pipeline with a new range of pipes
+
+  @param first iterator to the beginning of the range
+  @param last iterator to the end of the range
+
+  The member function assigns the pipeline to a new range of pipes
+  specified in <tt>[first, last)</tt> and resets the pipeline to the
+  initial state. After resetting a pipeline, its token identifier will
+  start from zero.
+
+  Internally, the scalable pipeline copies the iterators
+  from the specified range. Those pipe callables pointed to by
+  these iterators must remain valid during the execution of the pipeline.
+  */
+  void reset(P first, P last);
+
+  /**
+  @brief resets the pipeline to a new line number and a
+         new range of pipes
+
+  @param num_lines number of parallel lines
+  @param first iterator to the beginning of the range
+  @param last iterator to the end of the range
+
+  The member function resets the pipeline to a new number of
+  parallel lines and a new range of pipes specified in
+  <tt>[first, last)</tt>, as if the pipeline is just constructed.
+  After resetting a pipeline, its token identifier will start from zero.
+
+  Internally, the scalable pipeline copies the iterators
+  from the specified range. Those pipe callables pointed to by
+  these iterators must remain valid during the execution of the pipeline.
+  */
+  void reset(size_t num_lines, P first, P last);
+
+  /**
+  @brief queries the number of generated tokens in the pipeline
+
+  The number represents the total number of scheduling tokens that have been
+  generated by the pipeline so far.
+  */
+  size_t num_tokens() const noexcept;
+
+  /**
+  @brief obtains the graph object associated with the pipeline construct
+
+  This method is primarily used as an opaque data structure for creating
+  a module task of this pipeline.
+  */
+  Graph& graph();
+
+  private:
+
+  Graph _graph;
+
+  size_t _num_tokens{0};
+
+  std::vector<P> _pipes;
+  std::vector<Task> _tasks;
+  std::vector<Pipeflow> _pipeflows;
+  std::unique_ptr<Line[]> _lines;
+
+  // data structures for token deferral (mirror those in tf::Pipeline)
+  std::queue<std::pair<size_t, size_t>> _ready_tokens;
+  std::unordered_map<size_t, std::vector<size_t>> _token_dependencies;
+  std::unordered_map<size_t, DeferredPipeflow> _deferred_tokens;
+  size_t _longest_deferral = 0;
+  
+  void _check_dependents(Pipeflow&);
+  void _construct_deferred_tokens(Pipeflow&);
+  void _resolve_token_dependencies(Pipeflow&);
+
+  void _on_pipe(Pipeflow&, Runtime&);
+  void _build();
+
+  Line& _line(size_t, size_t);
+};
+
+// constructor
+template <typename P>
+ScalablePipeline<P>::ScalablePipeline(size_t num_lines) :
+  _tasks     (num_lines + 1),
+  _pipeflows (num_lines) {
+
+  if(num_lines == 0) {
+    TF_THROW("must have at least one line");
+  }
+
+  _build();
+}
+
+// constructor
+template <typename P>
+ScalablePipeline<P>::ScalablePipeline(size_t num_lines, P first, P last) :
+  _tasks     (num_lines + 1),
+  _pipeflows (num_lines) {
+
+  if(num_lines == 0) {
+    TF_THROW("must have at least one line");
+  }
+
+  reset(first, last);
+  _build();
+}
+
+// move constructor
+template <typename P>
+ScalablePipeline<P>::ScalablePipeline(ScalablePipeline&& rhs) :
+  _graph              {std::move(rhs._graph)},
+  _num_tokens         {rhs._num_tokens},
+  _pipes              {std::move(rhs._pipes)},
+  _tasks              {std::move(rhs._tasks)},
+  _pipeflows          {std::move(rhs._pipeflows)},
+  _lines              {std::move(rhs._lines)},
+  _ready_tokens       {std::move(rhs._ready_tokens)},
+  _token_dependencies {std::move(rhs._token_dependencies)},
+  _deferred_tokens    {std::move(rhs._deferred_tokens)},
+  _longest_deferral   {rhs._longest_deferral}{
+
+  rhs._longest_deferral = 0;
+  rhs._num_tokens       = 0;
+}
+
+// move assignment operator
+template <typename P>
+ScalablePipeline<P>& ScalablePipeline<P>::operator = (ScalablePipeline&& rhs) {
+  _graph                = std::move(rhs._graph);
+  _num_tokens           = rhs._num_tokens;
+  _pipes                = std::move(rhs._pipes);
+  _tasks                = std::move(rhs._tasks);
+  _pipeflows            = std::move(rhs._pipeflows);
+  _lines                = std::move(rhs._lines);
+  rhs._num_tokens       = 0;
+  _ready_tokens         = std::move(rhs._ready_tokens);
+  _token_dependencies   = std::move(rhs._token_dependencies);
+  _deferred_tokens      = std::move(rhs._deferred_tokens);
+  _longest_deferral     = rhs._longest_deferral;
+  rhs._longest_deferral = 0;
+  return *this;
+}
+
+// Function: num_lines
+template <typename P>
+size_t ScalablePipeline<P>::num_lines() const noexcept {
+  return _pipeflows.size();
+}
+
+// Function: num_pipes
+template <typename P>
+size_t ScalablePipeline<P>::num_pipes() const noexcept {
+  return _pipes.size();
+}
+
+// Function: num_tokens
+template <typename P>
+size_t ScalablePipeline<P>::num_tokens() const noexcept {
+  return _num_tokens;
+}
+
+// Function: graph
+template <typename P>
+Graph& ScalablePipeline<P>::graph() {
+  return _graph;
+}
+
+// Function: _line
+template <typename P>
+typename ScalablePipeline<P>::Line& ScalablePipeline<P>::_line(size_t l, size_t p) {
+  return _lines[l*num_pipes() + p];
+}
+
+template <typename P>
+void ScalablePipeline<P>::reset(size_t num_lines, P first, P last) {
+
+  if(num_lines == 0) {
+    TF_THROW("must have at least one line");
+  }
+
+  _graph.clear();
+  _tasks.resize(num_lines + 1);
+  _pipeflows.resize(num_lines);
+
+  reset(first, last);
+
+  _build();
+}
+
+// Function: reset
+template <typename P>
+void ScalablePipeline<P>::reset(P first, P last) {
+
+  size_t num_pipes = static_cast<size_t>(std::distance(first, last));
+
+  if(num_pipes == 0) {
+    TF_THROW("pipeline cannot be empty");
+  }
+
+  if(first->type() != PipeType::SERIAL) {
+    TF_THROW("first pipe must be serial");
+  }
+
+  _pipes.resize(num_pipes);
+
+  size_t i=0;
+  for(auto itr = first; itr != last; itr++) {
+    _pipes[i++] = itr;
+  }
+
+  _lines = std::make_unique<Line[]>(num_lines() * _pipes.size());
+
+  reset();
+}
+
+// Function: reset
+template <typename P>
+void ScalablePipeline<P>::reset() {
+
+  _num_tokens = 0;
+
+  for(size_t l = 0; l<num_lines(); l++) {
+    _pipeflows[l]._pipe = 0;
+    _pipeflows[l]._line = l;
+    _pipeflows[l]._num_deferrals = 0;
+    _pipeflows[l]._dependents.clear();
+  }
+
+  _line(0, 0).join_counter.store(0, std::memory_order_relaxed);
+
+  for(size_t l=1; l<num_lines(); l++) {
+    for(size_t f=1; f<num_pipes(); f++) {
+      _line(l, f).join_counter.store(
+        static_cast<size_t>(_pipes[f]->type()), std::memory_order_relaxed
+      );
+    }
+  }
+
+  for(size_t f=1; f<num_pipes(); f++) {
+    _line(0, f).join_counter.store(1, std::memory_order_relaxed);
+  }
+
+  for(size_t l=1; l<num_lines(); l++) {
+    _line(l, 0).join_counter.store(
+      static_cast<size_t>(_pipes[0]->type()) - 1, std::memory_order_relaxed
+    );
+  }
+  
+  assert(_ready_tokens.empty() == true);
+  _token_dependencies.clear();
+  _deferred_tokens.clear();
+}
+
+// Procedure: _on_pipe
+template <typename P>
+void ScalablePipeline<P>::_on_pipe(Pipeflow& pf, Runtime& rt) {
+    
+  using callable_t = typename pipe_t::callable_t;
+
+  if constexpr (std::is_invocable_v<callable_t, Pipeflow&>) {
+    _pipes[pf._pipe]->_callable(pf);
+  }
+  else if constexpr(std::is_invocable_v<callable_t, Pipeflow&, Runtime&>) {
+    _pipes[pf._pipe]->_callable(pf, rt);
+  }
+  else {
+    static_assert(dependent_false_v<callable_t>, "unsupported pipe callable type");
+  }
+}
+
+template <typename P>
+void ScalablePipeline<P>::_check_dependents(Pipeflow& pf) {
+  ++pf._num_deferrals;
+  
+  for (auto it = pf._dependents.begin(); it != pf._dependents.end();) {
+ 
+    // valid: the dependent token has not been generated yet (e.g., 12.defer(16))
+    if (*it >= _num_tokens) {
+      _token_dependencies[*it].push_back(pf._token);
+      _longest_deferral = std::max(_longest_deferral, *it);
+      ++it;
+    }
+    // the dependent token was already generated (e.g., 12.defer(7));
+    // it is valid only if that token is itself still deferred
+    else {
+      auto pit = _deferred_tokens.find(*it);
+      
+      // valid: the dependent token (e.g., 7) is still deferred
+      if (pit != _deferred_tokens.end()) {
+        _token_dependencies[*it].push_back(pf._token);
+        ++it;
+      }
+
+      // invalid: the dependent token has already completed, so drop it
+      else {
+        it = pf._dependents.erase(it);
+      }
+    }
+  }
+}
+
+// Procedure: _construct_deferred_tokens
+// Construct a data structure for a deferred token
+template <typename P>
+void ScalablePipeline<P>::_construct_deferred_tokens(Pipeflow& pf) {
+  
+  // construct the deferred pipeflow with zero copy
+  _deferred_tokens.emplace(
+    std::piecewise_construct,
+    std::forward_as_tuple(pf._token),
+    std::forward_as_tuple(
+      pf._token, pf._num_deferrals, std::move(pf._dependents)
+    )
+  );
+}
+
+// Procedure: _resolve_token_dependencies
+// Resolve the dependencies of tokens that defer to the current token
+template <typename P>
+void ScalablePipeline<P>::_resolve_token_dependencies(Pipeflow& pf) {
+
+  if (auto it = _token_dependencies.find(pf._token);
+      it != _token_dependencies.end()) {
+    
+    // iterate tokens that defer to pf._token
+    for(size_t target : it->second) {
+
+      auto dpf = _deferred_tokens.find(target);
+
+      assert(dpf != _deferred_tokens.end());
+
+      // erase pf._token from target's _dependents
+      dpf->second._dependents.erase(pf._token);
+      
+      // target has no dependents
+      if (dpf->second._dependents.empty()) {
+        _ready_tokens.emplace(dpf->second._token, dpf->second._num_deferrals);
+        _deferred_tokens.erase(dpf);
+      }
+    }
+
+    _token_dependencies.erase(it);
+  }
+}
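+
+// Usage sketch (illustrative only, not part of this header): how the deferral
+// helpers above are typically driven from a first-stage pipe callable. The
+// token values below are made up; pf.defer() and pf.num_deferrals() refer to
+// the tf::Pipeflow interface this file relies on.
+//
+//   tf::Pipe{tf::PipeType::SERIAL, [](tf::Pipeflow& pf) {
+//     if(pf.token() == 5 && pf.num_deferrals() == 0) {
+//       pf.defer(2);   // token 5 waits until token 2 clears its first pipe;
+//       return;        // _check_dependents/_construct_deferred_tokens park it
+//     }
+//     // when token 2 finishes, _resolve_token_dependencies moves token 5
+//     // back into _ready_tokens and this callable runs again for it
+//   }};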
+
+// Procedure: _build
+template <typename P>
+void ScalablePipeline<P>::_build() {
+
+  using namespace std::literals::string_literals;
+
+  FlowBuilder fb(_graph);
+
+  // init task
+  _tasks[0] = fb.emplace([this]() {
+    return static_cast<int>(_num_tokens % num_lines());
+  }).name("cond");
+
+  // line task
+  for(size_t l = 0; l < num_lines(); l++) {
+
+    _tasks[l + 1] = fb.emplace([this, l] (tf::Runtime& rt) mutable {
+
+      auto pf = &_pipeflows[l];
+
+      pipeline:
+
+      _line(pf->_line, pf->_pipe).join_counter.store(
+        static_cast<size_t>(_pipes[pf->_pipe]->type()), std::memory_order_relaxed
+      );
+
+      // The first pipe handles token initialization and all token-dependency bookkeeping
+      if (pf->_pipe == 0) {
+        // _ready_tokens queue is not empty
+        // substitute pf with the token at the front of the queue
+        if (!_ready_tokens.empty()) {
+          pf->_token = _ready_tokens.front().first;
+          pf->_num_deferrals = _ready_tokens.front().second;
+          _ready_tokens.pop();
+        }
+        else {
+          pf->_token = _num_tokens;
+          pf->_num_deferrals = 0;
+        }
+      
+      handle_token_dependency: 
+
+        if (pf->_stop = false, _on_pipe(*pf, rt); pf->_stop == true) {
+          // here, the pipeline is not stopped yet because other
+          // lines of tasks may still be running their last stages
+          return;
+        }
+        
+        if (_num_tokens == pf->_token) {
+          ++_num_tokens;
+        }
+      
+        if (pf->_dependents.empty() == false){ 
+          // check whether pf->_dependents still holds any valid dependencies
+          _check_dependents(*pf); 
+          
+          // at least one dependency is still valid, so this token must be deferred
+          if (pf->_dependents.size()) {
+            
+            // construct a data structure for pf in _deferred_tokens 
+            _construct_deferred_tokens(*pf);
+            goto pipeline;
+          }
+
+          // all dependencies were invalid (already completed), so rerun
+          // _on_pipe for this token on the same line
+          else {
+            goto handle_token_dependency;
+          }
+        }
+        
+        // Every token within the deferral range needs to check
+        // if it can resolve dependencies on other tokens.
+        if (pf->_token <= _longest_deferral) {
+          _resolve_token_dependencies(*pf); 
+        }
+      }
+      else {
+        _on_pipe(*pf, rt);
+      }
+
+      size_t c_f = pf->_pipe;
+      size_t n_f = (pf->_pipe + 1) % num_pipes();
+      size_t n_l = (pf->_line + 1) % num_lines();
+
+      pf->_pipe = n_f;
+
+      // ---- scheduling starts here ----
+      // Notice that the shared pipe index f must not be changed after this
+      // point, because doing so can cause a data race under the following
+      // condition:
+      //
+      // a -> b
+      // |    |
+      // v    v
+      // c -> d
+      //
+      // d will be spawned by either c or b, so if c changes f while b spawns d,
+      // a data race on f will occur
+
+      std::array<int, 2> retval;
+      size_t n = 0;
+
+      // downward dependency
+      if(_pipes[c_f]->type() == PipeType::SERIAL &&
+         _line(n_l, c_f).join_counter.fetch_sub(
+           1, std::memory_order_acq_rel) == 1
+        ) {
+        retval[n++] = 1;
+      }
+
+      // forward dependency
+      if(_line(pf->_line, n_f).join_counter.fetch_sub(
+          1, std::memory_order_acq_rel) == 1
+        ) {
+        retval[n++] = 0;
+      }
+
+      // notice that the task index starts from 1
+      switch(n) {
+        case 2: {
+          rt.schedule(_tasks[n_l+1]);
+          goto pipeline;
+        }
+        case 1: {
+          if (retval[0] == 1) {
+            pf = &_pipeflows[n_l];
+          }
+          goto pipeline;
+        }
+      }
+    }).name("rt-"s + std::to_string(l));
+
+    _tasks[0].precede(_tasks[l+1]);
+  }
+}
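+
+// Usage sketch (illustrative only, not part of this header): composing a
+// ScalablePipeline of three serial pipes and four lines into a taskflow.
+// The stop condition and line count below are made up.
+//
+//   std::vector<tf::Pipe<std::function<void(tf::Pipeflow&)>>> pipes;
+//   for(size_t i = 0; i < 3; ++i) {
+//     pipes.emplace_back(tf::PipeType::SERIAL, [](tf::Pipeflow& pf) {
+//       if(pf.pipe() == 0 && pf.token() == 100) {
+//         pf.stop();   // the first pipe stops the pipeline after 100 tokens
+//       }
+//     });
+//   }
+//
+//   tf::ScalablePipeline<decltype(pipes)::iterator> pl(4, pipes.begin(), pipes.end());
+//
+//   tf::Taskflow taskflow;
+//   tf::Executor executor;
+//   taskflow.composed_of(pl).name("pipeline");
+//   executor.run(taskflow).wait();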
+
+}  // end of namespace tf -----------------------------------------------------
+
+
+
+
+
diff --git a/myxpcs/include/taskflow_/algorithm/reduce.hpp b/myxpcs/include/taskflow_/algorithm/reduce.hpp
new file mode 100644
index 0000000..5ee492b
--- /dev/null
+++ b/myxpcs/include/taskflow_/algorithm/reduce.hpp
@@ -0,0 +1,443 @@
+#pragma once
+
+#include "launch.hpp"
+
+namespace tf {
+
+// Function: make_reduce_task
+template <typename B, typename E, typename T, typename O, typename P = GuidedPartitioner>
+TF_FORCE_INLINE auto make_reduce_task(B b, E e, T& init, O bop, P&& part = P()) {
+
+  using B_t = std::decay_t<unwrap_ref_decay_t<B>>;
+  using E_t = std::decay_t<unwrap_ref_decay_t<E>>;
+  using namespace std::string_literals;
+
+  return 
+  [b, e, &r=init, bop, part=std::forward<P>(part)] (Runtime& rt) mutable {
+
+    // fetch the iterator values
+    B_t beg = b;
+    E_t end = e;
+
+    size_t W = rt.executor().num_workers();
+    size_t N = std::distance(beg, end);
+
+    // only myself - no need to spawn another graph
+    if(W <= 1 || N <= part.chunk_size()) {
+      for(; beg!=end; r = bop(r, *beg++));
+      return;
+    }
+
+    if(N < W) {
+      W = N;
+    }
+
+    std::mutex mtx;
+
+    // static partitioner
+    if constexpr(std::is_same_v<std::decay_t<P>, StaticPartitioner>) {
+      
+      size_t chunk_size;
+
+      for(size_t w=0, curr_b=0; w<W && curr_b < N; ++w, curr_b += chunk_size) {
+        
+        // we force the chunk size to be at least two so the temporary variable
+        // sum can be seeded from the first two elements without an extra copy
+        chunk_size = std::max(size_t{2}, part.adjusted_chunk_size(N, W, w));
+        
+        launch_loop(W, w, rt, [=, &bop, &mtx, &r, &part] () mutable {
+
+          std::advance(beg, curr_b);
+
+          if(N - curr_b == 1) {
+            std::lock_guard<std::mutex> lock(mtx);
+            r = bop(r, *beg);
+            return;
+          }
+
+          auto beg1 = beg++;
+          auto beg2 = beg++;
+          T sum = bop(*beg1, *beg2);
+        
+          // loop reduce
+          part.loop(N, W, curr_b, chunk_size,
+            [&, prev_e=curr_b+2](size_t part_b, size_t part_e) mutable {
+
+              if(part_b > prev_e) {
+                std::advance(beg, part_b - prev_e);
+              }
+              else {
+                part_b = prev_e;
+              }
+
+              for(size_t x=part_b; x<part_e; x++, beg++) {
+                sum = bop(sum, *beg);
+              }
+              prev_e = part_e;
+            }
+          ); 
+          
+          // final reduce
+          std::lock_guard<std::mutex> lock(mtx);
+          r = bop(r, sum);
+
+        });
+      }
+      rt.corun_all();
+    }
+    // dynamic partitioner
+    else {
+      std::atomic<size_t> next(0);
+      launch_loop(N, W, rt, next, part, [=, &bop, &mtx, &next, &r, &part] () mutable {
+        // pre-reduce
+        size_t s0 = next.fetch_add(2, std::memory_order_relaxed);
+
+        if(s0 >= N) {
+          return;
+        }
+
+        std::advance(beg, s0);
+
+        if(N - s0 == 1) {
+          std::lock_guard<std::mutex> lock(mtx);
+          r = bop(r, *beg);
+          return;
+        }
+
+        auto beg1 = beg++;
+        auto beg2 = beg++;
+
+        T sum = bop(*beg1, *beg2);
+        
+        // loop reduce
+        part.loop(N, W, next, 
+          [&, prev_e=s0+2](size_t curr_b, size_t curr_e) mutable {
+            std::advance(beg, curr_b - prev_e);
+            for(size_t x=curr_b; x<curr_e; x++, beg++) {
+              sum = bop(sum, *beg);
+            }
+            prev_e = curr_e;
+          }
+        ); 
+        
+        // final reduce
+        std::lock_guard<std::mutex> lock(mtx);
+        r = bop(r, sum);
+      });
+    }
+  };
+}
+
+// Function: make_transform_reduce_task
+template <
+  typename B, typename E, typename T, typename BOP, typename UOP, 
+  typename P = GuidedPartitioner
+>
+TF_FORCE_INLINE auto make_transform_reduce_task(
+  B b, E e, T& init, BOP bop, UOP uop, P&& part = P()
+) {
+
+  using B_t = std::decay_t<unwrap_ref_decay_t<B>>;
+  using E_t = std::decay_t<unwrap_ref_decay_t<E>>;
+  using namespace std::string_literals;
+
+  return [b, e, &r=init, bop, uop, part=std::forward<P>(part)] (Runtime& rt) mutable {
+
+    // fetch the iterator values
+    B_t beg = b;
+    E_t end = e;
+
+    size_t W = rt.executor().num_workers();
+    size_t N = std::distance(beg, end);
+
+    // only myself - no need to spawn another graph
+    if(W <= 1 || N <= part.chunk_size()) {
+      for(; beg!=end; r = bop(std::move(r), uop(*beg++)));
+      return;
+    }
+
+    if(N < W) {
+      W = N;
+    }
+
+    std::mutex mtx;
+    
+    // static partitioner
+    if constexpr(std::is_same_v<std::decay_t<P>, StaticPartitioner>) {
+      
+      size_t chunk_size;
+
+      for(size_t w=0, curr_b=0; w<W && curr_b < N; ++w, curr_b += chunk_size) {
+      
+        chunk_size = part.adjusted_chunk_size(N, W, w);
+
+        launch_loop(W, w, rt, [=, &bop, &uop, &mtx, &r, &part] () mutable {
+
+          std::advance(beg, curr_b);
+
+          if(N - curr_b == 1) {
+            std::lock_guard<std::mutex> lock(mtx);
+            r = bop(std::move(r), uop(*beg));
+            return;
+          }
+
+          //auto beg1 = beg++;
+          //auto beg2 = beg++;
+          //T sum = bop(uop(*beg1), uop(*beg2));
+
+          T sum = (chunk_size == 1) ? uop(*beg++) : bop(uop(*beg++), uop(*beg++));
+        
+          // loop reduce
+          part.loop(N, W, curr_b, chunk_size,
+            [&, prev_e=curr_b+(chunk_size == 1 ? 1 : 2)]
+            (size_t part_b, size_t part_e) mutable {
+              if(part_b > prev_e) {
+                std::advance(beg, part_b - prev_e);
+              }
+              else {
+                part_b = prev_e;
+              }
+              for(size_t x=part_b; x<part_e; x++, beg++) {
+                sum = bop(std::move(sum), uop(*beg));
+              }
+              prev_e = part_e;
+            }
+          ); 
+          
+          // final reduce
+          std::lock_guard<std::mutex> lock(mtx);
+          r = bop(std::move(r), std::move(sum));
+
+        });
+      }
+      
+      rt.corun_all();
+    }
+    // dynamic partitioner
+    else {
+      std::atomic<size_t> next(0);
+        
+      launch_loop(N, W, rt, next, part, [=, &bop, &uop, &mtx, &next, &r, &part] () mutable {
+
+        // pre-reduce
+        size_t s0 = next.fetch_add(2, std::memory_order_relaxed);
+
+        if(s0 >= N) {
+          return;
+        }
+
+        std::advance(beg, s0);
+
+        if(N - s0 == 1) {
+          std::lock_guard<std::mutex> lock(mtx);
+          r = bop(std::move(r), uop(*beg));
+          return;
+        }
+
+        auto beg1 = beg++;
+        auto beg2 = beg++;
+
+        T sum = bop(uop(*beg1), uop(*beg2));
+        
+        // loop reduce
+        part.loop(N, W, next, 
+          [&, prev_e=s0+2](size_t curr_b, size_t curr_e) mutable {
+            std::advance(beg, curr_b - prev_e);
+            for(size_t x=curr_b; x<curr_e; x++, beg++) {
+              sum = bop(std::move(sum), uop(*beg));
+            }
+            prev_e = curr_e;
+          }
+        ); 
+        
+        // final reduce
+        std::lock_guard<std::mutex> lock(mtx);
+        r = bop(std::move(r), std::move(sum));
+      });
+    }
+  };
+}
+
+// Function: make_transform_reduce_task with two binary operators
+template <
+  typename B1, typename E1, typename B2, typename T, typename BOP_R, typename BOP_T, 
+  typename P = GuidedPartitioner,
+  std::enable_if_t<!is_partitioner_v<std::decay_t<BOP_T>>, void>* = nullptr
+>
+TF_FORCE_INLINE auto make_transform_reduce_task(
+  B1 b1, E1 e1, B2 b2, T& init, BOP_R bop_r, BOP_T bop_t, P&& part = P()
+) {
+
+  using B1_t = std::decay_t<unwrap_ref_decay_t<B1>>;
+  using E1_t = std::decay_t<unwrap_ref_decay_t<E1>>;
+  using B2_t = std::decay_t<unwrap_ref_decay_t<B2>>;
+  using namespace std::string_literals;
+
+  return 
+  [b1, e1, b2, &r=init, bop_r, bop_t, part=std::forward<P>(part)] 
+  (Runtime& rt) mutable {
+
+    // fetch the iterator values
+    B1_t beg1 = b1;
+    E1_t end1 = e1;
+    B2_t beg2 = b2; 
+
+    size_t W = rt.executor().num_workers();
+    size_t N = std::distance(beg1, end1);
+
+    // only myself - no need to spawn another graph
+    if(W <= 1 || N <= part.chunk_size()) {
+      for(; beg1!=end1; r = bop_r(std::move(r), bop_t(*beg1++, *beg2++)));
+      return;
+    }   
+
+    if(N < W) {
+      W = N;
+    }   
+
+    std::mutex mtx;
+    
+    // static partitioner
+    if constexpr(std::is_same_v<std::decay_t<P>, StaticPartitioner>) {
+    
+      size_t chunk_size;
+
+      for(size_t w=0, curr_b=0; w<W && curr_b < N; ++w, curr_b += chunk_size) {
+    
+        chunk_size = part.adjusted_chunk_size(N, W, w); 
+
+        launch_loop(W, w, rt, [=, &bop_r, &bop_t, &mtx, &r, &part] () mutable {
+
+          std::advance(beg1, curr_b);
+          std::advance(beg2, curr_b);
+
+          if(N - curr_b == 1) {
+            std::lock_guard<std::mutex> lock(mtx);
+            r = bop_r(std::move(r), bop_t(*beg1, *beg2));
+            return;
+          }   
+
+          T sum = (chunk_size == 1) ? bop_t(*beg1++, *beg2++) : 
+            bop_r(bop_t(*beg1++, *beg2++), bop_t(*beg1++, *beg2++));
+    
+          // loop reduce
+          part.loop(N, W, curr_b, chunk_size,
+            [&, prev_e=curr_b+(chunk_size == 1 ? 1 : 2)] 
+            (size_t part_b, size_t part_e) mutable {
+              if(part_b > prev_e) {
+                std::advance(beg1, part_b - prev_e);
+                std::advance(beg2, part_b - prev_e);
+              }   
+              else {
+                part_b = prev_e;
+              }   
+              for(size_t x=part_b; x<part_e; x++, beg1++, beg2++) { 
+                sum = bop_r(std::move(sum), bop_t(*beg1, *beg2));
+              }   
+              prev_e = part_e;
+            }   
+          );  
+    
+          // final reduce
+          std::lock_guard<std::mutex> lock(mtx);
+          r = bop_r(std::move(r), std::move(sum));
+
+        }); 
+      }   
+    
+      rt.corun_all();
+    }   
+    // dynamic partitioner
+    else {
+      std::atomic<size_t> next(0);
+    
+      launch_loop(N, W, rt, next, part, [=, &bop_r, &bop_t, &mtx, &next, &r, &part] () mutable {
+
+        // pre-reduce
+        size_t s0 = next.fetch_add(2, std::memory_order_relaxed);
+
+        if(s0 >= N) {
+          return;
+        }   
+
+        std::advance(beg1, s0);
+        std::advance(beg2, s0);
+
+        if(N - s0 == 1) {
+          std::lock_guard<std::mutex> lock(mtx);
+          r = bop_r(std::move(r), bop_t(*beg1, *beg2));
+          return;
+        }   
+
+        auto beg11 = beg1++;
+        auto beg12 = beg1++;
+        auto beg21 = beg2++;
+        auto beg22 = beg2++;
+
+        T sum = bop_r(bop_t(*beg11, *beg21), bop_t(*beg12, *beg22));
+
+        // loop reduce
+        part.loop(N, W, next, 
+          [&, prev_e=s0+2](size_t curr_b, size_t curr_e) mutable {
+            std::advance(beg1, curr_b - prev_e);
+            std::advance(beg2, curr_b - prev_e);
+            for(size_t x=curr_b; x<curr_e; x++, beg1++, beg2++) {
+              sum = bop_r(std::move(sum), bop_t(*beg1, *beg2));
+            }   
+            prev_e = curr_e;
+          }   
+        );  
+    
+        // final reduce
+        std::lock_guard<std::mutex> lock(mtx);
+        r = bop_r(std::move(r), std::move(sum));
+      }); 
+    }   
+  };  
+}
+
+// ----------------------------------------------------------------------------
+// default reduction
+// ----------------------------------------------------------------------------
+
+// Function: reduce
+template <typename B, typename E, typename T, typename O, typename P>
+Task FlowBuilder::reduce(B beg, E end, T& init, O bop, P&& part) {
+  return emplace(make_reduce_task(beg, end, init, bop, std::forward<P>(part)));
+}
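+
+// Usage sketch (illustrative only): summing a vector through the reduce
+// interface defined above; the data and the lambda are made up.
+//
+//   std::vector<int> data(1000000, 1);
+//   int sum = 0;
+//   tf::Taskflow taskflow;
+//   tf::Executor executor;
+//   taskflow.reduce(data.begin(), data.end(), sum,
+//                   [](int a, int b) { return a + b; });
+//   executor.run(taskflow).wait();   // sum == 1000000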
+
+// ----------------------------------------------------------------------------
+// default transform and reduction
+// ----------------------------------------------------------------------------
+
+// Function: transform_reduce
+template <typename B, typename E, typename T, typename BOP, typename UOP, typename P,
+  std::enable_if_t<is_partitioner_v<std::decay_t<P>>, void>*
+>
+Task FlowBuilder::transform_reduce(
+  B beg, E end, T& init, BOP bop, UOP uop, P&& part
+) {
+  return emplace(make_transform_reduce_task(
+    beg, end, init, bop, uop, std::forward<P>(part)
+  ));
+}
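+
+// Usage sketch (illustrative only): transform-reduce that sums the squares of
+// a vector; the values are made up.
+//
+//   std::vector<double> v{1.0, 2.0, 3.0};
+//   double sum_sq = 0.0;
+//   taskflow.transform_reduce(v.begin(), v.end(), sum_sq,
+//     [](double a, double b) { return a + b; },   // BOP: combine partial results
+//     [](double x) { return x * x; }              // UOP: transform each element
+//   );
+//   // after executor.run(taskflow).wait(): sum_sq == 14.0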
+
+// Function: transform_reduce
+template <
+  typename B1, typename E1, typename B2, typename T, typename BOP_R, typename BOP_T, 
+  typename P,
+  std::enable_if_t<!is_partitioner_v<std::decay_t<BOP_T>>, void>*
+>
+Task FlowBuilder::transform_reduce(
+  B1 beg1, E1 end1, B2 beg2, T& init, BOP_R bop_r, BOP_T bop_t, P&& part
+) {
+  return emplace(make_transform_reduce_task(
+    beg1, end1, beg2, init, bop_r, bop_t, std::forward<P>(part)
+  ));
+}
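+
+// Usage sketch (illustrative only): the two-range overload above computes an
+// inner product of two equally sized ranges; the values are made up.
+//
+//   std::vector<int> a{1, 2, 3}, b{4, 5, 6};
+//   int dot = 0;
+//   taskflow.transform_reduce(a.begin(), a.end(), b.begin(), dot,
+//     [](int x, int y) { return x + y; },   // BOP_R: reduce partial results
+//     [](int x, int y) { return x * y; }    // BOP_T: combine element pairs
+//   );
+//   // after running the taskflow: dot == 1*4 + 2*5 + 3*6 == 32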
+
+}  // end of namespace tf -----------------------------------------------------
+
+
+
+
diff --git a/myxpcs/include/taskflow_/algorithm/scan.hpp b/myxpcs/include/taskflow_/algorithm/scan.hpp
new file mode 100644
index 0000000..5a7f01b
--- /dev/null
+++ b/myxpcs/include/taskflow_/algorithm/scan.hpp
@@ -0,0 +1,617 @@
+#pragma once
+
+#include "launch.hpp"
+
+namespace tf {
+
+namespace detail {
+
+// Function: scan_loop
+template <typename Iterator, typename BufferT, typename B>
+TF_FORCE_INLINE void scan_loop(
+  tf::Runtime& rt,
+  std::atomic<size_t>& counter, 
+  BufferT& buf, 
+  B&& bop, 
+  Iterator d_beg, 
+  size_t W,
+  size_t w, 
+  size_t chunk_size
+){
+  // whoever finishes last performs the global scan over the per-worker block sums
+  if(counter.fetch_add(1, std::memory_order_acq_rel) == W-1) {
+    for(size_t i=1; i<buf.size(); i++) {
+      buf[i].data = bop(buf[i-1].data, buf[i].data);
+    }
+    counter.store(0, std::memory_order_release);
+  }
+
+  // the first worker has nothing left to add
+  if(w==0) {
+    return;
+  } 
+
+  // corun on the executor because multiple workers can call this concurrently
+  rt.executor().corun_until([&counter](){
+    return counter.load(std::memory_order_acquire) == 0;
+  });
+  
+  // add the preceding blocks' running total to every element of this block
+  for(size_t i=0; i<chunk_size; i++) {
+    *d_beg++ = bop(buf[w-1].data, *d_beg);
+  }
+}
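+
+// Worked example (illustrative): with W = 2 workers over {1, 2, 3, 4}, each
+// worker first writes its local inclusive scan ({1, 3} and {3, 7}) and keeps
+// its block total in buf ({3, 7}). The last worker to arrive turns buf into
+// prefix sums ({3, 10}); worker 0 is already done, and worker 1 adds
+// buf[0] = 3 to its chunk {3, 7}, giving the global result {1, 3, 6, 10}.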
+
+}  // end of namespace tf::detail ---------------------------------------------
+
+
+// Function: make_inclusive_scan_task
+template <typename B, typename E, typename D, typename BOP>
+TF_FORCE_INLINE auto make_inclusive_scan_task(B first, E last, D d_first, BOP bop) {
+  
+  using B_t = std::decay_t<unwrap_ref_decay_t<B>>;
+  using E_t = std::decay_t<unwrap_ref_decay_t<E>>;
+  using D_t = std::decay_t<unwrap_ref_decay_t<D>>;
+  using value_type = typename std::iterator_traits<B_t>::value_type;
+  using namespace std::string_literals;
+  
+  return [=] (Runtime& rt) mutable {
+
+    // fetch the stateful values
+    B_t s_beg = first;
+    E_t s_end = last;
+    D_t d_beg = d_first;
+
+    if(s_beg == s_end) {
+      return;
+    }
+
+    size_t W = rt.executor().num_workers();
+    size_t N = std::distance(s_beg, s_end);
+
+    // only myself - no need to spawn another graph
+    if(W <= 1 || N <= 2) {
+      std::inclusive_scan(s_beg, s_end, d_beg, bop);
+      return;
+    }
+
+    if(N < W) {
+      W = N;
+    }
+    
+    std::vector<CachelineAligned<value_type>> buf(W);
+    std::atomic<size_t> counter(0);
+
+    size_t Q = N/W;
+    size_t R = N%W;
+    
+    //auto orig_d_beg = d_beg;
+    //ExecutionPolicy<StaticPartitioner> policy;
+
+    for(size_t w=0, curr_b=0, chunk_size; w<W && curr_b < N; ++w) {
+
+      chunk_size = std::min(Q + (w < R), N - curr_b);
+
+      // block scan
+      launch_loop(W, w, rt, [=, &rt, &bop, &buf, &counter] () mutable {
+
+        auto result = d_beg;
+
+        // local scan per worker
+        auto& init = buf[w].data;
+        *d_beg++ = init = *s_beg++;
+
+        for(size_t i=1; i<chunk_size; i++){
+          *d_beg++ = init = bop(init, *s_beg++); 
+        }
+
+        // block scan
+        detail::scan_loop(rt, counter, buf, bop, result, W, w, chunk_size);
+        
+        //size_t offset = R ? Q + 1 : Q;
+        //size_t rest   = N - offset;
+        //size_t rest_Q = rest / W;
+        //size_t rest_R = rest % W;
+        //
+        //chunk_size = policy.chunk_size() == 0 ? 
+        //             rest_Q + (w < rest_R) : policy.chunk_size();
+        //
+        //size_t curr_b = policy.chunk_size() == 0 ? 
+        //                offset + (w<rest_R ? w*(rest_Q + 1) : rest_R + w*rest_Q) :
+        //                offset + w*policy.chunk_size();
+
+        //policy(N, W, curr_b, chunk_size,
+        //  [&, prev_e=size_t{0}](size_t curr_b, size_t curr_e) mutable {
+        //    std::advance(orig_d_beg, curr_b - prev_e);
+        //    for(size_t x = curr_b; x<curr_e; x++) {
+        //      size_t j = x < (Q+1)*R ? x/(Q+1) : (x-(Q+1)*R)/Q + R;
+        //      *orig_d_beg++ = bop(buf[j-1].data, *orig_d_beg);
+        //    }
+        //    prev_e = curr_e;
+        //  }
+        //);
+      });
+      
+      std::advance(s_beg, chunk_size);
+      std::advance(d_beg, chunk_size);
+      curr_b += chunk_size;
+    }
+
+    rt.corun_all();
+  };
+}
+
+// Function: make_inclusive_scan_task
+template <typename B, typename E, typename D, typename BOP, typename T>
+TF_FORCE_INLINE auto make_inclusive_scan_task(B first, E last, D d_first, BOP bop, T init) {
+  
+  using B_t = std::decay_t<unwrap_ref_decay_t<B>>;
+  using E_t = std::decay_t<unwrap_ref_decay_t<E>>;
+  using D_t = std::decay_t<unwrap_ref_decay_t<D>>;
+  using value_type = typename std::iterator_traits<B_t>::value_type;
+  using namespace std::string_literals;
+  
+  return [=] (Runtime& rt) mutable {
+
+    // fetch the stateful values
+    B_t s_beg = first;
+    E_t s_end = last;
+    D_t d_beg = d_first;
+
+    if(s_beg == s_end) {
+      return;
+    }
+
+    size_t W = rt.executor().num_workers();
+    size_t N = std::distance(s_beg, s_end);
+
+    // only myself - no need to spawn another graph
+    if(W <= 1 || N <= 2) {
+      std::inclusive_scan(s_beg, s_end, d_beg, bop, init);
+      return;
+    }
+
+    if(N < W) {
+      W = N;
+    }
+    
+    std::vector<CachelineAligned<value_type>> buf(W);
+    std::atomic<size_t> counter(0);
+    
+    // set up the initial value for the first worker
+    buf[0].data = std::move(init);
+
+    size_t Q = N/W;
+    size_t R = N%W;
+
+    for(size_t w=0, curr_b=0, chunk_size; w<W && curr_b < N; ++w) {
+
+      chunk_size = std::min(Q + (w < R), N - curr_b);
+
+      // block scan
+      launch_loop(W, w, rt, [=, &rt, &bop, &buf, &counter] () mutable {
+
+        auto result = d_beg;
+
+        // local scan per worker
+        auto& local = buf[w].data;
+        *d_beg++ = local = (w == 0) ? bop(local, *s_beg++) : *s_beg++;
+
+        for(size_t i=1; i<chunk_size; i++){
+          *d_beg++ = local = bop(local, *s_beg++); 
+        }
+        
+        // block scan
+        detail::scan_loop(rt, counter, buf, bop, result, W, w, chunk_size);
+      });
+
+      std::advance(s_beg, chunk_size);
+      std::advance(d_beg, chunk_size);
+      curr_b += chunk_size;
+    }
+
+    rt.corun_all();
+  };
+}
+
+// ----------------------------------------------------------------------------
+// Transform Inclusive Scan
+// ----------------------------------------------------------------------------
+
+// Function: make_transform_inclusive_scan_task
+template <typename B, typename E, typename D, typename BOP, typename UOP>
+TF_FORCE_INLINE auto make_transform_inclusive_scan_task(
+  B first, E last, D d_first, BOP bop, UOP uop
+) {
+  
+  using B_t = std::decay_t<unwrap_ref_decay_t<B>>;
+  using E_t = std::decay_t<unwrap_ref_decay_t<E>>;
+  using D_t = std::decay_t<unwrap_ref_decay_t<D>>;
+  using value_type = typename std::iterator_traits<B_t>::value_type;
+  using namespace std::string_literals;
+  
+  return [=] (Runtime& rt) mutable {
+
+    // fetch the stateful values
+    B_t s_beg = first;
+    E_t s_end = last;
+    D_t d_beg = d_first;
+
+    if(s_beg == s_end) {
+      return;
+    }
+
+    size_t W = rt.executor().num_workers();
+    size_t N = std::distance(s_beg, s_end);
+
+    // only myself - no need to spawn another graph
+    if(W <= 1 || N <= 2) {
+      std::transform_inclusive_scan(s_beg, s_end, d_beg, bop, uop);
+      return;
+    }
+
+    if(N < W) {
+      W = N;
+    }
+    
+    std::vector<CachelineAligned<value_type>> buf(W);
+    std::atomic<size_t> counter(0);
+
+    size_t Q = N/W;
+    size_t R = N%W;
+    
+    for(size_t w=0, curr_b=0, chunk_size; w<W && curr_b < N; ++w) {
+
+      chunk_size = std::min(Q + (w < R), N - curr_b);
+
+      // block scan
+      launch_loop(W, w, rt, [=, &rt, &bop, &uop, &buf, &counter] () mutable {
+
+        auto result = d_beg;
+
+        // local scan per worker
+        auto& init = buf[w].data;
+        *d_beg++ = init = uop(*s_beg++);
+
+        for(size_t i=1; i<chunk_size; i++){
+          *d_beg++ = init = bop(init, uop(*s_beg++)); 
+        }
+
+        // block scan
+        detail::scan_loop(rt, counter, buf, bop, result, W, w, chunk_size);
+      });
+      
+      std::advance(s_beg, chunk_size);
+      std::advance(d_beg, chunk_size);
+      curr_b += chunk_size;
+    }
+
+    rt.corun_all();
+  };
+}
+
+// Function: make_transform_inclusive_scan_task
+template <typename B, typename E, typename D, typename BOP, typename UOP, typename T>
+TF_FORCE_INLINE auto make_transform_inclusive_scan_task(
+  B first, E last, D d_first, BOP bop, UOP uop, T init
+) {
+  
+  using B_t = std::decay_t<unwrap_ref_decay_t<B>>;
+  using E_t = std::decay_t<unwrap_ref_decay_t<E>>;
+  using D_t = std::decay_t<unwrap_ref_decay_t<D>>;
+  using value_type = typename std::iterator_traits<B_t>::value_type;
+  using namespace std::string_literals;
+  
+  return [=] (Runtime& rt) mutable {
+
+    // fetch the stateful values
+    B_t s_beg = first;
+    E_t s_end = last;
+    D_t d_beg = d_first;
+
+    if(s_beg == s_end) {
+      return;
+    }
+
+    size_t W = rt.executor().num_workers();
+    size_t N = std::distance(s_beg, s_end);
+
+    // only myself - no need to spawn another graph
+    if(W <= 1 || N <= 2) {
+      std::transform_inclusive_scan(s_beg, s_end, d_beg, bop, uop, init);
+      return;
+    }
+
+    if(N < W) {
+      W = N;
+    }
+    
+    std::vector<CachelineAligned<value_type>> buf(W);
+    std::atomic<size_t> counter(0);
+    
+    // set up the initial value for the first worker
+    buf[0].data = std::move(init);
+
+    size_t Q = N/W;
+    size_t R = N%W;
+
+    for(size_t w=0, curr_b=0, chunk_size; w<W && curr_b < N; ++w) {
+
+      chunk_size = std::min(Q + (w < R), N - curr_b);
+
+      // block scan
+      launch_loop(W, w, rt, [=, &rt, &bop, &uop, &buf, &counter] () mutable {
+
+        auto result = d_beg;
+
+        // local scan per worker
+        auto& local = buf[w].data;
+        *d_beg++ = local = (w == 0) ? bop(local, uop(*s_beg++)) : uop(*s_beg++);
+
+        for(size_t i=1; i<chunk_size; i++){
+          *d_beg++ = local = bop(local, uop(*s_beg++)); 
+        }
+        
+        // block scan
+        detail::scan_loop(rt, counter, buf, bop, result, W, w, chunk_size);
+      });
+
+      std::advance(s_beg, chunk_size);
+      std::advance(d_beg, chunk_size);
+      curr_b += chunk_size;
+    }
+
+    rt.corun_all();
+    
+  };
+}
+
+// ----------------------------------------------------------------------------
+// Exclusive Scan
+// ----------------------------------------------------------------------------
+
+// Function: make_exclusive_scan_task
+template <typename B, typename E, typename D, typename T, typename BOP>
+TF_FORCE_INLINE auto make_exclusive_scan_task(
+  B first, E last, D d_first, T init, BOP bop
+) {
+  
+  using B_t = std::decay_t<unwrap_ref_decay_t<B>>;
+  using E_t = std::decay_t<unwrap_ref_decay_t<E>>;
+  using D_t = std::decay_t<unwrap_ref_decay_t<D>>;
+  using value_type = typename std::iterator_traits<B_t>::value_type;
+  using namespace std::string_literals;
+  
+  return [=] (Runtime& rt) mutable {
+
+    // fetch the stateful values
+    B_t s_beg = first;
+    E_t s_end = last;
+    D_t d_beg = d_first;
+
+    if(s_beg == s_end) {
+      return;
+    }
+
+    size_t W = rt.executor().num_workers();
+    size_t N = std::distance(s_beg, s_end);
+
+    // only myself - no need to spawn another graph
+    if(W <= 1 || N <= 2) {
+      std::exclusive_scan(s_beg, s_end, d_beg, init, bop);
+      return;
+    }
+
+    if(N < W) {
+      W = N;
+    }
+    
+    std::vector<CachelineAligned<value_type>> buf(W);
+    std::atomic<size_t> counter(0);
+
+    size_t Q = N/W;
+    size_t R = N%W;
+
+    // seed each worker: init for the first worker, otherwise the last element
+    // of the previous worker's chunk
+    auto s_beg_temp = s_beg;
+    for(size_t w=0, curr_b=0, chunk_size; w<W && curr_b < N; ++w) {
+      chunk_size = std::min(Q + (w<R), N - curr_b);  
+      buf[w].data = w ? *s_beg_temp : std::move(init);
+      std::advance(s_beg_temp, chunk_size - !w);
+      curr_b += chunk_size;
+    }
+    
+    for(size_t w=0, curr_b=0, chunk_size; w<W && curr_b < N; ++w) {
+
+      chunk_size = std::min(Q + (w < R), N - curr_b);
+
+      // block scan
+      launch_loop(W, w, rt, [=, &rt, &bop, &buf, &counter] () mutable {
+
+        auto result = d_beg;
+
+        // local scan per worker
+        auto& local = buf[w].data;
+
+        for(size_t i=1; i<chunk_size; i++) {
+          auto v = local;
+          local = bop(local, *s_beg++);
+          *d_beg++ = std::move(v);
+        }
+        *d_beg++ = local;
+        
+        // block scan
+        detail::scan_loop(rt, counter, buf, bop, result, W, w, chunk_size);
+      });
+      
+      std::advance(s_beg, chunk_size);
+      std::advance(d_beg, chunk_size);
+      curr_b += chunk_size;
+    }
+
+    rt.corun_all();
+    
+  };
+}
+
+// ----------------------------------------------------------------------------
+// Transform Exclusive Scan
+// ----------------------------------------------------------------------------
+
+// Function: make_transform_exclusive_scan_task
+template <typename B, typename E, typename D, typename T, typename BOP, typename UOP>
+TF_FORCE_INLINE auto make_transform_exclusive_scan_task(
+  B first, E last, D d_first, T init, BOP bop, UOP uop
+) {
+  
+  using B_t = std::decay_t<unwrap_ref_decay_t<B>>;
+  using E_t = std::decay_t<unwrap_ref_decay_t<E>>;
+  using D_t = std::decay_t<unwrap_ref_decay_t<D>>;
+  using value_type = typename std::iterator_traits<B_t>::value_type;
+  using namespace std::string_literals;
+  
+  return [=] (Runtime& rt) mutable {
+
+    // fetch the stateful values
+    B_t s_beg = first;
+    E_t s_end = last;
+    D_t d_beg = d_first;
+
+    if(s_beg == s_end) {
+      return;
+    }
+
+    size_t W = rt.executor().num_workers();
+    size_t N = std::distance(s_beg, s_end);
+
+    // only myself - no need to spawn another graph
+    if(W <= 1 || N <= 2) {
+      std::transform_exclusive_scan(s_beg, s_end, d_beg, init, bop, uop);
+      return;
+    }
+
+    if(N < W) {
+      W = N;
+    }
+    
+    std::vector<CachelineAligned<value_type>> buf(W);
+    std::atomic<size_t> counter(0);
+
+    size_t Q = N/W;
+    size_t R = N%W;
+
+    // seed each worker: init for the first worker, otherwise the transformed
+    // last element of the previous worker's chunk
+    auto s_beg_temp = s_beg;
+    for(size_t w=0, curr_b=0, chunk_size; w<W && curr_b < N; ++w) {
+      chunk_size = std::min(Q + (w<R), N - curr_b);  
+      buf[w].data = w ? uop(*s_beg_temp) : std::move(init);
+      std::advance(s_beg_temp, chunk_size - !w);
+      curr_b += chunk_size;
+    }
+    
+    for(size_t w=0, curr_b=0, chunk_size; w<W && curr_b < N; ++w) {
+
+      chunk_size = std::min(Q + (w < R), N - curr_b);
+
+      // block scan
+      launch_loop(W, w, rt, [=, &rt, &bop, &uop, &buf, &counter] () mutable {
+
+        auto result = d_beg;
+
+        // local scan per worker
+        auto& local = buf[w].data;
+
+        for(size_t i=1; i<chunk_size; i++) {
+          auto v = local;
+          local = bop(local, uop(*s_beg++));
+          *d_beg++ = std::move(v);
+        }
+        *d_beg++ = local;
+        
+        // block scan
+        detail::scan_loop(rt, counter, buf, bop, result, W, w, chunk_size);
+      });
+      
+      std::advance(s_beg, chunk_size);
+      std::advance(d_beg, chunk_size);
+      curr_b += chunk_size;
+    }
+
+    rt.corun_all();
+    
+  };
+}
+
+
+// ----------------------------------------------------------------------------
+// Inclusive Scan
+// ----------------------------------------------------------------------------
+
+// Function: inclusive_scan
+template <typename B, typename E, typename D, typename BOP>
+Task FlowBuilder::inclusive_scan(B first, E last, D d_first, BOP bop) {
+  return emplace(make_inclusive_scan_task(
+    first, last, d_first, bop
+  ));
+}
+
+// Function: inclusive_scan
+template <typename B, typename E, typename D, typename BOP, typename T>
+Task FlowBuilder::inclusive_scan(B first, E last, D d_first, BOP bop, T init) {
+  return emplace(make_inclusive_scan_task(
+    first, last, d_first, bop, init
+  ));
+}
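+
+// Usage sketch (illustrative only): inclusive scans into a separate output
+// range; the input values and the extra initial value are made up.
+//
+//   std::vector<int> in{1, 2, 3, 4}, out(4);
+//   taskflow.inclusive_scan(in.begin(), in.end(), out.begin(), std::plus<int>{});
+//   // after running the taskflow: out == {1, 3, 6, 10}
+//
+//   taskflow.inclusive_scan(in.begin(), in.end(), out.begin(), std::plus<int>{}, 10);
+//   // with the initial value 10:  out == {11, 13, 16, 20}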
+
+// ----------------------------------------------------------------------------
+// Transform Inclusive Scan
+// ----------------------------------------------------------------------------
+
+// Function: transform_inclusive_scan
+template <typename B, typename E, typename D, typename BOP, typename UOP>
+Task FlowBuilder::transform_inclusive_scan(
+  B first, E last, D d_first, BOP bop, UOP uop
+) {
+  return emplace(make_transform_inclusive_scan_task(
+    first, last, d_first, bop, uop
+  ));
+}
+
+// Function: transform_inclusive_scan
+template <typename B, typename E, typename D, typename BOP, typename UOP, typename T>
+Task FlowBuilder::transform_inclusive_scan(
+  B first, E last, D d_first, BOP bop, UOP uop, T init
+) {
+  return emplace(make_transform_inclusive_scan_task(
+    first, last, d_first, bop, uop, init
+  ));  
+}
+
+// ----------------------------------------------------------------------------
+// Exclusive Scan
+// ----------------------------------------------------------------------------
+
+// Function: exclusive_scan
+template <typename B, typename E, typename D, typename T, typename BOP>
+Task FlowBuilder::exclusive_scan(B first, E last, D d_first, T init, BOP bop) {
+  return emplace(make_exclusive_scan_task(
+    first, last, d_first, init, bop
+  ));
+}
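+
+// Usage sketch (illustrative only): an exclusive scan shifts the running sums
+// right by one and seeds the first output with init; the values are made up.
+//
+//   std::vector<int> in{1, 2, 3, 4}, out(4);
+//   taskflow.exclusive_scan(in.begin(), in.end(), out.begin(), 0, std::plus<int>{});
+//   // after running the taskflow: out == {0, 1, 3, 6}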
+
+// ----------------------------------------------------------------------------
+// Transform Exclusive Scan
+// ----------------------------------------------------------------------------
+
+// Function: transform_exclusive_scan
+template <typename B, typename E, typename D, typename T, typename BOP, typename UOP>
+Task FlowBuilder::transform_exclusive_scan(
+  B first, E last, D d_first, T init, BOP bop, UOP uop
+) {
+  return emplace(make_transform_exclusive_scan_task(
+    first, last, d_first, init, bop, uop
+  )); 
+}
+
+}  // end of namespace tf -----------------------------------------------------
+
diff --git a/myxpcs/include/taskflow_/algorithm/sort.hpp b/myxpcs/include/taskflow_/algorithm/sort.hpp
new file mode 100644
index 0000000..4460f8f
--- /dev/null
+++ b/myxpcs/include/taskflow_/algorithm/sort.hpp
@@ -0,0 +1,661 @@
+#pragma once
+
+#include "../core/async.hpp"
+
+namespace tf::detail {
+
+// size threshold at or below which a partition is sorted sequentially
+template <typename I>
+constexpr size_t parallel_sort_cutoff() {
+
+  //using value_type = std::decay_t<decltype(*std::declval<I>())>;
+  using value_type = typename std::iterator_traits<I>::value_type;
+
+  constexpr size_t object_size = sizeof(value_type);
+
+  if constexpr(std::is_same_v<value_type, std::string>) {
+    return 65536 / sizeof(std::string);
+  }
+  else {
+    if constexpr(object_size < 16) return 4096;
+    else if constexpr(object_size < 32) return 2048;
+    else if constexpr(object_size < 64) return 1024;
+    else if constexpr(object_size < 128) return 768;
+    else if constexpr(object_size < 256) return 512;
+    else if constexpr(object_size < 512) return 256;
+    else return 128;
+  }
+}
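+
+// For example: sorting int (object_size == 4) uses a cutoff of 4096 elements,
+// while a 96-byte value type falls into the "< 128" bucket and uses 768;
+// partitions at or below the cutoff are handed to std::sort.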
+
+// ----------------------------------------------------------------------------
+// pattern-defeating quick sort (pdqsort)
+// https://github.com/orlp/pdqsort/
+// ----------------------------------------------------------------------------
+
+template<typename T, size_t cacheline_size=64>
+inline T* align_cacheline(T* p) {
+#if defined(UINTPTR_MAX) && __cplusplus >= 201103L
+  std::uintptr_t ip = reinterpret_cast<std::uintptr_t>(p);
+#else
+  std::size_t ip = reinterpret_cast<std::size_t>(p);
+#endif
+  ip = (ip + cacheline_size - 1) & -cacheline_size;
+  return reinterpret_cast<T*>(ip);
+}
+
+template<typename Iter>
+inline void swap_offsets(
+  Iter first, Iter last,
+  unsigned char* offsets_l, unsigned char* offsets_r,
+  size_t num, bool use_swaps
+) {
+  typedef typename std::iterator_traits<Iter>::value_type T;
+  if (use_swaps) {
+    // This case is needed for the descending distribution, where we need
+    // to have proper swapping for pdqsort to remain O(n).
+    for (size_t i = 0; i < num; ++i) {
+        std::iter_swap(first + offsets_l[i], last - offsets_r[i]);
+    }
+  } else if (num > 0) {
+    Iter l = first + offsets_l[0]; Iter r = last - offsets_r[0];
+    T tmp(std::move(*l)); *l = std::move(*r);
+    for (size_t i = 1; i < num; ++i) {
+        l = first + offsets_l[i]; *r = std::move(*l);
+        r = last - offsets_r[i]; *l = std::move(*r);
+    }
+    *r = std::move(tmp);
+  }
+}
+
+// Sorts [begin, end) using insertion sort with the given comparison function.
+template<typename RandItr, typename Compare>
+void insertion_sort(RandItr begin, RandItr end, Compare comp) {
+
+  using T = typename std::iterator_traits<RandItr>::value_type;
+
+  if (begin == end) {
+    return;
+  }
+
+  for (RandItr cur = begin + 1; cur != end; ++cur) {
+
+    RandItr shift = cur;
+    RandItr shift_1 = cur - 1;
+
+    // Compare first to avoid 2 moves for an element
+    // already positioned correctly.
+    if (comp(*shift, *shift_1)) {
+      T tmp = std::move(*shift);
+      do {
+        *shift-- = std::move(*shift_1);
+      }while (shift != begin && comp(tmp, *--shift_1));
+      *shift = std::move(tmp);
+    }
+  }
+}
+
+// Sorts [begin, end) using insertion sort with the given comparison function.
+// Assumes *(begin - 1) is an element smaller than or equal to any element
+// in [begin, end).
+template<typename RandItr, typename Compare>
+void unguarded_insertion_sort(RandItr begin, RandItr end, Compare comp) {
+
+  using T = typename std::iterator_traits<RandItr>::value_type;
+
+  if (begin == end) {
+    return;
+  }
+
+  for (RandItr cur = begin + 1; cur != end; ++cur) {
+    RandItr shift = cur;
+    RandItr shift_1 = cur - 1;
+
+    // Compare first so we can avoid 2 moves
+    // for an element already positioned correctly.
+    if (comp(*shift, *shift_1)) {
+      T tmp = std::move(*shift);
+
+      do {
+        *shift-- = std::move(*shift_1);
+      }while (comp(tmp, *--shift_1));
+
+      *shift = std::move(tmp);
+    }
+  }
+}
+
+// Attempts to use insertion sort on [begin, end). Returns false and aborts
+// sorting if more than partial_insertion_sort_limit elements were moved;
+// otherwise it sorts the range and returns true.
+template<typename RandItr, typename Compare>
+bool partial_insertion_sort(RandItr begin, RandItr end, Compare comp) {
+
+  using T = typename std::iterator_traits<RandItr>::value_type;
+  using D = typename std::iterator_traits<RandItr>::difference_type;
+
+  // When we detect an already sorted partition, attempt an insertion sort
+  // that allows this amount of element moves before giving up.
+  constexpr auto partial_insertion_sort_limit = D{8};
+
+  if (begin == end) return true;
+
+  auto limit = D{0};
+
+  for (RandItr cur = begin + 1; cur != end; ++cur) {
+
+    if (limit > partial_insertion_sort_limit) {
+      return false;
+    }
+
+    RandItr shift = cur;
+    RandItr shift_1 = cur - 1;
+
+    // Compare first so we can avoid 2 moves
+    // for an element already positioned correctly.
+    if (comp(*shift, *shift_1)) {
+      T tmp = std::move(*shift);
+
+      do {
+        *shift-- = std::move(*shift_1);
+      }while (shift != begin && comp(tmp, *--shift_1));
+
+      *shift = std::move(tmp);
+      limit += cur - shift;
+    }
+  }
+
+  return true;
+}
+
+// Partitions [begin, end) around pivot *begin using comparison function comp. Elements equal
+// to the pivot are put in the right-hand partition. Returns the position of the pivot after
+// partitioning and whether the passed sequence already was correctly partitioned. Assumes the
+// pivot is a median of at least 3 elements and that [begin, end) is at least
+// insertion_sort_threshold long. Uses branchless partitioning.
+template<typename Iter, typename Compare>
+std::pair<Iter, bool> partition_right_branchless(Iter begin, Iter end, Compare comp) {
+
+  typedef typename std::iterator_traits<Iter>::value_type T;
+
+  constexpr size_t block_size = 64;
+  constexpr size_t cacheline_size = 64;
+
+  // Move pivot into local for speed.
+  T pivot(std::move(*begin));
+  Iter first = begin;
+  Iter last = end;
+
+  // Find the first element greater than or equal to the pivot (the median of 3 guarantees
+  // this exists).
+  while (comp(*++first, pivot));
+
+  // Find the first element strictly smaller than the pivot. We have to guard this search if
+  // there was no element before *first.
+  if (first - 1 == begin) while (first < last && !comp(*--last, pivot));
+  else                    while (                !comp(*--last, pivot));
+
+  // If the first pair of elements that should be swapped to partition are the same element,
+  // the passed in sequence already was correctly partitioned.
+  bool already_partitioned = first >= last;
+  if (!already_partitioned) {
+    std::iter_swap(first, last);
+    ++first;
+
+    // The following branchless partitioning is derived from "BlockQuicksort: How Branch
+    // Mispredictions don't affect Quicksort" by Stefan Edelkamp and Armin Weiss, but
+    // heavily micro-optimized.
+    unsigned char offsets_l_storage[block_size + cacheline_size];
+    unsigned char offsets_r_storage[block_size + cacheline_size];
+    unsigned char* offsets_l = align_cacheline(offsets_l_storage);
+    unsigned char* offsets_r = align_cacheline(offsets_r_storage);
+
+    Iter offsets_l_base = first;
+    Iter offsets_r_base = last;
+    size_t num_l, num_r, start_l, start_r;
+    num_l = num_r = start_l = start_r = 0;
+
+    while (first < last) {
+      // Fill up offset blocks with elements that are on the wrong side.
+      // First we determine how many elements are considered for each offset block.
+      size_t num_unknown = last - first;
+      size_t left_split = num_l == 0 ? (num_r == 0 ? num_unknown / 2 : num_unknown) : 0;
+      size_t right_split = num_r == 0 ? (num_unknown - left_split) : 0;
+
+      // Fill the offset blocks.
+      if (left_split >= block_size) {
+        for (size_t i = 0; i < block_size;) {
+          offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first;
+          offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first;
+          offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first;
+          offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first;
+          offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first;
+          offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first;
+          offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first;
+          offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first;
+        }
+      } else {
+        for (size_t i = 0; i < left_split;) {
+          offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first;
+        }
+      }
+
+      if (right_split >= block_size) {
+        for (size_t i = 0; i < block_size;) {
+          offsets_r[num_r] = ++i; num_r += comp(*--last, pivot);
+          offsets_r[num_r] = ++i; num_r += comp(*--last, pivot);
+          offsets_r[num_r] = ++i; num_r += comp(*--last, pivot);
+          offsets_r[num_r] = ++i; num_r += comp(*--last, pivot);
+          offsets_r[num_r] = ++i; num_r += comp(*--last, pivot);
+          offsets_r[num_r] = ++i; num_r += comp(*--last, pivot);
+          offsets_r[num_r] = ++i; num_r += comp(*--last, pivot);
+          offsets_r[num_r] = ++i; num_r += comp(*--last, pivot);
+        }
+      } else {
+        for (size_t i = 0; i < right_split;) {
+          offsets_r[num_r] = ++i; num_r += comp(*--last, pivot);
+        }
+      }
+
+      // Swap elements and update block sizes and first/last boundaries.
+      size_t num = std::min(num_l, num_r);
+      swap_offsets(
+        offsets_l_base, offsets_r_base, 
+        offsets_l + start_l, offsets_r + start_r,
+        num, num_l == num_r
+      );
+      num_l -= num; num_r -= num;
+      start_l += num; start_r += num;
+
+      if (num_l == 0) {
+        start_l = 0;
+        offsets_l_base = first;
+      }
+
+      if (num_r == 0) {
+        start_r = 0;
+        offsets_r_base = last;
+      }
+    }
+
+    // We have now fully identified [first, last)'s proper position. Swap the last elements.
+    if (num_l) {
+      offsets_l += start_l;
+      while (num_l--) std::iter_swap(offsets_l_base + offsets_l[num_l], --last);
+      first = last;
+    }
+    if (num_r) {
+      offsets_r += start_r;
+      while (num_r--) std::iter_swap(offsets_r_base - offsets_r[num_r], first), ++first;
+      last = first;
+    }
+  }
+
+  // Put the pivot in the right place.
+  Iter pivot_pos = first - 1;
+  *begin = std::move(*pivot_pos);
+  *pivot_pos = std::move(pivot);
+
+  return std::make_pair(pivot_pos, already_partitioned);
+}
+
+// Partitions [begin, end) around pivot *begin using comparison function comp.
+// Elements equal to the pivot are put in the right-hand partition.
+// Returns the position of the pivot after partitioning and whether the passed
+// sequence already was correctly partitioned.
+// Assumes the pivot is a median of at least 3 elements and that [begin, end)
+// is at least insertion_sort_threshold long.
+template<typename Iter, typename Compare>
+std::pair<Iter, bool> partition_right(Iter begin, Iter end, Compare comp) {
+
+  using T = typename std::iterator_traits<Iter>::value_type;
+
+  // Move pivot into local for speed.
+  T pivot(std::move(*begin));
+
+  Iter first = begin;
+  Iter last = end;
+
+  // Find the first element greater than or equal to the pivot
+  // (the median of 3 guarantees this exists).
+  while (comp(*++first, pivot));
+
+  // Find the first element strictly smaller than the pivot.
+  // We have to guard this search if there was no element before *first.
+  if (first - 1 == begin) while (first < last && !comp(*--last, pivot));
+  else while (!comp(*--last, pivot));
+
+  // If the first pair of elements that should be swapped to partition
+  // are the same element, the passed in sequence already was correctly
+  // partitioned.
+  bool already_partitioned = first >= last;
+
+  // Keep swapping pairs of elements that are on the wrong side of the pivot.
+  // Previously swapped pairs guard the searches,
+  // which is why the first iteration is special-cased above.
+  while (first < last) {
+    std::iter_swap(first, last);
+    while (comp(*++first, pivot));
+    while (!comp(*--last, pivot));
+  }
+
+  // Put the pivot in the right place.
+  Iter pivot_pos = first - 1;
+  *begin = std::move(*pivot_pos);
+  *pivot_pos = std::move(pivot);
+
+  return std::make_pair(pivot_pos, already_partitioned);
+}
+
+// Similar function to the one above, except elements equal to the pivot
+// are put to the left of the pivot and it doesn't check or return
+// if the passed sequence already was partitioned.
+// Since this is rarely used (the many equal case),
+// and in that case pdqsort already has O(n) performance,
+// no block quicksort is applied here for simplicity.
+template<typename RandItr, typename Compare>
+RandItr partition_left(RandItr begin, RandItr end, Compare comp) {
+
+  using T = typename std::iterator_traits<RandItr>::value_type;
+
+  T pivot(std::move(*begin));
+
+  RandItr first = begin;
+  RandItr last = end;
+
+  while (comp(pivot, *--last));
+
+  if (last + 1 == end) {
+    while (first < last && !comp(pivot, *++first));
+  }
+  else {
+    while (!comp(pivot, *++first));
+  }
+
+  while (first < last) {
+    std::iter_swap(first, last);
+    while (comp(pivot, *--last));
+    while (!comp(pivot, *++first));
+  }
+
+  RandItr pivot_pos = last;
+  *begin = std::move(*pivot_pos);
+  *pivot_pos = std::move(pivot);
+
+  return pivot_pos;
+}
+
+template<typename Iter, typename Compare, bool Branchless>
+void parallel_pdqsort(
+  tf::Runtime& rt,
+  Iter begin, Iter end, Compare comp,
+  int bad_allowed, bool leftmost = true
+) {
+
+  // Partitions below this size are sorted sequentially
+  constexpr auto cutoff = parallel_sort_cutoff<Iter>();
+
+  // Partitions below this size are sorted using insertion sort
+  constexpr auto insertion_sort_threshold = 24;
+
+  // Partitions above this size use Tukey's ninther to select the pivot.
+  constexpr auto ninther_threshold = 128;
+
+  //using diff_t = typename std::iterator_traits<Iter>::difference_type;
+
+  // Use a while loop for tail recursion elimination.
+  while (true) {
+
+    //diff_t size = end - begin;
+    size_t size = end - begin;
+
+    // Insertion sort is faster for small arrays.
+    if (size < insertion_sort_threshold) {
+      if (leftmost) {
+        insertion_sort(begin, end, comp);
+      }
+      else {
+        unguarded_insertion_sort(begin, end, comp);
+      }
+      return;
+    }
+
+    if(size <= cutoff) {
+      std::sort(begin, end, comp);
+      return;
+    }
+
+    // Choose pivot as median of 3 or pseudomedian of 9.
+    //diff_t s2 = size / 2;
+    size_t s2 = size >> 1;
+    if (size > ninther_threshold) {
+      sort3(begin, begin + s2, end - 1, comp);
+      sort3(begin + 1, begin + (s2 - 1), end - 2, comp);
+      sort3(begin + 2, begin + (s2 + 1), end - 3, comp);
+      sort3(begin + (s2 - 1), begin + s2, begin + (s2 + 1), comp);
+      std::iter_swap(begin, begin + s2);
+    }
+    else {
+      sort3(begin + s2, begin, end - 1, comp);
+    }
+
+    // If *(begin - 1) is the end of the right partition
+    // of a previous partition operation, there is no element in [begin, end)
+    // that is smaller than *(begin - 1).
+    // Then if our pivot compares equal to *(begin - 1) we change strategy,
+    // putting equal elements in the left partition,
+    // greater elements in the right partition.
+    // We do not have to recurse on the left partition,
+    // since it's sorted (all equal).
+    if (!leftmost && !comp(*(begin - 1), *begin)) {
+      begin = partition_left(begin, end, comp) + 1;
+      continue;
+    }
+
+    // Partition and get results.
+    const auto pair = Branchless ? partition_right_branchless(begin, end, comp) :
+                                   partition_right(begin, end, comp);
+       
+    const auto pivot_pos = pair.first;
+    const auto already_partitioned = pair.second;
+
+    // Check for a highly unbalanced partition.
+    //diff_t l_size = pivot_pos - begin;
+    //diff_t r_size = end - (pivot_pos + 1);
+    const size_t l_size = pivot_pos - begin;
+    const size_t r_size = end - (pivot_pos + 1);
+    const bool highly_unbalanced = l_size < size / 8 || r_size < size / 8;
+
+    // If we got a highly unbalanced partition we shuffle elements
+    // to break many patterns.
+    if (highly_unbalanced) {
+      // If we had too many bad partitions, switch to heapsort
+      // to guarantee O(n log n).
+      if (--bad_allowed == 0) {
+        std::make_heap(begin, end, comp);
+        std::sort_heap(begin, end, comp);
+        return;
+      }
+
+      if (l_size >= insertion_sort_threshold) {
+        std::iter_swap(begin, begin + l_size / 4);
+        std::iter_swap(pivot_pos - 1, pivot_pos - l_size / 4);
+        if (l_size > ninther_threshold) {
+          std::iter_swap(begin + 1, begin + (l_size / 4 + 1));
+          std::iter_swap(begin + 2, begin + (l_size / 4 + 2));
+          std::iter_swap(pivot_pos - 2, pivot_pos - (l_size / 4 + 1));
+          std::iter_swap(pivot_pos - 3, pivot_pos - (l_size / 4 + 2));
+        }
+      }
+
+      if (r_size >= insertion_sort_threshold) {
+        std::iter_swap(pivot_pos + 1, pivot_pos + (1 + r_size / 4));
+        std::iter_swap(end - 1,                   end - r_size / 4);
+        if (r_size > ninther_threshold) {
+          std::iter_swap(pivot_pos + 2, pivot_pos + (2 + r_size / 4));
+          std::iter_swap(pivot_pos + 3, pivot_pos + (3 + r_size / 4));
+          std::iter_swap(end - 2,             end - (1 + r_size / 4));
+          std::iter_swap(end - 3,             end - (2 + r_size / 4));
+        }
+      }
+    }
+    // decently balanced
+    else {
+      // if the range was already partitioned, try to finish both halves with insertion sort
+      if (already_partitioned &&
+          partial_insertion_sort(begin, pivot_pos, comp) &&
+          partial_insertion_sort(pivot_pos + 1, end, comp)
+      ) {
+        return;
+      }
+    }
+
+    // Sort the left partition asynchronously (recursive call) and perform
+    // tail-recursion elimination on the right-hand partition.
+    rt.silent_async(
+      [&rt, begin, pivot_pos, comp, bad_allowed, leftmost] () mutable {
+        parallel_pdqsort<Iter, Compare, Branchless>(
+          rt, begin, pivot_pos, comp, bad_allowed, leftmost
+        );
+      }
+    );
+    begin = pivot_pos + 1;
+    leftmost = false;
+  }
+}
+
+// ----------------------------------------------------------------------------
+// 3-way quick sort
+// ----------------------------------------------------------------------------
+
+// 3-way quick sort
+template <typename RandItr, typename C>
+void parallel_3wqsort(tf::Runtime& rt, RandItr first, RandItr last, C compare) {
+
+  using namespace std::string_literals;
+
+  constexpr auto cutoff = parallel_sort_cutoff<RandItr>();
+
+  sort_partition:
+
+  if(static_cast<size_t>(last - first) < cutoff) {
+    std::sort(first, last+1, compare);
+    return;
+  }
+
+  auto m = pseudo_median_of_nine(first, last, compare);
+
+  if(m != first) {
+    std::iter_swap(first, m);
+  }
+
+  auto l = first;
+  auto r = last;
+  auto f = std::next(first, 1);
+  bool is_swapped_l = false;
+  bool is_swapped_r = false;
+
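+  // Loop invariant of the three-way partition below (with the pivot at *l):
+  //   [first, l) : elements smaller than the pivot
+  //   [l, f)     : elements equal to the pivot
+  //   [f, r]     : elements not yet examined
+  //   (r, last]  : elements greater than the pivot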
+  while(f <= r) {
+    if(compare(*f, *l)) {
+      is_swapped_l = true;
+      std::iter_swap(l, f);
+      l++;
+      f++;
+    }
+    else if(compare(*l, *f)) {
+      is_swapped_r = true;
+      std::iter_swap(r, f);
+      r--;
+    }
+    else {
+      f++;
+    }
+  }
+
+  if(l - first > 1 && is_swapped_l) {
+    //rt.emplace([&](tf::Runtime& rtl) mutable {
+    //  parallel_3wqsort(rtl, first, l-1, compare);
+    //});
+    rt.silent_async([&rt, first, l, &compare] () mutable {
+      parallel_3wqsort(rt, first, l-1, compare);
+    });
+  }
+
+  if(last - r > 1 && is_swapped_r) {
+    //rt.emplace([&](tf::Runtime& rtr) mutable {
+    //  parallel_3wqsort(rtr, r+1, last, compare);
+    //});
+    //rt.silent_async([&rt, r, last, &compare] () mutable {
+    //  parallel_3wqsort(rt, r+1, last, compare);
+    //});
+    first = r+1;
+    goto sort_partition;
+  }
+
+  //rt.join();
+}
+
+}  // end of namespace tf::detail ---------------------------------------------
+
+namespace tf { 
+
+// Function: make_sort_task
+template <typename B, typename E, typename C>
+TF_FORCE_INLINE auto make_sort_task(B b, E e, C cmp) {
+  
+  return [b, e, cmp] (Runtime& rt) mutable {
+
+    using B_t = std::decay_t<unwrap_ref_decay_t<B>>;
+    using E_t = std::decay_t<unwrap_ref_decay_t<E>>;
+
+    // fetch the iterator values
+    B_t beg = b;
+    E_t end = e;
+
+    if(beg == end) {
+      return;
+    }
+
+    size_t W = rt.executor().num_workers();
+    size_t N = std::distance(beg, end);
+
+    // only myself - no need to spawn another graph
+    if(W <= 1 || N <= detail::parallel_sort_cutoff<B_t>()) {
+      std::sort(beg, end, cmp);
+      return;
+    }
+
+    //parallel_3wqsort(rt, beg, end-1, cmp);
+    detail::parallel_pdqsort<B_t, C,
+      is_std_compare_v<std::decay_t<C>> &&
+      std::is_arithmetic_v<typename std::iterator_traits<B_t>::value_type>
+    >(rt, beg, end, cmp, log2(end - beg));
+
+    rt.corun_all();
+  };
+}
+  
+template <typename B, typename E>
+TF_FORCE_INLINE auto make_sort_task(B beg, E end) {
+  using value_type = std::decay_t<decltype(*std::declval<B>())>;
+  return make_sort_task(beg, end, std::less<value_type>{});
+}
+
+// ----------------------------------------------------------------------------
+// tf::Taskflow::sort
+// ----------------------------------------------------------------------------
+
+// Function: sort
+template <typename B, typename E, typename C>
+Task FlowBuilder::sort(B beg, E end, C cmp) {
+  return emplace(make_sort_task(beg, end, cmp));
+}
+
+// Function: sort
+template <typename B, typename E>
+Task FlowBuilder::sort(B beg, E end) {
+  return emplace(make_sort_task(beg, end));
+}
+
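+// A minimal usage sketch of the two overloads above (illustrative comment only,
+// not part of the header). It assumes a populated std::vector<int> named data:
+//
+//   tf::Executor executor;
+//   tf::Taskflow taskflow;
+//   taskflow.sort(data.begin(), data.end());                        // ascending
+//   // or: taskflow.sort(data.begin(), data.end(), std::greater<int>{});
+//   executor.run(taskflow).wait();
+//
+// The iterators are captured by value into the task, so the underlying range
+// must stay valid until the taskflow finishes running.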
+}  // namespace tf ------------------------------------------------------------
+
diff --git a/myxpcs/include/taskflow_/algorithm/transform.hpp b/myxpcs/include/taskflow_/algorithm/transform.hpp
new file mode 100644
index 0000000..37157b3
--- /dev/null
+++ b/myxpcs/include/taskflow_/algorithm/transform.hpp
@@ -0,0 +1,199 @@
+#pragma once
+
+#include "launch.hpp"
+
+namespace tf {
+
+// Function: make_transform_task
+template <
+  typename B, typename E, typename O, typename C, typename P = GuidedPartitioner,
+  std::enable_if_t<is_partitioner_v<std::decay_t<P>>, void>* = nullptr
+>
+TF_FORCE_INLINE auto make_transform_task(
+  B first1, E last1, O d_first, C c, P&& part = P()
+) {
+
+  using namespace std::string_literals;
+
+  using B_t = std::decay_t<unwrap_ref_decay_t<B>>;
+  using E_t = std::decay_t<unwrap_ref_decay_t<E>>;
+  using O_t = std::decay_t<unwrap_ref_decay_t<O>>;
+  
+  return
+  [first1, last1, d_first, c, part=std::forward<P>(part)] 
+  (Runtime& rt) mutable {
+
+    // fetch the stateful values
+    B_t beg   = first1;
+    E_t end   = last1;
+    O_t d_beg = d_first;
+
+    size_t W = rt.executor().num_workers();
+    size_t N = std::distance(beg, end);
+
+    // only myself - no need to spawn another graph
+    if(W <= 1 || N <= part.chunk_size()) {
+      std::transform(beg, end, d_beg, c);
+      return;
+    }
+
+    if(N < W) {
+      W = N;
+    }
+
+    // static partitioner
+    if constexpr(std::is_same_v<std::decay_t<P>, StaticPartitioner>) {
+      size_t chunk_size;
+      for(size_t w=0, curr_b=0; w<W && curr_b < N; ++w, curr_b += chunk_size) {
+        chunk_size = part.adjusted_chunk_size(N, W, w);
+        launch_loop(W, w, rt, [=, &part] () mutable {
+          part.loop(N, W, curr_b, chunk_size,
+            [&, prev_e=size_t{0}](size_t part_b, size_t part_e) mutable {
+              std::advance(beg, part_b - prev_e);
+              std::advance(d_beg, part_b - prev_e);
+              for(size_t x = part_b; x<part_e; x++) {
+                *d_beg++ = c(*beg++);
+              }
+              prev_e = part_e;
+            }
+          ); 
+        });
+      }
+      rt.corun_all();
+    }
+    // dynamic partitioner
+    else {
+      std::atomic<size_t> next(0);
+      
+      launch_loop(N, W, rt, next, part, [=, &next, &part] () mutable {
+        part.loop(N, W, next, 
+          [&, prev_e=size_t{0}](size_t part_b, size_t part_e) mutable {
+            std::advance(beg, part_b - prev_e);
+            std::advance(d_beg, part_b - prev_e);
+            for(size_t x = part_b; x<part_e; x++) {
+              *d_beg++ = c(*beg++);
+            }
+            prev_e = part_e;
+          }
+        ); 
+      });
+    }
+  };
+}
+
+// Function: make_transform_task
+template <
+  typename B1, typename E1, typename B2, typename O, typename C, typename P = GuidedPartitioner,
+  std::enable_if_t<!is_partitioner_v<std::decay_t<C>>, void>* = nullptr
+>
+TF_FORCE_INLINE auto make_transform_task(
+  B1 first1, E1 last1, B2 first2, O d_first, C c, P&& part = P()
+) {
+
+  using namespace std::string_literals;
+
+  using B1_t = std::decay_t<unwrap_ref_decay_t<B1>>;
+  using E1_t = std::decay_t<unwrap_ref_decay_t<E1>>;
+  using B2_t = std::decay_t<unwrap_ref_decay_t<B2>>;
+  using O_t = std::decay_t<unwrap_ref_decay_t<O>>;
+
+  return
+  [first1, last1, first2, d_first, c, part=std::forward<P>(part)] 
+  (Runtime& rt) mutable {
+
+    // fetch the stateful values
+    B1_t beg1 = first1;
+    E1_t end1 = last1;
+    B2_t beg2 = first2;
+    O_t d_beg = d_first;
+
+    size_t W = rt.executor().num_workers();
+    size_t N = std::distance(beg1, end1);
+
+    // only myself - no need to spawn another graph
+    if(W <= 1 || N <= part.chunk_size()) {
+      std::transform(beg1, end1, beg2, d_beg, c);
+      return;
+    }
+
+    if(N < W) {
+      W = N;
+    }
+
+    // static partitioner
+    if constexpr(std::is_same_v<std::decay_t<P>, StaticPartitioner>) {
+      size_t chunk_size;
+      for(size_t w=0, curr_b=0; w<W && curr_b < N; ++w, curr_b += chunk_size) {
+        chunk_size = part.adjusted_chunk_size(N, W, w);
+        launch_loop(W, w, rt, [=, &c, &part] () mutable {
+          part.loop(N, W, curr_b, chunk_size,
+            [&, prev_e=size_t{0}](size_t part_b, size_t part_e) mutable {
+              std::advance(beg1, part_b - prev_e);
+              std::advance(beg2, part_b - prev_e);
+              std::advance(d_beg, part_b - prev_e);
+              for(size_t x = part_b; x<part_e; x++) {
+                *d_beg++ = c(*beg1++, *beg2++);
+              }
+              prev_e = part_e;
+            }
+          ); 
+        });
+      }
+      rt.corun_all();
+    }
+    // dynamic partitioner
+    else {
+      std::atomic<size_t> next(0);
+      launch_loop(N, W, rt, next, part, [=, &c, &next, &part] () mutable {
+        part.loop(N, W, next, 
+          [&, prev_e=size_t{0}](size_t part_b, size_t part_e) mutable {
+            std::advance(beg1, part_b - prev_e);
+            std::advance(beg2, part_b - prev_e);
+            std::advance(d_beg, part_b - prev_e);
+            for(size_t x = part_b; x<part_e; x++) {
+              *d_beg++ = c(*beg1++, *beg2++);
+            }
+            prev_e = part_e;
+          }
+        ); 
+      });
+    }
+  };
+}
+
+// ----------------------------------------------------------------------------
+// transform
+// ----------------------------------------------------------------------------
+
+// Function: transform
+template <typename B, typename E, typename O, typename C, typename P,
+  std::enable_if_t<is_partitioner_v<std::decay_t<P>>, void>*
+>
+Task FlowBuilder::transform(B first1, E last1, O d_first, C c, P&& part) {
+  return emplace(
+    make_transform_task(first1, last1, d_first, c, std::forward<P>(part))
+  );
+}
+
+// ----------------------------------------------------------------------------
+// transform2
+// ----------------------------------------------------------------------------
+  
+// Function: transform
+template <
+  typename B1, typename E1, typename B2, typename O, typename C, typename P,
+  std::enable_if_t<!is_partitioner_v<std::decay_t<C>>, void>*
+>
+Task FlowBuilder::transform(
+  B1 first1, E1 last1, B2 first2, O d_first, C c, P&& part
+) {
+  return emplace(make_transform_task(
+    first1, last1, first2, d_first, c, std::forward<P>(part)
+  ));
+}
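+// A minimal usage sketch of the two overloads above (illustrative comment only,
+// not part of the header). It assumes std::vector<int> src and dst of equal
+// size and that the declarations in flow_builder.hpp default the partitioner
+// (as make_transform_task does with GuidedPartitioner):
+//
+//   tf::Executor executor;
+//   tf::Taskflow taskflow;
+//   taskflow.transform(src.begin(), src.end(), dst.begin(),
+//                      [](int x){ return x + 1; });        // dst[i] = src[i] + 1
+//   executor.run(taskflow).wait();
+//
+// A partitioner object (e.g., tf::StaticPartitioner{}) may be passed as the
+// last argument to control how iterations are chunked across workers.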
+
+
+}  // end of namespace tf -----------------------------------------------------
+
+
+
diff --git a/myxpcs/include/taskflow_/core/async.hpp b/myxpcs/include/taskflow_/core/async.hpp
new file mode 100644
index 0000000..e55082c
--- /dev/null
+++ b/myxpcs/include/taskflow_/core/async.hpp
@@ -0,0 +1,330 @@
+#pragma once
+
+#include "executor.hpp"
+
+// https://hackmd.io/@sysprog/concurrency-atomics
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// Async
+// ----------------------------------------------------------------------------
+
+// Function: async
+template <typename F>
+auto Executor::async(const std::string& name, F&& f) {
+
+  _increment_topology();
+
+  using R = std::invoke_result_t<std::decay_t<F>>;
+
+  std::packaged_task<R()> p(std::forward<F>(f));
+  auto fu{p.get_future()};
+
+  auto node = node_pool.animate(
+    name, 0, nullptr, nullptr, 0, std::in_place_type_t<Node::Async>{}, 
+    [p=make_moc(std::move(p))]() mutable { p.object(); }
+  );
+
+  _schedule_async_task(node);
+
+  return fu;
+}
+
+// Function: async
+template <typename F>
+auto Executor::async(F&& f) {
+  return async("", std::forward<F>(f));
+}
+
+// ----------------------------------------------------------------------------
+// Silent Async
+// ----------------------------------------------------------------------------
+
+// Function: silent_async
+template <typename F>
+void Executor::silent_async(const std::string& name, F&& f) {
+
+  _increment_topology();
+
+  auto node = node_pool.animate(
+    name, 0, nullptr, nullptr, 0, std::in_place_type_t<Node::Async>{}, 
+    std::forward<F>(f)
+  );
+
+  _schedule_async_task(node);
+}
+
+// Function: silent_async
+template <typename F>
+void Executor::silent_async(F&& f) {
+  silent_async("", std::forward<F>(f));
+}
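+// A minimal usage sketch of the interfaces above (illustrative comment only,
+// not part of the header):
+//
+//   tf::Executor executor;
+//   std::future<int> fu = executor.async("sum", [](){ return 1 + 1; });
+//   executor.silent_async([](){ /* fire-and-forget work */ });
+//   assert(fu.get() == 2);     // blocks until the named async task finishes
+//   executor.wait_for_all();   // waits for the silent task as well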
+
+// ----------------------------------------------------------------------------
+// Async Helper Methods
+// ----------------------------------------------------------------------------
+
+// Procedure: _schedule_async_task
+inline void Executor::_schedule_async_task(Node* node) {  
+  if(auto w = _this_worker(); w) {
+    _schedule(*w, node);
+  }
+  else{
+    _schedule(node);
+  }
+}
+
+// Procedure: _tear_down_async
+inline void Executor::_tear_down_async(Node* node) {
+  // from runtime
+  if(node->_parent) {
+    node->_parent->_join_counter.fetch_sub(1, std::memory_order_release);
+  }
+  // from executor
+  else {
+    _decrement_topology();
+  }
+  node_pool.recycle(node);
+}
+
+// ----------------------------------------------------------------------------
+// Silent Dependent Async
+// ----------------------------------------------------------------------------
+
+// Function: silent_dependent_async
+template <typename F, typename... Tasks,
+  std::enable_if_t<all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>*
+>
+tf::AsyncTask Executor::silent_dependent_async(F&& func, Tasks&&... tasks) {
+  return silent_dependent_async("", std::forward<F>(func), std::forward<Tasks>(tasks)...);
+}
+
+// Function: silent_dependent_async
+template <typename F, typename... Tasks,
+  std::enable_if_t<all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>*
+>
+tf::AsyncTask Executor::silent_dependent_async(
+  const std::string& name, F&& func, Tasks&&... tasks 
+){
+
+  _increment_topology();
+
+  size_t num_dependents = sizeof...(Tasks);
+  
+  // create the task first so it retains shared ownership before the node is scheduled
+  AsyncTask task(node_pool.animate(
+    name, 0, nullptr, nullptr, num_dependents,
+    std::in_place_type_t<Node::DependentAsync>{}, std::forward<F>(func)
+  ));
+  
+  if constexpr(sizeof...(Tasks) > 0) {
+    (_process_async_dependent(task._node, tasks, num_dependents), ...);
+  }
+
+  if(num_dependents == 0) {
+    _schedule_async_task(task._node);
+  }
+
+  return task;
+}
+
+// Function: silent_dependent_async
+template <typename F, typename I,
+  std::enable_if_t<!std::is_same_v<std::decay_t<I>, AsyncTask>, void>*
+>
+tf::AsyncTask Executor::silent_dependent_async(F&& func, I first, I last) {
+  return silent_dependent_async("", std::forward<F>(func), first, last);
+}
+
+// Function: silent_dependent_async
+template <typename F, typename I,
+  std::enable_if_t<!std::is_same_v<std::decay_t<I>, AsyncTask>, void>*
+>
+tf::AsyncTask Executor::silent_dependent_async(
+  const std::string& name, F&& func, I first, I last
+) {
+
+  _increment_topology();
+
+  size_t num_dependents = std::distance(first, last);
+  
+  AsyncTask task(node_pool.animate(
+    name, 0, nullptr, nullptr, num_dependents,
+    std::in_place_type_t<Node::DependentAsync>{}, std::forward<F>(func)
+  ));
+  
+  for(; first != last; first++){
+    _process_async_dependent(task._node, *first, num_dependents);
+  }
+
+  if(num_dependents == 0) {
+    _schedule_async_task(task._node);
+  }
+
+  return task;
+}
+
+// ----------------------------------------------------------------------------
+// Dependent Async
+// ----------------------------------------------------------------------------
+
+// Function: dependent_async
+template <typename F, typename... Tasks,
+  std::enable_if_t<all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>*
+>
+auto Executor::dependent_async(F&& func, Tasks&&... tasks) {
+  return dependent_async("", std::forward<F>(func), std::forward<Tasks>(tasks)...);
+}
+
+// Function: dependent_async
+template <typename F, typename... Tasks,
+  std::enable_if_t<all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>*
+>
+auto Executor::dependent_async(
+  const std::string& name, F&& func, Tasks&&... tasks 
+) {
+  
+  _increment_topology();
+  
+  using R = std::invoke_result_t<std::decay_t<F>>;
+
+  std::packaged_task<R()> p(std::forward<F>(func));
+  auto fu{p.get_future()};
+
+  size_t num_dependents = sizeof...(tasks);
+
+  AsyncTask task(node_pool.animate(
+    name, 0, nullptr, nullptr, num_dependents,
+    std::in_place_type_t<Node::DependentAsync>{},
+    [p=make_moc(std::move(p))] () mutable { p.object(); }
+  ));
+  
+  if constexpr(sizeof...(Tasks) > 0) {
+    (_process_async_dependent(task._node, tasks, num_dependents), ...);
+  }
+
+  if(num_dependents == 0) {
+    _schedule_async_task(task._node);
+  }
+
+  return std::make_pair(std::move(task), std::move(fu));
+}
+
+// Function: dependent_async
+template <typename F, typename I,
+  std::enable_if_t<!std::is_same_v<std::decay_t<I>, AsyncTask>, void>*
+>
+auto Executor::dependent_async(F&& func, I first, I last) {
+  return dependent_async("", std::forward<F>(func), first, last);
+}
+
+// Function: dependent_async
+template <typename F, typename I,
+  std::enable_if_t<!std::is_same_v<std::decay_t<I>, AsyncTask>, void>*
+>
+auto Executor::dependent_async(
+  const std::string& name, F&& func, I first, I last
+) {
+  
+  _increment_topology();
+  
+  using R = std::invoke_result_t<std::decay_t<F>>;
+
+  std::packaged_task<R()> p(std::forward<F>(func));
+  auto fu{p.get_future()};
+
+  size_t num_dependents = std::distance(first, last);
+
+  AsyncTask task(node_pool.animate(
+    name, 0, nullptr, nullptr, num_dependents,
+    std::in_place_type_t<Node::DependentAsync>{},
+    [p=make_moc(std::move(p))] () mutable { p.object(); }
+  ));
+
+  for(; first != last; first++) {
+    _process_async_dependent(task._node, *first, num_dependents);
+  }
+
+  if(num_dependents == 0) {
+    _schedule_async_task(task._node);
+  }
+
+  return std::make_pair(std::move(task), std::move(fu));
+}
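+// A minimal usage sketch of the dependent-async interfaces above (illustrative
+// comment only, not part of the header): task B runs only after task A.
+//
+//   tf::Executor executor;
+//   tf::AsyncTask A = executor.silent_dependent_async([](){ /* step A */ });
+//   auto [B, fuB]   = executor.dependent_async([](){ return 42; }, A);
+//   assert(fuB.get() == 42);   // resolves only after both A and B have run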
+
+// ----------------------------------------------------------------------------
+// Dependent Async Helper Functions
+// ----------------------------------------------------------------------------
+
+// Procedure: _process_async_dependent
+inline void Executor::_process_async_dependent(
+  Node* node, tf::AsyncTask& task, size_t& num_dependents
+) {
+
+  auto& state = std::get_if<Node::DependentAsync>(&(task._node->_handle))->state;
+
+  add_successor:
+
+  auto target = Node::AsyncState::UNFINISHED;
+  
+  // acquires the lock
+  if(state.compare_exchange_weak(target, Node::AsyncState::LOCKED,
+                                 std::memory_order_acq_rel,
+                                 std::memory_order_acquire)) {
+    task._node->_successors.push_back(node);
+    state.store(Node::AsyncState::UNFINISHED, std::memory_order_release);
+  }
+  // the dependent's state is FINISHED, i.e., it has already run its callable,
+  // so decrement this node's join counter by one
+  else if (target == Node::AsyncState::FINISHED) {
+    num_dependents = node->_join_counter.fetch_sub(1, std::memory_order_acq_rel) - 1;
+  }
+  // another worker is concurrently adding a successor to the same dependent; retry
+  else {
+    goto add_successor;
+  }
+}
+
+
+// Procedure: _tear_down_dependent_async
+inline void Executor::_tear_down_dependent_async(Worker& worker, Node* node) {
+
+  auto handle = std::get_if<Node::DependentAsync>(&(node->_handle));
+
+  // a dependent-async task always comes from the executor;
+  // mark it FINISHED, spinning while another worker holds the LOCKED state
+  auto target = Node::AsyncState::UNFINISHED;
+
+  while(!handle->state.compare_exchange_weak(target, Node::AsyncState::FINISHED,
+                                             std::memory_order_acq_rel,
+                                             std::memory_order_relaxed)) {
+    target = Node::AsyncState::UNFINISHED;
+  }
+  
+  // spawn successors whose dependencies have been resolved
+  worker._cache = nullptr;
+  for(size_t i=0; i<node->_successors.size(); ++i) {
+    if(auto s = node->_successors[i]; 
+      s->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1
+    ) {
+      if(worker._cache) {
+        _schedule(worker, worker._cache);
+      }
+      worker._cache = s;
+    }
+  }
+  
+  // now the executor no longer needs to retain ownership
+  if(handle->use_count.fetch_sub(1, std::memory_order_acq_rel) == 1) {
+    node_pool.recycle(node);
+  }
+
+  _decrement_topology();
+}
+
+
+
+
+
+}  // end of namespace tf -----------------------------------------------------
+
diff --git a/myxpcs/include/taskflow_/core/async_task.hpp b/myxpcs/include/taskflow_/core/async_task.hpp
new file mode 100644
index 0000000..026e8cb
--- /dev/null
+++ b/myxpcs/include/taskflow_/core/async_task.hpp
@@ -0,0 +1,209 @@
+#pragma once
+
+#include "graph.hpp"
+
+/**
+@file async_task.hpp
+@brief asynchronous task include file
+*/
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// AsyncTask
+// ----------------------------------------------------------------------------
+
+/**
+@brief class to create a dependent asynchronous task
+
+A tf::AsyncTask is a lightweight handle that retains @em shared ownership
+of a dependent async task created by an executor.
+This shared ownership ensures that the async task remains alive when
+adding it to the dependency list of another async task, 
+thus avoiding the classical [ABA problem](https://en.wikipedia.org/wiki/ABA_problem).
+
+@code{.cpp}
+// main thread retains shared ownership of async task A
+tf::AsyncTask A = executor.silent_dependent_async([](){});
+
+// task A remains alive (i.e., at least one ref count by the main thread) 
+// when being added to the dependency list of async task B
+tf::AsyncTask B = executor.silent_dependent_async([](){}, A);
+@endcode
+
+Currently, tf::AsyncTask is implemented with reference counting, following the
+logic of the C++ smart pointer std::shared_ptr, and
+is cheap to copy or move as long as only a few objects share ownership of it.
+When a worker completes an async task, it will remove the task from the executor,
+decrementing the number of shared owners by one.
+If that counter reaches zero, the task is destroyed.
+*/
+class AsyncTask {
+  
+  friend class Executor;
+  
+  public:
+    
+    /**
+    @brief constructs an empty task handle
+    */
+    AsyncTask() = default;
+    
+    /**
+    @brief destroys the managed asynchronous task if this is the last owner
+    */
+    ~AsyncTask();
+    
+    /**
+    @brief constructs an asynchronous task that shares ownership of @c rhs
+    */
+    AsyncTask(const AsyncTask& rhs);
+
+    /**
+    @brief move-constructs an asynchronous task from @c rhs
+    */
+    AsyncTask(AsyncTask&& rhs);
+    
+    /**
+    @brief copy-assigns the asynchronous task from @c rhs
+
+    Releases the managed object of @c this and retains a new shared ownership
+    of @c rhs.
+    */
+    AsyncTask& operator = (const AsyncTask& rhs);
+
+    /**
+    @brief move-assigns the asynchronous task from @c rhs
+    
+    Releases the managed object of @c this and takes over the ownership of @c rhs.
+    */
+    AsyncTask& operator = (AsyncTask&& rhs);
+    
+    /**
+    @brief checks if the asynchronous task stores nothing
+    */
+    bool empty() const;
+
+    /**
+    @brief releases the managed object of @c this
+    */
+    void reset();
+    
+    /**
+    @brief obtains a hash value of this asynchronous task
+    */
+    size_t hash_value() const;
+
+    /**
+    @brief returns the number of shared owners that are currently managing 
+           this asynchronous task
+    */
+    size_t use_count() const;
+
+    /**                                                                                                       
+    @brief returns the boolean indicating whether the async task is done
+    */
+    bool is_done() const; 
+
+  private:
+
+    explicit AsyncTask(Node*);
+
+    Node* _node {nullptr};
+
+    void _incref();
+    void _decref();
+};
+
+// Constructor
+inline AsyncTask::AsyncTask(Node* ptr) : _node{ptr} {
+  _incref();
+}
+
+// Function: _incref
+inline void AsyncTask::_incref() {
+  if(_node) {
+    std::get_if<Node::DependentAsync>(&(_node->_handle))->use_count.fetch_add(
+      1, std::memory_order_relaxed
+    );
+  }
+}
+
+// Function: _decref
+inline void AsyncTask::_decref() {
+  if(_node && std::get_if<Node::DependentAsync>(&(_node->_handle))->use_count.fetch_sub(
+      1, std::memory_order_acq_rel
+    ) == 1) {
+    node_pool.recycle(_node);
+  }
+}
+
+// Copy Constructor
+inline AsyncTask::AsyncTask(const AsyncTask& rhs) : 
+  _node{rhs._node} {
+  _incref();
+}
+
+// Move Constructor
+inline AsyncTask::AsyncTask(AsyncTask&& rhs) :
+  _node {rhs._node} {
+  rhs._node = nullptr;
+}
+
+// Destructor
+inline AsyncTask::~AsyncTask() {
+  _decref();
+}
+
+// Copy assignment
+inline AsyncTask& AsyncTask::operator = (const AsyncTask& rhs) {
+  _decref();
+  _node = rhs._node;
+  _incref();
+  return *this;
+}
+
+// Move assignment
+inline AsyncTask& AsyncTask::operator = (AsyncTask&& rhs) {
+  _decref();
+  _node = rhs._node;
+  rhs._node = nullptr;
+  return *this;
+}
+
+// Function: empty
+inline bool AsyncTask::empty() const {
+  return _node == nullptr;
+}
+
+// Function: reset
+inline void AsyncTask::reset() {
+  _decref();
+  _node = nullptr;
+}
+
+// Function: hash_value
+inline size_t AsyncTask::hash_value() const {
+  return std::hash<Node*>{}(_node);
+}
+
+// Function: use_count
+inline size_t AsyncTask::use_count() const {
+  return _node == nullptr ? size_t{0} : 
+  std::get_if<Node::DependentAsync>(&(_node->_handle))->use_count.load(
+    std::memory_order_relaxed
+  );
+}
+
+// Function: is_done
+inline bool AsyncTask::is_done() const {
+  return std::get_if<Node::DependentAsync>(&(_node->_handle))->state.load(
+    std::memory_order_acquire
+  ) == Node::AsyncState::FINISHED;
+}
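+// A minimal sketch of the query interface above (illustrative comment only,
+// not part of the header):
+//
+//   tf::Executor executor;
+//   tf::AsyncTask A = executor.silent_dependent_async([](){});
+//   assert(A.use_count() >= 1);  // this handle retains shared ownership
+//   executor.wait_for_all();
+//   assert(A.is_done());         // the task has finished running its callable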
+
+}  // end of namespace tf ----------------------------------------------------
+
+
+
diff --git a/myxpcs/include/taskflow_/core/declarations.hpp b/myxpcs/include/taskflow_/core/declarations.hpp
new file mode 100644
index 0000000..dd89ab3
--- /dev/null
+++ b/myxpcs/include/taskflow_/core/declarations.hpp
@@ -0,0 +1,60 @@
+#pragma once
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// taskflow
+// ----------------------------------------------------------------------------
+class AsyncTopology;
+class Node;
+class Graph;
+class FlowBuilder;
+class Semaphore;
+class Subflow;
+class Runtime;
+class Task;
+class TaskView;
+class Taskflow;
+class Topology;
+class TopologyBase;
+class Executor;
+class Worker;
+class WorkerView;
+class ObserverInterface;
+class ChromeTracingObserver;
+class TFProfObserver;
+class TFProfManager;
+
+template <typename T>
+class Future;
+
+template <typename...Fs>
+class Pipeline;
+
+// ----------------------------------------------------------------------------
+// cudaFlow
+// ----------------------------------------------------------------------------
+class cudaFlowNode;
+class cudaFlowGraph;
+class cudaTask;
+class cudaFlow;
+class cudaFlowCapturer;
+class cudaFlowOptimizerBase;
+class cudaFlowLinearOptimizer;
+class cudaFlowSequentialOptimizer;
+class cudaFlowRoundRobinOptimizer;
+
+// ----------------------------------------------------------------------------
+// syclFlow
+// ----------------------------------------------------------------------------
+class syclNode;
+class syclGraph;
+class syclTask;
+class syclFlow;
+
+
+}  // end of namespace tf -----------------------------------------------------
+
+
+
+
diff --git a/myxpcs/include/taskflow_/core/environment.hpp b/myxpcs/include/taskflow_/core/environment.hpp
new file mode 100644
index 0000000..f9013b6
--- /dev/null
+++ b/myxpcs/include/taskflow_/core/environment.hpp
@@ -0,0 +1,8 @@
+#pragma once
+
+#define TF_ENABLE_PROFILER "TF_ENABLE_PROFILER"
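+// Note (illustrative comment only): the executor constructor checks
+// has_env(TF_ENABLE_PROFILER) and, when the variable is set, installs a
+// TFProfObserver managed by TFProfManager. Assuming the usual Taskflow
+// convention, the variable names the output file for the profile, e.g.
+//
+//   TF_ENABLE_PROFILER=profile.json ./my_program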
+
+namespace tf {
+
+}  // end of namespace tf -----------------------------------------------------
+
diff --git a/myxpcs/include/taskflow_/core/error.hpp b/myxpcs/include/taskflow_/core/error.hpp
new file mode 100644
index 0000000..6a68bea
--- /dev/null
+++ b/myxpcs/include/taskflow_/core/error.hpp
@@ -0,0 +1,26 @@
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include <exception>
+
+#include "../utility/stream.hpp"
+
+namespace tf {
+
+// Procedure: throw_re
+// Throws a std::runtime_error with the given source location and message arguments.
+template <typename... ArgsT>
+//void throw_se(const char* fname, const size_t line, Error::Code c, ArgsT&&... args) {
+void throw_re(const char* fname, const size_t line, ArgsT&&... args) {
+  std::ostringstream oss;
+  oss << "[" << fname << ":" << line << "] ";
+  //ostreamize(oss, std::forward<ArgsT>(args)...);
+  (oss << ... << args);
+  throw std::runtime_error(oss.str());
+}
+
+}  // ------------------------------------------------------------------------
+
+#define TF_THROW(...) tf::throw_re(__FILE__, __LINE__, __VA_ARGS__);
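+// A minimal usage sketch (illustrative comment only): TF_THROW forwards
+// __FILE__ and __LINE__ to tf::throw_re, which streams all arguments into the
+// message of a std::runtime_error, e.g.
+//
+//   if(num_workers == 0) {
+//     TF_THROW("invalid number of workers: ", num_workers);
+//   }
+//   // throws std::runtime_error with a message like
+//   // "[executor.hpp:123] invalid number of workers: 0"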
+
diff --git a/myxpcs/include/taskflow_/core/executor-module-opt.hpp b/myxpcs/include/taskflow_/core/executor-module-opt.hpp
new file mode 100644
index 0000000..0e2b1ee
--- /dev/null
+++ b/myxpcs/include/taskflow_/core/executor-module-opt.hpp
@@ -0,0 +1,2025 @@
+#pragma once
+
+#include "observer.hpp"
+#include "taskflow.hpp"
+
+/**
+@file executor.hpp
+@brief executor include file
+*/
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// Executor Definition
+// ----------------------------------------------------------------------------
+
+/** @class Executor
+
+@brief class to create an executor for running a taskflow graph
+
+An executor manages a set of worker threads to run one or multiple taskflows
+using an efficient work-stealing scheduling algorithm.
+
+@code{.cpp}
+// Declare an executor and a taskflow
+tf::Executor executor;
+tf::Taskflow taskflow;
+
+// Add three tasks into the taskflow
+tf::Task A = taskflow.emplace([] () { std::cout << "This is TaskA\n"; });
+tf::Task B = taskflow.emplace([] () { std::cout << "This is TaskB\n"; });
+tf::Task C = taskflow.emplace([] () { std::cout << "This is TaskC\n"; });
+
+// Build precedence between tasks
+A.precede(B, C);
+
+tf::Future<void> fu = executor.run(taskflow);
+fu.wait();                // block until the execution completes
+
+executor.run(taskflow, [](){ std::cout << "end of 1 run"; }).wait();
+executor.run_n(taskflow, 4);
+executor.wait_for_all();  // block until all associated executions finish
+executor.run_n(taskflow, 4, [](){ std::cout << "end of 4 runs"; }).wait();
+executor.run_until(taskflow, [cnt=0] () mutable { return ++cnt == 10; });
+@endcode
+
+All the @c run methods are @em thread-safe. You can submit multiple
+taskflows at the same time to an executor from different threads.
+*/
+class Executor {
+
+  friend class FlowBuilder;
+  friend class Subflow;
+  friend class Runtime;
+
+  public:
+
+    /**
+    @brief constructs the executor with @c N worker threads
+
+    The constructor spawns @c N worker threads to run tasks in a
+    work-stealing loop. The number of workers must be greater than zero
+    or an exception will be thrown.
+    By default, the number of worker threads is equal to the maximum
+    hardware concurrency returned by std::thread::hardware_concurrency.
+    */
+    explicit Executor(size_t N = std::thread::hardware_concurrency());
+
+    /**
+    @brief destructs the executor
+
+    The destructor calls Executor::wait_for_all to wait for all submitted
+    taskflows to complete and then notifies all worker threads to stop
+    and join these threads.
+    */
+    ~Executor();
+
+    /**
+    @brief runs a taskflow once
+
+    @param taskflow a tf::Taskflow object
+
+    @return a tf::Future that holds the result of the execution
+
+    This member function executes the given taskflow once and returns a tf::Future
+    object that eventually holds the result of the execution.
+
+    @code{.cpp}
+    tf::Future<void> future = executor.run(taskflow);
+    // do something else
+    future.wait();
+    @endcode
+
+    This member function is thread-safe.
+
+    @attention
+    The executor does not own the given taskflow. It is your responsibility to
+    ensure the taskflow remains alive during its execution.
+    */
+    tf::Future<void> run(Taskflow& taskflow);
+
+    /**
+    @brief runs a moved taskflow once
+
+    @param taskflow a moved tf::Taskflow object
+
+    @return a tf::Future that holds the result of the execution
+
+    This member function executes a moved taskflow once and returns a tf::Future
+    object that eventually holds the result of the execution.
+    The executor will take care of the lifetime of the moved taskflow.
+
+    @code{.cpp}
+    tf::Future<void> future = executor.run(std::move(taskflow));
+    // do something else
+    future.wait();
+    @endcode
+
+    This member function is thread-safe.
+    */
+    tf::Future<void> run(Taskflow&& taskflow);
+
+    /**
+    @brief runs a taskflow once and invokes a callback upon completion
+
+    @param taskflow a tf::Taskflow object
+    @param callable a callable object to be invoked after this run
+
+    @return a tf::Future that holds the result of the execution
+
+    This member function executes the given taskflow once and invokes the given
+    callable when the execution completes.
+    This member function returns a tf::Future object that
+    eventually holds the result of the execution.
+
+    @code{.cpp}
+    tf::Future<void> future = executor.run(taskflow, [](){ std::cout << "done"; });
+    // do something else
+    future.wait();
+    @endcode
+
+    This member function is thread-safe.
+
+    @attention
+    The executor does not own the given taskflow. It is your responsibility to
+    ensure the taskflow remains alive during its execution.
+    */
+    template<typename C>
+    tf::Future<void> run(Taskflow& taskflow, C&& callable);
+
+    /**
+    @brief runs a moved taskflow once and invokes a callback upon completion
+
+    @param taskflow a moved tf::Taskflow object
+    @param callable a callable object to be invoked after this run
+
+    @return a tf::Future that holds the result of the execution
+
+    This member function executes a moved taskflow once and invokes the given
+    callable when the execution completes.
+    This member function returns a tf::Future object that
+    eventually holds the result of the execution.
+    The executor will take care of the lifetime of the moved taskflow.
+
+    @code{.cpp}
+    tf::Future<void> future = executor.run(
+      std::move(taskflow), [](){ std::cout << "done"; }
+    );
+    // do something else
+    future.wait();
+    @endcode
+
+    This member function is thread-safe.
+    */
+    template<typename C>
+    tf::Future<void> run(Taskflow&& taskflow, C&& callable);
+
+    /**
+    @brief runs a taskflow for @c N times
+
+    @param taskflow a tf::Taskflow object
+    @param N number of runs
+
+    @return a tf::Future that holds the result of the execution
+
+    This member function executes the given taskflow @c N times and returns a tf::Future
+    object that eventually holds the result of the execution.
+
+    @code{.cpp}
+    tf::Future<void> future = executor.run_n(taskflow, 2);  // run taskflow 2 times
+    // do something else
+    future.wait();
+    @endcode
+
+    This member function is thread-safe.
+
+    @attention
+    The executor does not own the given taskflow. It is your responsibility to
+    ensure the taskflow remains alive during its execution.
+    */
+    tf::Future<void> run_n(Taskflow& taskflow, size_t N);
+
+    /**
+    @brief runs a moved taskflow for @c N times
+
+    @param taskflow a moved tf::Taskflow object
+    @param N number of runs
+
+    @return a tf::Future that holds the result of the execution
+
+    This member function executes a moved taskflow @c N times and returns a tf::Future
+    object that eventually holds the result of the execution.
+    The executor will take care of the lifetime of the moved taskflow.
+
+    @code{.cpp}
+    tf::Future<void> future = executor.run_n(
+      std::move(taskflow), 2    // run the moved taskflow 2 times
+    );
+    // do something else
+    future.wait();
+    @endcode
+
+    This member function is thread-safe.
+    */
+    tf::Future<void> run_n(Taskflow&& taskflow, size_t N);
+
+    /**
+    @brief runs a taskflow for @c N times and then invokes a callback
+
+    @param taskflow a tf::Taskflow
+    @param N number of runs
+    @param callable a callable object to be invoked after this run
+
+    @return a tf::Future that holds the result of the execution
+
+    This member function executes the given taskflow @c N times and invokes the given
+    callable when the execution completes.
+    This member function returns a tf::Future object that
+    eventually holds the result of the execution.
+
+    @code{.cpp}
+    tf::Future<void> future = executor.run_n(
+      taskflow, 2, [](){ std::cout << "done"; }  // runs the taskflow 2 times and invokes
+                                                 // the lambda to print "done"
+    );
+    // do something else
+    future.wait();
+    @endcode
+
+    This member function is thread-safe.
+
+    @attention
+    The executor does not own the given taskflow. It is your responsibility to
+    ensure the taskflow remains alive during its execution.
+    */
+    template<typename C>
+    tf::Future<void> run_n(Taskflow& taskflow, size_t N, C&& callable);
+
+    /**
+    @brief runs a moved taskflow for @c N times and then invokes a callback
+
+    @param taskflow a moved tf::Taskflow
+    @param N number of runs
+    @param callable a callable object to be invoked after this run
+
+    @return a tf::Future that holds the result of the execution
+
+    This member function executes a moved taskflow @c N times and invokes the given
+    callable when the execution completes.
+    This member function returns a tf::Future object that
+    eventually holds the result of the execution.
+
+    @code{.cpp}
+    tf::Future<void> future = executor.run_n(
+      // run the moved taskflow 2 times and invoke the lambda to print "done"
+      std::move(taskflow), 2, [](){ std::cout << "done"; }
+    );
+    // do something else
+    future.wait();
+    @endcode
+
+    This member function is thread-safe.
+    */
+    template<typename C>
+    tf::Future<void> run_n(Taskflow&& taskflow, size_t N, C&& callable);
+
+    /**
+    @brief runs a taskflow multiple times until the predicate becomes true
+
+    @param taskflow a tf::Taskflow
+    @param pred a boolean predicate to return @c true for stop
+
+    @return a tf::Future that holds the result of the execution
+
+    This member function executes the given taskflow multiple times until
+    the predicate returns @c true.
+    This member function returns a tf::Future object that
+    eventually holds the result of the execution.
+
+    @code{.cpp}
+    tf::Future<void> future = executor.run_until(
+      taskflow, [](){ return rand()%10 == 0; }
+    );
+    // do something else
+    future.wait();
+    @endcode
+
+    This member function is thread-safe.
+
+    @attention
+    The executor does not own the given taskflow. It is your responsibility to
+    ensure the taskflow remains alive during its execution.
+    */
+    template<typename P>
+    tf::Future<void> run_until(Taskflow& taskflow, P&& pred);
+
+    /**
+    @brief runs a moved taskflow and keeps running it
+           until the predicate becomes true
+
+    @param taskflow a moved tf::Taskflow object
+    @param pred a boolean predicate to return @c true for stop
+
+    @return a tf::Future that holds the result of the execution
+
+    This member function executes a moved taskflow multiple times until
+    the predicate returns @c true.
+    This member function returns a tf::Future object that
+    eventually holds the result of the execution.
+    The executor will take care of the lifetime of the moved taskflow.
+
+    @code{.cpp}
+    tf::Future<void> future = executor.run_until(
+      std::move(taskflow), [](){ return rand()%10 == 0; }
+    );
+    // do something else
+    future.wait();
+    @endcode
+
+    This member function is thread-safe.
+    */
+    template<typename P>
+    tf::Future<void> run_until(Taskflow&& taskflow, P&& pred);
+
+    /**
+    @brief runs a taskflow multiple times until the predicate becomes true and
+           then invokes the callback
+
+    @param taskflow a tf::Taskflow
+    @param pred a boolean predicate to return @c true for stop
+    @param callable a callable object to be invoked after this run completes
+
+    @return a tf::Future that holds the result of the execution
+
+    This member function executes the given taskflow multiple times until
+    the predicate returns @c true and then invokes the given callable when
+    the execution completes.
+    This member function returns a tf::Future object that
+    eventually holds the result of the execution.
+
+    @code{.cpp}
+    tf::Future<void> future = executor.run_until(
+      taskflow, [](){ return rand()%10 == 0; }, [](){ std::cout << "done"; }
+    );
+    // do something else
+    future.wait();
+    @endcode
+
+    This member function is thread-safe.
+
+    @attention
+    The executor does not own the given taskflow. It is your responsibility to
+    ensure the taskflow remains alive during its execution.
+    */
+    template<typename P, typename C>
+    tf::Future<void> run_until(Taskflow& taskflow, P&& pred, C&& callable);
+
+    /**
+    @brief runs a moved taskflow and keeps running
+           it until the predicate becomes true and then invokes the callback
+
+    @param taskflow a moved tf::Taskflow
+    @param pred a boolean predicate to return @c true for stop
+    @param callable a callable object to be invoked after this run completes
+
+    @return a tf::Future that holds the result of the execution
+
+    This member function executes a moved taskflow multiple times until
+    the predicate returns @c true and then invokes the given callable when
+    the execution completes.
+    This member function returns a tf::Future object that
+    eventually holds the result of the execution.
+    The executor will take care of the lifetime of the moved taskflow.
+
+    @code{.cpp}
+    tf::Future<void> future = executor.run_until(
+      std::move(taskflow),
+      [](){ return rand()%10 == 0; }, [](){ std::cout << "done"; }
+    );
+    // do something else
+    future.wait();
+    @endcode
+
+    This member function is thread-safe.
+    */
+    template<typename P, typename C>
+    tf::Future<void> run_until(Taskflow&& taskflow, P&& pred, C&& callable);
+
+    /**
+    @brief wait for all tasks to complete
+
+    This member function waits until all submitted tasks
+    (e.g., taskflows, asynchronous tasks) finish.
+
+    @code{.cpp}
+    executor.run(taskflow1);
+    executor.run_n(taskflow2, 10);
+    executor.run_n(taskflow3, 100);
+    executor.wait_for_all();  // wait until the above submitted taskflows finish
+    @endcode
+    */
+    void wait_for_all();
+
+    /**
+    @brief queries the number of worker threads
+
+    Each worker represents one unique thread spawned by the executor
+    at construction time.
+
+    @code{.cpp}
+    tf::Executor executor(4);
+    std::cout << executor.num_workers();    // 4
+    @endcode
+    */
+    size_t num_workers() const noexcept;
+
+    /**
+    @brief queries the number of running topologies at the time of this call
+
+    When a taskflow is submitted to an executor, a topology is created to store
+    runtime metadata of the running taskflow.
+    When the execution of the submitted taskflow finishes,
+    its corresponding topology will be removed from the executor.
+
+    @code{.cpp}
+    executor.run(taskflow);
+    std::cout << executor.num_topologies();  // 0 or 1 (taskflow still running)
+    @endcode
+    */
+    size_t num_topologies() const;
+
+    /**
+    @brief queries the number of running taskflows with moved ownership
+
+    @code{.cpp}
+    executor.run(std::move(taskflow));
+    std::cout << executor.num_taskflows();  // 0 or 1 (taskflow still running)
+    @endcode
+    */
+    size_t num_taskflows() const;
+
+    /**
+    @brief queries the id of the caller thread in this executor
+
+    Each worker has a unique id in the range of @c 0 to @c N-1 associated with
+    its parent executor.
+    If the caller thread does not belong to the executor, @c -1 is returned.
+
+    @code{.cpp}
+    tf::Executor executor(4);   // 4 workers in the executor
+    executor.this_worker_id();  // -1 (main thread is not a worker)
+
+    taskflow.emplace([&](){
+      std::cout << executor.this_worker_id();  // 0, 1, 2, or 3
+    });
+    executor.run(taskflow);
+    @endcode
+    */
+    int this_worker_id() const;
+
+    /**
+    @brief runs a given function asynchronously
+
+    @tparam F callable type
+    @tparam ArgsT parameter types
+
+    @param f callable object to call
+    @param args parameters to pass to the callable
+
+    @return a tf::Future that will hold the result of the execution
+
+    The method creates an asynchronous task to launch the given
+    function on the given arguments.
+    Unlike std::async, the return here is a @em tf::Future that holds
+    an optional object to the result.
+    If the asynchronous task is cancelled before it runs, the future holds
+    @c std::nullopt; otherwise, it holds the value returned by the callable.
+
+    @code{.cpp}
+    tf::Future<std::optional<int>> future = executor.async([](){
+      std::cout << "create an asynchronous task and returns 1\n";
+      return 1;
+    });
+    @endcode
+
+    This member function is thread-safe.
+    */
+    template <typename F, typename... ArgsT>
+    auto async(F&& f, ArgsT&&... args);
+
+    /**
+    @brief runs a given function asynchronously and gives a name to this task
+
+    @tparam F callable type
+    @tparam ArgsT parameter types
+
+    @param name name of the asynchronous task
+    @param f callable object to call
+    @param args parameters to pass to the callable
+
+    @return a tf::Future that will hold the result of the execution
+
+    The method creates a named asynchronous task to launch the given
+    function on the given arguments.
+    Naming an asynchronous task is primarily used for profiling and visualizing
+    the task execution timeline.
+    Unlike std::async, the return here is a tf::Future that holds
+    an optional object to the result.
+    If the asynchronous task is cancelled before it runs, the future holds
+    @c std::nullopt; otherwise, it holds the value returned by the callable.
+
+    @code{.cpp}
+    tf::Future<std::optional<int>> future = executor.named_async("name", [](){
+      std::cout << "create an asynchronous task with a name and returns 1\n";
+      return 1;
+    });
+    @endcode
+
+    This member function is thread-safe.
+    */
+    template <typename F, typename... ArgsT>
+    auto named_async(const std::string& name, F&& f, ArgsT&&... args);
+
+    /**
+    @brief similar to tf::Executor::async but does not return a future object
+
+    This member function is more efficient than tf::Executor::async
+    and is recommended when the result of the callable is not needed.
+
+    @code{.cpp}
+    executor.silent_async([](){
+      std::cout << "create an asynchronous task with no return\n";
+    });
+    @endcode
+
+    This member function is thread-safe.
+    */
+    template <typename F, typename... ArgsT>
+    void silent_async(F&& f, ArgsT&&... args);
+
+    /**
+    @brief similar to tf::Executor::named_async but does not return a future object
+
+    This member function is more efficient than tf::Executor::named_async
+    and is recommended when the result of the callable is not needed.
+
+    @code{.cpp}
+    executor.named_silent_async("name", [](){
+      std::cout << "create an asynchronous task with a name and no return\n";
+    });
+    @endcode
+
+    This member function is thread-safe.
+    */
+    template <typename F, typename... ArgsT>
+    void named_silent_async(const std::string& name, F&& f, ArgsT&&... args);
+
+    /**
+    @brief constructs an observer to inspect the activities of worker threads
+
+    @tparam Observer observer type derived from tf::ObserverInterface
+    @tparam ArgsT argument parameter pack
+
+    @param args arguments to forward to the constructor of the observer
+
+    @return a shared pointer to the created observer
+
+    Each executor manages a list of observers whose ownership is shared with the callers.
+    For each of these observers, the two member functions,
+    tf::ObserverInterface::on_entry and tf::ObserverInterface::on_exit
+    will be called before and after the execution of a task.
+
+    This member function is not thread-safe.
+    */
+    template <typename Observer, typename... ArgsT>
+    std::shared_ptr<Observer> make_observer(ArgsT&&... args);
+
+    /**
+    @brief removes an observer from the executor
+
+    This member function is not thread-safe.
+    */
+    template <typename Observer>
+    void remove_observer(std::shared_ptr<Observer> observer);
+
+    /**
+    @brief queries the number of observers
+    */
+    size_t num_observers() const noexcept;
+
+  private:
+
+    std::condition_variable _topology_cv;
+    std::mutex _taskflow_mutex;
+    std::mutex _topology_mutex;
+    std::mutex _wsq_mutex;
+
+    size_t _num_topologies {0};
+
+    std::unordered_map<std::thread::id, size_t> _wids;
+    std::vector<Worker> _workers;
+    std::vector<std::thread> _threads;
+    std::list<Taskflow> _taskflows;
+
+    Notifier _notifier;
+
+    TaskQueue<Node*> _wsq;
+
+    std::atomic<size_t> _num_actives {0};
+    std::atomic<size_t> _num_thieves {0};
+    std::atomic<bool>   _done {0};
+
+    std::unordered_set<std::shared_ptr<ObserverInterface>> _observers;
+
+    Worker* _this_worker();
+
+    bool _wait_for_task(Worker&, Node*&);
+
+    void _observer_prologue(Worker&, Node*);
+    void _observer_epilogue(Worker&, Node*);
+    void _spawn(size_t);
+    void _worker_loop(Worker&);
+    void _exploit_task(Worker&, Node*&);
+    void _explore_task(Worker&, Node*&);
+    void _consume_task(Worker&, Node*);
+    void _schedule(Worker&, Node*);
+    void _schedule(Node*);
+    void _schedule(Worker&, const SmallVector<Node*>&);
+    void _schedule(const SmallVector<Node*>&);
+    void _set_up_topology(Worker*, Topology*);
+    void _tear_down_topology(Worker&, Topology*);
+    void _tear_down_async(Node*);
+    void _tear_down_invoke(Worker&, Node*);
+    void _cancel_invoke(Worker&, Node*);
+    void _increment_topology();
+    void _decrement_topology();
+    void _decrement_topology_and_notify();
+    void _invoke(Worker&, Node*);
+    void _invoke_static_task(Worker&, Node*);
+    void _invoke_dynamic_task(Worker&, Node*);
+    void _invoke_dynamic_task_external(Worker&, Node*, Graph&, bool);
+    void _invoke_dynamic_task_internal(Worker&, Node*, Graph&);
+    void _invoke_condition_task(Worker&, Node*, SmallVector<int>&);
+    void _invoke_multi_condition_task(Worker&, Node*, SmallVector<int>&);
+    void _invoke_module_task(Worker&, Node*, bool&);
+    void _invoke_module_task_internal(Worker&, Node*, Graph&, bool&);
+    void _invoke_async_task(Worker&, Node*);
+    void _invoke_silent_async_task(Worker&, Node*);
+    void _invoke_cudaflow_task(Worker&, Node*);
+    void _invoke_syclflow_task(Worker&, Node*);
+    void _invoke_runtime_task(Worker&, Node*);
+
+    template <typename C,
+      std::enable_if_t<is_cudaflow_task_v<C>, void>* = nullptr
+    >
+    void _invoke_cudaflow_task_entry(Node*, C&&);
+
+    template <typename C, typename Q,
+      std::enable_if_t<is_syclflow_task_v<C>, void>* = nullptr
+    >
+    void _invoke_syclflow_task_entry(Node*, C&&, Q&);
+};
+
+// Constructor
+inline Executor::Executor(size_t N) :
+  _workers    {N},
+  _notifier   {N} {
+
+  if(N == 0) {
+    TF_THROW("no cpu workers to execute taskflows");
+  }
+
+  _spawn(N);
+
+  // instantiate the default observer if requested
+  if(has_env(TF_ENABLE_PROFILER)) {
+    TFProfManager::get()._manage(make_observer<TFProfObserver>());
+  }
+}
+
+// Destructor
+inline Executor::~Executor() {
+
+  // wait for all topologies to complete
+  wait_for_all();
+
+  // shut down the scheduler
+  _done = true;
+
+  _notifier.notify(true);
+
+  for(auto& t : _threads){
+    t.join();
+  }
+}
+
+// Function: num_workers
+inline size_t Executor::num_workers() const noexcept {
+  return _workers.size();
+}
+
+// Function: num_topologies
+inline size_t Executor::num_topologies() const {
+  return _num_topologies;
+}
+
+// Function: num_taskflows
+inline size_t Executor::num_taskflows() const {
+  return _taskflows.size();
+}
+
+// Function: _this_worker
+inline Worker* Executor::_this_worker() {
+  auto itr = _wids.find(std::this_thread::get_id());
+  return itr == _wids.end() ? nullptr : &_workers[itr->second];
+}
+
+// Function: named_async
+template <typename F, typename... ArgsT>
+auto Executor::named_async(const std::string& name, F&& f, ArgsT&&... args) {
+
+  _increment_topology();
+
+  using T = std::invoke_result_t<F, ArgsT...>;
+  using R = std::conditional_t<std::is_same_v<T, void>, void, std::optional<T>>;
+
+  std::promise<R> p;
+
+  auto tpg = std::make_shared<AsyncTopology>();
+
+  Future<R> fu(p.get_future(), tpg);
+
+  auto node = node_pool.animate(
+    std::in_place_type_t<Node::Async>{},
+    [p=make_moc(std::move(p)), f=std::forward<F>(f), args...]
+    (bool cancel) mutable {
+      if constexpr(std::is_same_v<R, void>) {
+        if(!cancel) {
+          f(args...);
+        }
+        p.object.set_value();
+      }
+      else {
+        p.object.set_value(cancel ? std::nullopt : std::make_optional(f(args...)));
+      }
+    },
+    std::move(tpg)
+  );
+
+  node->_name = name;
+
+  if(auto w = _this_worker(); w) {
+    _schedule(*w, node);
+  }
+  else{
+    _schedule(node);
+  }
+
+  return fu;
+}
+
+// Function: async
+template <typename F, typename... ArgsT>
+auto Executor::async(F&& f, ArgsT&&... args) {
+  return named_async("", std::forward<F>(f), std::forward<ArgsT>(args)...);
+}
+
+// Function: named_silent_async
+template <typename F, typename... ArgsT>
+void Executor::named_silent_async(
+  const std::string& name, F&& f, ArgsT&&... args
+) {
+
+  _increment_topology();
+
+  Node* node = node_pool.animate(
+    std::in_place_type_t<Node::SilentAsync>{},
+    [f=std::forward<F>(f), args...] () mutable {
+      f(args...);
+    }
+  );
+
+  node->_name = name;
+
+  if(auto w = _this_worker(); w) {
+    _schedule(*w, node);
+  }
+  else {
+    _schedule(node);
+  }
+}
+
+// Function: silent_async
+template <typename F, typename... ArgsT>
+void Executor::silent_async(F&& f, ArgsT&&... args) {
+  named_silent_async("", std::forward<F>(f), std::forward<ArgsT>(args)...);
+}
+
+// Function: this_worker_id
+inline int Executor::this_worker_id() const {
+  auto i = _wids.find(std::this_thread::get_id());
+  return i == _wids.end() ? -1 : static_cast<int>(_workers[i->second]._id);
+}
+
+// Procedure: _spawn
+inline void Executor::_spawn(size_t N) {
+
+  std::mutex mutex;
+  std::condition_variable cond;
+  size_t n=0;
+
+  for(size_t id=0; id<N; ++id) {
+
+    _workers[id]._id = id;
+    _workers[id]._vtm = id;
+    _workers[id]._executor = this;
+    _workers[id]._waiter = &_notifier._waiters[id];
+
+    _threads.emplace_back([this] (
+      Worker& w, std::mutex& mutex, std::condition_variable& cond, size_t& n
+    ) -> void {
+
+      // enables the mapping
+      {
+        std::scoped_lock lock(mutex);
+        _wids[std::this_thread::get_id()] = w._id;
+        if(n++; n == num_workers()) {
+          cond.notify_one();
+        }
+      }
+
+      //this_worker().worker = &w;
+
+      Node* t = nullptr;
+
+      // must use 1 as condition instead of !done
+      while(1) {
+
+        // execute the tasks.
+        _exploit_task(w, t);
+
+        // wait for tasks
+        if(_wait_for_task(w, t) == false) {
+          break;
+        }
+      }
+
+    }, std::ref(_workers[id]), std::ref(mutex), std::ref(cond), std::ref(n));
+  }
+
+  std::unique_lock<std::mutex> lock(mutex);
+  cond.wait(lock, [&](){ return n==N; });
+}
+
+// Function: _consume_task
+inline void Executor::_consume_task(Worker& w, Node* p) {
+
+  std::uniform_int_distribution<size_t> rdvtm(0, _workers.size()-1);
+
+  while(p->_join_counter != 0) {
+    exploit:
+    if(auto t = w._wsq.pop(); t) {
+      _invoke(w, t);
+    }
+    else {
+      size_t num_steals = 0;
+      //size_t num_pauses = 0;
+      size_t max_steals = ((_workers.size() + 1) << 1);
+
+      explore:
+
+      t = (w._id == w._vtm) ? _wsq.steal() : _workers[w._vtm]._wsq.steal();
+      if(t) {
+        _invoke(w, t);
+        goto exploit;
+      }
+      else if(p->_join_counter != 0){
+
+        if(num_steals++ > max_steals) {
+          std::this_thread::yield();
+        }
+
+        //std::this_thread::yield();
+        w._vtm = rdvtm(w._rdgen);
+        goto explore;
+      }
+      else {
+        break;
+      }
+    }
+  }
+}
+
+// Function: _explore_task
+inline void Executor::_explore_task(Worker& w, Node*& t) {
+
+  //assert(_workers[w].wsq.empty());
+  //assert(!t);
+
+  size_t num_steals = 0;
+  size_t num_yields = 0;
+  size_t max_steals = ((_workers.size() + 1) << 1);
+
+  std::uniform_int_distribution<size_t> rdvtm(0, _workers.size()-1);
+
+  do {
+    t = (w._id == w._vtm) ? _wsq.steal() : _workers[w._vtm]._wsq.steal();
+
+    if(t) {
+      break;
+    }
+
+    if(num_steals++ > max_steals) {
+      std::this_thread::yield();
+      if(num_yields++ > 100) {
+        break;
+      }
+    }
+
+    w._vtm = rdvtm(w._rdgen);
+  } while(!_done);
+
+}
+
+// Procedure: _exploit_task
+inline void Executor::_exploit_task(Worker& w, Node*& t) {
+
+  if(t) {
+
+    if(_num_actives.fetch_add(1) == 0 && _num_thieves == 0) {
+      _notifier.notify(false);
+    }
+
+    while(t) {
+      _invoke(w, t);
+      t = w._wsq.pop();
+    }
+
+    --_num_actives;
+  }
+}
+
+// Function: _wait_for_task
+inline bool Executor::_wait_for_task(Worker& worker, Node*& t) {
+
+  wait_for_task:
+
+  //assert(!t);
+
+  ++_num_thieves;
+
+  explore_task:
+
+  _explore_task(worker, t);
+
+  if(t) {
+    if(_num_thieves.fetch_sub(1) == 1) {
+      _notifier.notify(false);
+    }
+    return true;
+  }
+
+  _notifier.prepare_wait(worker._waiter);
+
+  //if(auto vtm = _find_vtm(me); vtm != _workers.size()) {
+  if(!_wsq.empty()) {
+
+    _notifier.cancel_wait(worker._waiter);
+    //t = (vtm == me) ? _wsq.steal() : _workers[vtm].wsq.steal();
+
+    t = _wsq.steal();  // must steal here
+    if(t) {
+      if(_num_thieves.fetch_sub(1) == 1) {
+        _notifier.notify(false);
+      }
+      return true;
+    }
+    else {
+      worker._vtm = worker._id;
+      goto explore_task;
+    }
+  }
+
+  if(_done) {
+    _notifier.cancel_wait(worker._waiter);
+    _notifier.notify(true);
+    --_num_thieves;
+    return false;
+  }
+
+  if(_num_thieves.fetch_sub(1) == 1) {
+    if(_num_actives) {
+      _notifier.cancel_wait(worker._waiter);
+      goto wait_for_task;
+    }
+    // check all queues again
+    for(auto& w : _workers) {
+      if(!w._wsq.empty()) {
+        worker._vtm = w._id;
+        _notifier.cancel_wait(worker._waiter);
+        goto wait_for_task;
+      }
+    }
+  }
+
+  // Now I really need to relinquish myself to others
+  _notifier.commit_wait(worker._waiter);
+
+  return true;
+}
+
+// Function: make_observer
+template<typename Observer, typename... ArgsT>
+std::shared_ptr<Observer> Executor::make_observer(ArgsT&&... args) {
+
+  static_assert(
+    std::is_base_of_v<ObserverInterface, Observer>,
+    "Observer must be derived from ObserverInterface"
+  );
+
+  // use a local variable to mimic the constructor
+  auto ptr = std::make_shared<Observer>(std::forward<ArgsT>(args)...);
+
+  ptr->set_up(_workers.size());
+
+  _observers.emplace(std::static_pointer_cast<ObserverInterface>(ptr));
+
+  return ptr;
+}
+
+// Procedure: remove_observer
+template <typename Observer>
+void Executor::remove_observer(std::shared_ptr<Observer> ptr) {
+
+  static_assert(
+    std::is_base_of_v<ObserverInterface, Observer>,
+    "Observer must be derived from ObserverInterface"
+  );
+
+  _observers.erase(std::static_pointer_cast<ObserverInterface>(ptr));
+}
+
+// Function: num_observers
+inline size_t Executor::num_observers() const noexcept {
+  return _observers.size();
+}
+
+// Procedure: _schedule
+inline void Executor::_schedule(Worker& worker, Node* node) {
+
+  node->_state.fetch_or(Node::READY, std::memory_order_release);
+
+  // caller is a worker to this pool
+  if(worker._executor == this) {
+    worker._wsq.push(node);
+    return;
+  }
+
+  {
+    std::lock_guard<std::mutex> lock(_wsq_mutex);
+    _wsq.push(node);
+  }
+
+  _notifier.notify(false);
+}
+
+// Procedure: _schedule
+inline void Executor::_schedule(Node* node) {
+
+  node->_state.fetch_or(Node::READY, std::memory_order_release);
+
+  {
+    std::lock_guard<std::mutex> lock(_wsq_mutex);
+    _wsq.push(node);
+  }
+
+  _notifier.notify(false);
+}
+
+// Procedure: _schedule
+inline void Executor::_schedule(
+  Worker& worker, const SmallVector<Node*>& nodes
+) {
+
+  // We need to capture the node count to avoid accessing the nodes
+  // vector while the parent topology is removed!
+  const auto num_nodes = nodes.size();
+
+  if(num_nodes == 0) {
+    return;
+  }
+
+  // make the node ready
+  for(size_t i=0; i<num_nodes; ++i) {
+    nodes[i]->_state.fetch_or(Node::READY, std::memory_order_release);
+  }
+
+  if(worker._executor == this) {
+    for(size_t i=0; i<num_nodes; ++i) {
+      worker._wsq.push(nodes[i]);
+    }
+    return;
+  }
+
+  {
+    std::lock_guard<std::mutex> lock(_wsq_mutex);
+    for(size_t k=0; k<num_nodes; ++k) {
+      _wsq.push(nodes[k]);
+    }
+  }
+
+  _notifier.notify_n(num_nodes);
+}
+
+// Procedure: _schedule
+inline void Executor::_schedule(const SmallVector<Node*>& nodes) {
+
+  // parent topology may be removed!
+  const auto num_nodes = nodes.size();
+
+  if(num_nodes == 0) {
+    return;
+  }
+
+  // make the node ready
+  for(size_t i=0; i<num_nodes; ++i) {
+    nodes[i]->_state.fetch_or(Node::READY, std::memory_order_release);
+  }
+
+  {
+    std::lock_guard<std::mutex> lock(_wsq_mutex);
+    for(size_t k=0; k<num_nodes; ++k) {
+      _wsq.push(nodes[k]);
+    }
+  }
+
+  _notifier.notify_n(num_nodes);
+}
+
+// Procedure: _invoke
+inline void Executor::_invoke(Worker& worker, Node* node) {
+
+  int state;
+  SmallVector<int> conds;
+
+  // synchronize all outstanding memory operations caused by reordering
+  do {
+    state = node->_state.load(std::memory_order_acquire);
+  } while(! (state & Node::READY));
+
+  // unwind stack for deferred node
+  if(state & Node::DEFERRED) {
+    node->_state.fetch_and(~Node::DEFERRED, std::memory_order_relaxed);
+    goto invoke_epilogue;
+  }
+
+  //while(!(node->_state.load(std::memory_order_acquire) & Node::READY));
+
+  invoke_prologue:
+
+  // no need to do other things if the topology is cancelled
+  if(node->_is_cancelled()) {
+    _cancel_invoke(worker, node);
+    return;
+  }
+
+  // if acquiring semaphore(s) exists, acquire them first
+  if(node->_semaphores && !node->_semaphores->to_acquire.empty()) {
+    SmallVector<Node*> nodes;
+    if(!node->_acquire_all(nodes)) {
+      _schedule(worker, nodes);
+      return;
+    }
+    node->_state.fetch_or(Node::ACQUIRED, std::memory_order_release);
+  }
+
+  // condition task
+  //int cond = -1;
+  //SmallVector<int> conds = { -1 };
+
+  // switch is faster than nested if-else due to jump table
+  switch(node->_handle.index()) {
+    // static task
+    case Node::STATIC:{
+      _invoke_static_task(worker, node);
+    }
+    break;
+
+    // dynamic task
+    case Node::DYNAMIC: {
+      _invoke_dynamic_task(worker, node);
+    }
+    break;
+
+    // condition task
+    case Node::CONDITION: {
+      _invoke_condition_task(worker, node, conds);
+    }
+    break;
+
+    // multi-condition task
+    case Node::MULTI_CONDITION: {
+      _invoke_multi_condition_task(worker, node, conds);
+    }
+    break;
+
+    // module task
+    case Node::MODULE: {
+      bool deferred = false;
+      _invoke_module_task(worker, node, deferred);
+      if(deferred) {
+        return;
+      }
+    }
+    break;
+
+    // async task
+    case Node::ASYNC: {
+      _invoke_async_task(worker, node);
+      _tear_down_async(node);
+      return ;
+    }
+    break;
+
+    // silent async task
+    case Node::SILENT_ASYNC: {
+      _invoke_silent_async_task(worker, node);
+      _tear_down_async(node);
+      return ;
+    }
+    break;
+
+    // cudaflow task
+    case Node::CUDAFLOW: {
+      _invoke_cudaflow_task(worker, node);
+    }
+    break;
+
+    // syclflow task
+    case Node::SYCLFLOW: {
+      _invoke_syclflow_task(worker, node);
+    }
+    break;
+
+    // runtime task
+    case Node::RUNTIME: {
+      _invoke_runtime_task(worker, node);
+    }
+    break;
+
+    // monostate (placeholder)
+    default:
+    break;
+  }
+
+  invoke_epilogue:
+
+  // if releasing semaphores exist, release them
+  if(node->_semaphores && !node->_semaphores->to_release.empty()) {
+    _schedule(worker, node->_release_all());
+  }
+
+  // We MUST recover the dependency since the graph may have cycles.
+  // This must be done before scheduling the successors; otherwise it can
+  // cause a race condition on _dependents.
+  if((node->_state.load(std::memory_order_relaxed) & Node::CONDITIONED)) {
+    node->_join_counter = node->num_strong_dependents();
+  }
+  else {
+    node->_join_counter = node->num_dependents();
+  }
+
+  // acquire the parent flow counter
+  auto& j = (node->_parent) ? node->_parent->_join_counter :
+                              node->_topology->_join_counter;
+
+  Node* cache {nullptr};
+
+  // At this point, the node storage might be destructed (to be verified)
+  // case 1: non-condition task
+  switch(node->_handle.index()) {
+
+    // condition and multi-condition tasks
+    case Node::CONDITION:
+    case Node::MULTI_CONDITION: {
+      for(auto cond : conds) {
+        if(cond >= 0 && static_cast<size_t>(cond) < node->_successors.size()) {
+          auto s = node->_successors[cond];
+          // zeroing the join counter for invariant
+          s->_join_counter.store(0, std::memory_order_relaxed);
+          j.fetch_add(1);
+          if(cache) {
+            _schedule(worker, cache);
+          }
+          cache = s;
+        }
+      }
+    }
+    break;
+
+    // non-condition task
+    default: {
+      for(size_t i=0; i<node->_successors.size(); ++i) {
+        if(--(node->_successors[i]->_join_counter) == 0) {
+          j.fetch_add(1);
+          if(cache) {
+            _schedule(worker, cache);
+          }
+          cache = node->_successors[i];
+        }
+      }
+    }
+    break;
+  }
+
+  // tear_down the invoke
+  _tear_down_invoke(worker, node);
+
+  // perform tail recursion elimination for the right-most child to reduce
+  // the number of expensive pop/push operations through the task queue
+  if(cache) {
+    node = cache;
+    //node->_state.fetch_or(Node::READY, std::memory_order_release);
+    goto invoke_prologue;
+  }
+}
+
+// Procedure: _tear_down_async
+inline void Executor::_tear_down_async(Node* node) {
+  if(node->_parent) {
+    node->_parent->_join_counter.fetch_sub(1);
+  }
+  else {
+    _decrement_topology_and_notify();
+  }
+  node_pool.recycle(node);
+}
+
+// Procedure: _tear_down_invoke
+inline void Executor::_tear_down_invoke(Worker& worker, Node* node) {
+  // we must check the parent first before subtracting the join counter,
+  // or it can introduce a data race
+  if(auto parent = node->_parent; parent == nullptr) {
+    if(node->_topology->_join_counter.fetch_sub(1) == 1) {
+      _tear_down_topology(worker, node->_topology);
+    }
+  }
+  else {
+    // prefetch the deferred status, as subtracting the join counter can
+    // immediately cause the other worker to release the subflow
+    auto deferred = parent->_state.load(std::memory_order_relaxed) & Node::DEFERRED;
+    if(parent->_join_counter.fetch_sub(1) == 1 && deferred) {
+      _schedule(worker, parent);
+    }
+  }
+}
+
+// Procedure: _cancel_invoke
+inline void Executor::_cancel_invoke(Worker& worker, Node* node) {
+
+  switch(node->_handle.index()) {
+    // async task needs to carry out the promise
+    case Node::ASYNC:
+      std::get_if<Node::Async>(&(node->_handle))->work(true);
+      _tear_down_async(node);
+    break;
+
+    // silent async doesn't need to carry out the promise
+    case Node::SILENT_ASYNC:
+      _tear_down_async(node);
+    break;
+
+    // tear down topology if the node is the last leaf
+    default: {
+      _tear_down_invoke(worker, node);
+    }
+    break;
+  }
+}
+
+// Procedure: _observer_prologue
+inline void Executor::_observer_prologue(Worker& worker, Node* node) {
+  for(auto& observer : _observers) {
+    observer->on_entry(WorkerView(worker), TaskView(*node));
+  }
+}
+
+// Procedure: _observer_epilogue
+inline void Executor::_observer_epilogue(Worker& worker, Node* node) {
+  for(auto& observer : _observers) {
+    observer->on_exit(WorkerView(worker), TaskView(*node));
+  }
+}
+
+// Procedure: _invoke_static_task
+inline void Executor::_invoke_static_task(Worker& worker, Node* node) {
+  _observer_prologue(worker, node);
+  std::get_if<Node::Static>(&node->_handle)->work();
+  _observer_epilogue(worker, node);
+}
+
+// Procedure: _invoke_dynamic_task
+inline void Executor::_invoke_dynamic_task(Worker& w, Node* node) {
+
+  _observer_prologue(w, node);
+
+  auto handle = std::get_if<Node::Dynamic>(&node->_handle);
+
+  handle->subgraph._clear();
+
+  Subflow sf(*this, w, node, handle->subgraph);
+
+  handle->work(sf);
+
+  if(sf._joinable) {
+    _invoke_dynamic_task_internal(w, node, handle->subgraph);
+  }
+
+  _observer_epilogue(w, node);
+}
+
+// Procedure: _invoke_dynamic_task_external
+inline void Executor::_invoke_dynamic_task_external(
+  Worker& w, Node* p, Graph& g, bool detach
+) {
+
+  // graph is empty and has no async tasks
+  if(g.empty() && p->_join_counter == 0) {
+    return;
+  }
+
+  SmallVector<Node*> src;
+
+  for(auto n : g._nodes) {
+
+    n->_topology = p->_topology;
+    n->_state.store(0, std::memory_order_relaxed);
+    n->_set_up_join_counter();
+
+    if(detach) {
+      n->_parent = nullptr;
+      n->_state.fetch_or(Node::DETACHED, std::memory_order_relaxed);
+    }
+    else {
+      n->_parent = p;
+    }
+
+    if(n->num_dependents() == 0) {
+      src.push_back(n);
+    }
+  }
+
+  // detach here
+  if(detach) {
+
+    {
+      std::lock_guard<std::mutex> lock(p->_topology->_taskflow._mutex);
+      p->_topology->_taskflow._graph._merge(std::move(g));
+    }
+
+    p->_topology->_join_counter.fetch_add(src.size());
+    _schedule(w, src);
+  }
+  // join here
+  else {
+    p->_join_counter.fetch_add(src.size());
+    _schedule(w, src);
+    _consume_task(w, p);
+  }
+}
+
+// Procedure: _invoke_dynamic_task_internal
+inline void Executor::_invoke_dynamic_task_internal(
+  Worker& w, Node* p, Graph& g
+) {
+
+  // graph is empty and has no async tasks
+  if(g.empty() && p->_join_counter == 0) {
+    return;
+  }
+
+  SmallVector<Node*> src;
+
+  for(auto n : g._nodes) {
+    n->_topology = p->_topology;
+    n->_state.store(0, std::memory_order_relaxed);
+    n->_set_up_join_counter();
+    n->_parent = p;
+    if(n->num_dependents() == 0) {
+      src.push_back(n);
+    }
+  }
+  p->_join_counter.fetch_add(src.size());
+  _schedule(w, src);
+  _consume_task(w, p);
+}
+
+// Procedure: _invoke_module_task_internal
+inline void Executor::_invoke_module_task_internal(
+  Worker& w, Node* p, Graph& g, bool& deferred
+) {
+
+  // graph is empty and has no async tasks
+  if(g.empty()) {
+    return;
+  }
+
+  // set deferred
+  deferred = true;
+  p->_state.fetch_or(Node::DEFERRED, std::memory_order_relaxed);
+
+  SmallVector<Node*> src;
+
+  for(auto n : g._nodes) {
+    n->_topology = p->_topology;
+    n->_state.store(0, std::memory_order_relaxed);
+    n->_set_up_join_counter();
+    n->_parent = p;
+    if(n->num_dependents() == 0) {
+      src.push_back(n);
+    }
+  }
+  p->_join_counter.fetch_add(src.size());
+  _schedule(w, src);
+}
+
+// Procedure: _invoke_condition_task
+inline void Executor::_invoke_condition_task(
+  Worker& worker, Node* node, SmallVector<int>& conds
+) {
+  _observer_prologue(worker, node);
+  conds = { std::get_if<Node::Condition>(&node->_handle)->work() };
+  _observer_epilogue(worker, node);
+}
+
+// Procedure: _invoke_multi_condition_task
+inline void Executor::_invoke_multi_condition_task(
+  Worker& worker, Node* node, SmallVector<int>& conds
+) {
+  _observer_prologue(worker, node);
+  conds = std::get_if<Node::MultiCondition>(&node->_handle)->work();
+  _observer_epilogue(worker, node);
+}
+
+// Procedure: _invoke_cudaflow_task
+inline void Executor::_invoke_cudaflow_task(Worker& worker, Node* node) {
+  _observer_prologue(worker, node);
+  std::get_if<Node::cudaFlow>(&node->_handle)->work(*this, node);
+  _observer_epilogue(worker, node);
+}
+
+// Procedure: _invoke_syclflow_task
+inline void Executor::_invoke_syclflow_task(Worker& worker, Node* node) {
+  _observer_prologue(worker, node);
+  std::get_if<Node::syclFlow>(&node->_handle)->work(*this, node);
+  _observer_epilogue(worker, node);
+}
+
+// Procedure: _invoke_module_task
+inline void Executor::_invoke_module_task(Worker& w, Node* node, bool& deferred) {
+  _observer_prologue(w, node);
+  _invoke_module_task_internal(
+    w, node, std::get_if<Node::Module>(&node->_handle)->graph, deferred
+  );
+  _observer_epilogue(w, node);
+}
+
+// Procedure: _invoke_async_task
+inline void Executor::_invoke_async_task(Worker& w, Node* node) {
+  _observer_prologue(w, node);
+  std::get_if<Node::Async>(&node->_handle)->work(false);
+  _observer_epilogue(w, node);
+}
+
+// Procedure: _invoke_silent_async_task
+inline void Executor::_invoke_silent_async_task(Worker& w, Node* node) {
+  _observer_prologue(w, node);
+  std::get_if<Node::SilentAsync>(&node->_handle)->work();
+  _observer_epilogue(w, node);
+}
+
+// Procedure: _invoke_runtime_task
+inline void Executor::_invoke_runtime_task(Worker& w, Node* node) {
+  _observer_prologue(w, node);
+  Runtime rt(*this, w, node);
+  std::get_if<Node::Runtime>(&node->_handle)->work(rt);
+  _observer_epilogue(w, node);
+}
+
+// Function: run
+inline tf::Future<void> Executor::run(Taskflow& f) {
+  return run_n(f, 1, [](){});
+}
+
+// Function: run
+inline tf::Future<void> Executor::run(Taskflow&& f) {
+  return run_n(std::move(f), 1, [](){});
+}
+
+// Function: run
+template <typename C>
+tf::Future<void> Executor::run(Taskflow& f, C&& c) {
+  return run_n(f, 1, std::forward<C>(c));
+}
+
+// Function: run
+template <typename C>
+tf::Future<void> Executor::run(Taskflow&& f, C&& c) {
+  return run_n(std::move(f), 1, std::forward<C>(c));
+}
+
+// Function: run_n
+inline tf::Future<void> Executor::run_n(Taskflow& f, size_t repeat) {
+  return run_n(f, repeat, [](){});
+}
+
+// Function: run_n
+inline tf::Future<void> Executor::run_n(Taskflow&& f, size_t repeat) {
+  return run_n(std::move(f), repeat, [](){});
+}
+
+// Function: run_n
+template <typename C>
+tf::Future<void> Executor::run_n(Taskflow& f, size_t repeat, C&& c) {
+  return run_until(
+    f, [repeat]() mutable { return repeat-- == 0; }, std::forward<C>(c)
+  );
+}
+
+// Function: run_n
+template <typename C>
+tf::Future<void> Executor::run_n(Taskflow&& f, size_t repeat, C&& c) {
+  return run_until(
+    std::move(f), [repeat]() mutable { return repeat-- == 0; }, std::forward<C>(c)
+  );
+}
+
+// Function: run_until
+template<typename P>
+tf::Future<void> Executor::run_until(Taskflow& f, P&& pred) {
+  return run_until(f, std::forward<P>(pred), [](){});
+}
+
+// Function: run_until
+template<typename P>
+tf::Future<void> Executor::run_until(Taskflow&& f, P&& pred) {
+  return run_until(std::move(f), std::forward<P>(pred), [](){});
+}
+
+// Function: run_until
+template <typename P, typename C>
+tf::Future<void> Executor::run_until(Taskflow& f, P&& p, C&& c) {
+
+  _increment_topology();
+
+  // Need to check emptiness under the lock since a dynamic task may
+  // define detached blocks that modify the taskflow at the same time
+  bool empty;
+  {
+    std::lock_guard<std::mutex> lock(f._mutex);
+    empty = f.empty();
+  }
+
+  // No need to create a real topology; just return a dummy future
+  if(empty || p()) {
+    c();
+    std::promise<void> promise;
+    promise.set_value();
+    _decrement_topology_and_notify();
+    return tf::Future<void>(promise.get_future(), std::monostate{});
+  }
+
+  // create a topology for this run
+  auto t = std::make_shared<Topology>(f, std::forward<P>(p), std::forward<C>(c));
+
+  // need to create the future before the topology gets torn down quickly
+  tf::Future<void> future(t->_promise.get_future(), t);
+
+  // modifying topology needs to be protected under the lock
+  {
+    std::lock_guard<std::mutex> lock(f._mutex);
+    f._topologies.push(t);
+    if(f._topologies.size() == 1) {
+      _set_up_topology(_this_worker(), t.get());
+    }
+  }
+
+  return future;
+}
+
+// Function: run_until
+template <typename P, typename C>
+tf::Future<void> Executor::run_until(Taskflow&& f, P&& pred, C&& c) {
+
+  std::list<Taskflow>::iterator itr;
+
+  {
+    std::scoped_lock<std::mutex> lock(_taskflow_mutex);
+    itr = _taskflows.emplace(_taskflows.end(), std::move(f));
+    itr->_satellite = itr;
+  }
+
+  return run_until(*itr, std::forward<P>(pred), std::forward<C>(c));
+}
+
+// Procedure: _increment_topology
+inline void Executor::_increment_topology() {
+  std::lock_guard<std::mutex> lock(_topology_mutex);
+  ++_num_topologies;
+}
+
+// Procedure: _decrement_topology_and_notify
+inline void Executor::_decrement_topology_and_notify() {
+  std::lock_guard<std::mutex> lock(_topology_mutex);
+  if(--_num_topologies == 0) {
+    _topology_cv.notify_all();
+  }
+}
+
+// Procedure: _decrement_topology
+inline void Executor::_decrement_topology() {
+  std::lock_guard<std::mutex> lock(_topology_mutex);
+  --_num_topologies;
+}
+
+// Procedure: wait_for_all
+inline void Executor::wait_for_all() {
+  std::unique_lock<std::mutex> lock(_topology_mutex);
+  _topology_cv.wait(lock, [&](){ return _num_topologies == 0; });
+}
+
+// Function: _set_up_topology
+inline void Executor::_set_up_topology(Worker* worker, Topology* tpg) {
+
+  // ---- under taskflow lock ----
+
+  tpg->_sources.clear();
+  tpg->_taskflow._graph._clear_detached();
+
+  // scan each node in the graph and build up the links
+  for(auto node : tpg->_taskflow._graph._nodes) {
+
+    node->_topology = tpg;
+    node->_state.store(0, std::memory_order_relaxed);
+
+    if(node->num_dependents() == 0) {
+      tpg->_sources.push_back(node);
+    }
+
+    node->_set_up_join_counter();
+  }
+
+  tpg->_join_counter = tpg->_sources.size();
+
+  if(worker) {
+    _schedule(*worker, tpg->_sources);
+  }
+  else {
+    _schedule(tpg->_sources);
+  }
+}
+
+// Function: _tear_down_topology
+inline void Executor::_tear_down_topology(Worker& worker, Topology* tpg) {
+
+  auto &f = tpg->_taskflow;
+
+  //assert(&tpg == &(f._topologies.front()));
+
+  // case 1: we still need to run the topology again
+  if(!tpg->_is_cancelled && !tpg->_pred()) {
+    //assert(tpg->_join_counter == 0);
+    std::lock_guard<std::mutex> lock(f._mutex);
+    tpg->_join_counter = tpg->_sources.size();
+    _schedule(worker, tpg->_sources);
+  }
+  // case 2: the final run of this topology
+  else {
+
+    // TODO: if the topology is cancelled, need to release all semaphores
+
+    if(tpg->_call != nullptr) {
+      tpg->_call();
+    }
+
+    // If there is another run (queued in between lock acquisitions)
+    if(std::unique_lock<std::mutex> lock(f._mutex); f._topologies.size()>1) {
+      //assert(tpg->_join_counter == 0);
+
+      // Set the promise
+      tpg->_promise.set_value();
+      f._topologies.pop();
+      tpg = f._topologies.front().get();
+
+      // decrement the topology but since this is not the last we don't notify
+      _decrement_topology();
+
+      // setting up the topology needs to be done under the lock, or it can
+      // introduce a memory-order error with pop
+      _set_up_topology(&worker, tpg);
+    }
+    else {
+      //assert(f._topologies.size() == 1);
+
+      // Need to back up the promise first here because the taskflow might be
+      // destroyed soon after calling get
+      auto p {std::move(tpg->_promise)};
+
+      // Back up lambda capture in case it has the topology pointer,
+      // to avoid it releasing on pop_front ahead of _mutex.unlock &
+      // _promise.set_value. Released safely when leaving scope.
+      auto c {std::move(tpg->_call)};
+
+      // Get the satellite if any
+      auto s {f._satellite};
+
+      // Now we remove the topology from this taskflow
+      f._topologies.pop();
+
+      //f._mutex.unlock();
+      lock.unlock();
+
+      // We set the promise in the end in case taskflow leaves the scope.
+      // After set_value, the caller will return from wait
+      p.set_value();
+
+      _decrement_topology_and_notify();
+
+      // remove the taskflow if it is managed by the executor
+      // TODO: in the future, we may need to synchronize on wait
+      // (which means the following code should be moved before set_value)
+      if(s) {
+        std::scoped_lock<std::mutex> lock(_taskflow_mutex);
+        _taskflows.erase(*s);
+      }
+    }
+  }
+}
+
+// ############################################################################
+// Forward Declaration: Subflow
+// ############################################################################
+
+inline void Subflow::join() {
+
+  // assert(this_worker().worker == &_worker);
+
+  if(!_joinable) {
+    TF_THROW("subflow not joinable");
+  }
+
+  // only the parent worker can join the subflow
+  _executor._invoke_dynamic_task_external(_worker, _parent, _graph, false);
+  _joinable = false;
+}
+
+inline void Subflow::detach() {
+
+  // assert(this_worker().worker == &_worker);
+
+  if(!_joinable) {
+    TF_THROW("subflow already joined or detached");
+  }
+
+  // only the parent worker can detach the subflow
+  _executor._invoke_dynamic_task_external(_worker, _parent, _graph, true);
+  _joinable = false;
+}
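+
+// A minimal usage sketch of join/detach (illustrative only; "taskflow" and the
+// emplaced lambdas are assumptions, not part of this file):
+//
+//   tf::Taskflow taskflow;
+//   taskflow.emplace([](tf::Subflow& sf){
+//     tf::Task a = sf.emplace([](){ /* ... */ });
+//     tf::Task b = sf.emplace([](){ /* ... */ });
+//     a.precede(b);
+//     sf.join();    // or sf.detach() to let the subflow outlive this task
+//   });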
+
+// Function: named_async
+template <typename F, typename... ArgsT>
+auto Subflow::named_async(const std::string& name, F&& f, ArgsT&&... args) {
+  return _named_async(
+    *_executor._this_worker(), name, std::forward<F>(f), std::forward<ArgsT>(args)...
+  );
+}
+
+// Function: _named_async
+template <typename F, typename... ArgsT>
+auto Subflow::_named_async(
+  Worker& w,
+  const std::string& name,
+  F&& f,
+  ArgsT&&... args
+) {
+
+  _parent->_join_counter.fetch_add(1);
+
+  using T = std::invoke_result_t<F, ArgsT...>;
+  using R = std::conditional_t<std::is_same_v<T, void>, void, std::optional<T>>;
+
+  std::promise<R> p;
+
+  auto tpg = std::make_shared<AsyncTopology>();
+
+  Future<R> fu(p.get_future(), tpg);
+
+  auto node = node_pool.animate(
+    std::in_place_type_t<Node::Async>{},
+    [p=make_moc(std::move(p)), f=std::forward<F>(f), args...]
+    (bool cancel) mutable {
+      if constexpr(std::is_same_v<R, void>) {
+        if(!cancel) {
+          f(args...);
+        }
+        p.object.set_value();
+      }
+      else {
+        p.object.set_value(cancel ? std::nullopt : std::make_optional(f(args...)));
+      }
+    },
+    std::move(tpg)
+  );
+
+  node->_name = name;
+  node->_topology = _parent->_topology;
+  node->_parent = _parent;
+
+  _executor._schedule(w, node);
+
+  return fu;
+}
+
+// Function: async
+template <typename F, typename... ArgsT>
+auto Subflow::async(F&& f, ArgsT&&... args) {
+  return named_async("", std::forward<F>(f), std::forward<ArgsT>(args)...);
+}
+
+// Function: _named_silent_async
+template <typename F, typename... ArgsT>
+void Subflow::_named_silent_async(
+  Worker& w, const std::string& name, F&& f, ArgsT&&... args
+) {
+
+  _parent->_join_counter.fetch_add(1);
+
+  auto node = node_pool.animate(
+    std::in_place_type_t<Node::SilentAsync>{},
+    [f=std::forward<F>(f), args...] () mutable {
+      f(args...);
+    }
+  );
+
+  node->_name = name;
+  node->_topology = _parent->_topology;
+  node->_parent = _parent;
+
+  _executor._schedule(w, node);
+}
+
+// Function: named_silent_async
+template <typename F, typename... ArgsT>
+void Subflow::named_silent_async(const std::string& name, F&& f, ArgsT&&... args) {
+  _named_silent_async(
+    *_executor._this_worker(), name, std::forward<F>(f), std::forward<ArgsT>(args)...
+  );
+}
+
+// Function: silent_async
+template <typename F, typename... ArgsT>
+void Subflow::silent_async(F&& f, ArgsT&&... args) {
+  named_silent_async("", std::forward<F>(f), std::forward<ArgsT>(args)...);
+}
+
+// ############################################################################
+// Forward Declaration: Runtime
+// ############################################################################
+
+// Procedure: schedule
+inline void Runtime::schedule(Task task) {
+  auto node = task._node;
+  auto& j = node->_parent ? node->_parent->_join_counter :
+                            node->_topology->_join_counter;
+  j.fetch_add(1);
+  _executor._schedule(_worker, node);
+}
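+
+// A minimal usage sketch (illustrative only; the tasks below are assumptions,
+// not part of this file). A runtime task can schedule a task that the
+// scheduler would otherwise skip, e.g. one bypassed by a condition task:
+//
+//   tf::Task A, B, C;
+//   std::tie(A, B, C) = taskflow.emplace(
+//     [](){ return 0; },                          // condition task: branch to B
+//     [&C](tf::Runtime& rt){ rt.schedule(C); },   // manually schedule C
+//     [](){ std::printf("C\n"); }
+//   );
+//   A.precede(B, C);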
+
+// Procedure: run
+template <typename C>
+void Runtime::run(C&& callable) {
+
+  // dynamic task (subflow)
+  if constexpr(is_dynamic_task_v<C>) {
+    Graph graph;
+    Subflow sf(_executor, _worker, _parent, graph);
+    callable(sf);
+    if(sf._joinable) {
+      _executor._invoke_dynamic_task_internal(_worker, _parent, graph);
+    }
+  }
+  else {
+    static_assert(dependent_false_v<C>, "unsupported task callable to run");
+  }
+}
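+
+// A minimal usage sketch (illustrative only; the enclosing runtime task is an
+// assumption, not part of this file). Runtime::run only accepts a
+// subflow-style callable, i.e. one taking tf::Subflow&:
+//
+//   taskflow.emplace([](tf::Runtime& rt){
+//     rt.run([](tf::Subflow& sf){
+//       sf.emplace([](){ /* ... */ });
+//     });  // the spawned subflow is joined before rt.run returns
+//   });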
+
+}  // end of namespace tf -----------------------------------------------------
+
+
+
+
+
+
+
+
diff --git a/myxpcs/include/taskflow_/core/executor.hpp b/myxpcs/include/taskflow_/core/executor.hpp
new file mode 100644
index 0000000..2a549cc
--- /dev/null
+++ b/myxpcs/include/taskflow_/core/executor.hpp
@@ -0,0 +1,2385 @@
+#pragma once
+
+#include "observer.hpp"
+#include "taskflow.hpp"
+#include "async_task.hpp"
+
+/**
+@file executor.hpp
+@brief executor include file
+*/
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// Executor Definition
+// ----------------------------------------------------------------------------
+
+/** @class Executor
+
+@brief class to create an executor for running a taskflow graph
+
+An executor manages a set of worker threads to run one or multiple taskflows
+using an efficient work-stealing scheduling algorithm.
+
+@code{.cpp}
+// Declare an executor and a taskflow
+tf::Executor executor;
+tf::Taskflow taskflow;
+
+// Add three tasks into the taskflow
+tf::Task A = taskflow.emplace([] () { std::cout << "This is TaskA\n"; });
+tf::Task B = taskflow.emplace([] () { std::cout << "This is TaskB\n"; });
+tf::Task C = taskflow.emplace([] () { std::cout << "This is TaskC\n"; });
+
+// Build precedence between tasks
+A.precede(B, C);
+
+tf::Future<void> fu = executor.run(taskflow);
+fu.wait();                // block until the execution completes
+
+executor.run(taskflow, [](){ std::cout << "end of 1 run"; }).wait();
+executor.run_n(taskflow, 4);
+executor.wait_for_all();  // block until all associated executions finish
+executor.run_n(taskflow, 4, [](){ std::cout << "end of 4 runs"; }).wait();
+executor.run_until(taskflow, [cnt=0] () mutable { return ++cnt == 10; });
+@endcode
+
+All the @c run methods are @em thread-safe. You can submit multiple
+taskflows at the same time to an executor from different threads.
+*/
+class Executor {
+
+  friend class FlowBuilder;
+  friend class Subflow;
+  friend class Runtime;
+
+  public:
+
+  /**
+  @brief constructs the executor with @c N worker threads
+
+  @param N the number of workers (default std::thread::hardware_concurrency)
+  
+  The constructor spawns @c N worker threads to run tasks in a
+  work-stealing loop. The number of workers must be greater than zero
+  or an exception will be thrown.
+  By default, the number of worker threads is equal to the maximum
+  hardware concurrency returned by std::thread::hardware_concurrency.
+  */
+  explicit Executor(size_t N = std::thread::hardware_concurrency());
+
+  /**
+  @brief destructs the executor
+
+  The destructor calls Executor::wait_for_all to wait for all submitted
+  taskflows to complete and then notifies all worker threads to stop
+  and join these threads.
+  */
+  ~Executor();
+
+  /**
+  @brief runs a taskflow once
+
+  @param taskflow a tf::Taskflow object
+
+  @return a tf::Future that holds the result of the execution
+
+  This member function executes the given taskflow once and returns a tf::Future
+  object that eventually holds the result of the execution.
+
+  @code{.cpp}
+  tf::Future<void> future = executor.run(taskflow);
+  // do something else
+  future.wait();
+  @endcode
+
+  This member function is thread-safe.
+
+  @attention
+  The executor does not own the given taskflow. It is your responsibility to
+  ensure the taskflow remains alive during its execution.
+  */
+  tf::Future<void> run(Taskflow& taskflow);
+
+  /**
+  @brief runs a moved taskflow once
+
+  @param taskflow a moved tf::Taskflow object
+
+  @return a tf::Future that holds the result of the execution
+
+  This member function executes a moved taskflow once and returns a tf::Future
+  object that eventually holds the result of the execution.
+  The executor will take care of the lifetime of the moved taskflow.
+
+  @code{.cpp}
+  tf::Future<void> future = executor.run(std::move(taskflow));
+  // do something else
+  future.wait();
+  @endcode
+
+  This member function is thread-safe.
+  */
+  tf::Future<void> run(Taskflow&& taskflow);
+
+  /**
+  @brief runs a taskflow once and invoke a callback upon completion
+
+  @param taskflow a tf::Taskflow object
+  @param callable a callable object to be invoked after this run
+
+  @return a tf::Future that holds the result of the execution
+
+  This member function executes the given taskflow once and invokes the given
+  callable when the execution completes.
+  This member function returns a tf::Future object that
+  eventually holds the result of the execution.
+
+  @code{.cpp}
+  tf::Future<void> future = executor.run(taskflow, [](){ std::cout << "done"; });
+  // do something else
+  future.wait();
+  @endcode
+
+  This member function is thread-safe.
+
+  @attention
+  The executor does not own the given taskflow. It is your responsibility to
+  ensure the taskflow remains alive during its execution.
+  */
+  template<typename C>
+  tf::Future<void> run(Taskflow& taskflow, C&& callable);
+
+  /**
+  @brief runs a moved taskflow once and invoke a callback upon completion
+
+  @param taskflow a moved tf::Taskflow object
+  @param callable a callable object to be invoked after this run
+
+  @return a tf::Future that holds the result of the execution
+
+  This member function executes a moved taskflow once and invokes the given
+  callable when the execution completes.
+  This member function returns a tf::Future object that
+  eventually holds the result of the execution.
+  The executor will take care of the lifetime of the moved taskflow.
+
+  @code{.cpp}
+  tf::Future<void> future = executor.run(
+    std::move(taskflow), [](){ std::cout << "done"; }
+  );
+  // do something else
+  future.wait();
+  @endcode
+
+  This member function is thread-safe.
+  */
+  template<typename C>
+  tf::Future<void> run(Taskflow&& taskflow, C&& callable);
+
+  /**
+  @brief runs a taskflow for @c N times
+
+  @param taskflow a tf::Taskflow object
+  @param N number of runs
+
+  @return a tf::Future that holds the result of the execution
+
+  This member function executes the given taskflow @c N times and returns a tf::Future
+  object that eventually holds the result of the execution.
+
+  @code{.cpp}
+  tf::Future<void> future = executor.run_n(taskflow, 2);  // run taskflow 2 times
+  // do something else
+  future.wait();
+  @endcode
+
+  This member function is thread-safe.
+
+  @attention
+  The executor does not own the given taskflow. It is your responsibility to
+  ensure the taskflow remains alive during its execution.
+  */
+  tf::Future<void> run_n(Taskflow& taskflow, size_t N);
+
+  /**
+  @brief runs a moved taskflow for @c N times
+
+  @param taskflow a moved tf::Taskflow object
+  @param N number of runs
+
+  @return a tf::Future that holds the result of the execution
+
+  This member function executes a moved taskflow @c N times and returns a tf::Future
+  object that eventually holds the result of the execution.
+  The executor will take care of the lifetime of the moved taskflow.
+
+  @code{.cpp}
+  tf::Future<void> future = executor.run_n(
+    std::move(taskflow), 2    // run the moved taskflow 2 times
+  );
+  // do something else
+  future.wait();
+  @endcode
+
+  This member function is thread-safe.
+  */
+  tf::Future<void> run_n(Taskflow&& taskflow, size_t N);
+
+  /**
+  @brief runs a taskflow for @c N times and then invokes a callback
+
+  @param taskflow a tf::Taskflow
+  @param N number of runs
+  @param callable a callable object to be invoked after this run
+
+  @return a tf::Future that holds the result of the execution
+
+  This member function executes the given taskflow @c N times and invokes the given
+  callable when the execution completes.
+  This member function returns a tf::Future object that
+  eventually holds the result of the execution.
+
+  @code{.cpp}
+  tf::Future<void> future = executor.run(
+    taskflow, 2, [](){ std::cout << "done"; }  // runs taskflow 2 times and invoke
+                                               // the lambda to print "done"
+  );
+  // do something else
+  future.wait();
+  @endcode
+
+  This member function is thread-safe.
+
+  @attention
+  The executor does not own the given taskflow. It is your responsibility to
+  ensure the taskflow remains alive during its execution.
+  */
+  template<typename C>
+  tf::Future<void> run_n(Taskflow& taskflow, size_t N, C&& callable);
+
+  /**
+  @brief runs a moved taskflow for @c N times and then invokes a callback
+
+  @param taskflow a moved tf::Taskflow
+  @param N number of runs
+  @param callable a callable object to be invoked after this run
+
+  @return a tf::Future that holds the result of the execution
+
+  This member function executes a moved taskflow @c N times and invokes the given
+  callable when the execution completes.
+  This member function returns a tf::Future object that
+  eventually holds the result of the execution.
+
+  @code{.cpp}
+  tf::Future<void> future = executor.run_n(
+    // run the moved taskflow 2 times and invoke the lambda to print "done"
+    std::move(taskflow), 2, [](){ std::cout << "done"; }
+  );
+  // do something else
+  future.wait();
+  @endcode
+
+  This member function is thread-safe.
+  */
+  template<typename C>
+  tf::Future<void> run_n(Taskflow&& taskflow, size_t N, C&& callable);
+
+  /**
+  @brief runs a taskflow multiple times until the predicate becomes true
+
+  @param taskflow a tf::Taskflow
+  @param pred a boolean predicate to return @c true for stop
+
+  @return a tf::Future that holds the result of the execution
+
+  This member function executes the given taskflow multiple times until
+  the predicate returns @c true.
+  This member function returns a tf::Future object that
+  eventually holds the result of the execution.
+
+  @code{.cpp}
+  tf::Future<void> future = executor.run_until(
+    taskflow, [](){ return rand()%10 == 0; }
+  );
+  // do something else
+  future.wait();
+  @endcode
+
+  This member function is thread-safe.
+
+  @attention
+  The executor does not own the given taskflow. It is your responsibility to
+  ensure the taskflow remains alive during its execution.
+  */
+  template<typename P>
+  tf::Future<void> run_until(Taskflow& taskflow, P&& pred);
+
+  /**
+  @brief runs a moved taskflow and keeps running it
+         until the predicate becomes true
+
+  @param taskflow a moved tf::Taskflow object
+  @param pred a boolean predicate to return @c true for stop
+
+  @return a tf::Future that holds the result of the execution
+
+  This member function executes a moved taskflow multiple times until
+  the predicate returns @c true.
+  This member function returns a tf::Future object that
+  eventually holds the result of the execution.
+  The executor will take care of the lifetime of the moved taskflow.
+
+  @code{.cpp}
+  tf::Future<void> future = executor.run_until(
+    std::move(taskflow), [](){ return rand()%10 == 0; }
+  );
+  // do something else
+  future.wait();
+  @endcode
+
+  This member function is thread-safe.
+  */
+  template<typename P>
+  tf::Future<void> run_until(Taskflow&& taskflow, P&& pred);
+
+  /**
+  @brief runs a taskflow multiple times until the predicate becomes true and
+         then invokes the callback
+
+  @param taskflow a tf::Taskflow
+  @param pred a boolean predicate to return @c true for stop
+  @param callable a callable object to be invoked after this run completes
+
+  @return a tf::Future that holds the result of the execution
+
+  This member function executes the given taskflow multiple times until
+  the predicate returns @c true and then invokes the given callable when
+  the execution completes.
+  This member function returns a tf::Future object that
+  eventually holds the result of the execution.
+
+  @code{.cpp}
+  tf::Future<void> future = executor.run_until(
+    taskflow, [](){ return rand()%10 == 0; }, [](){ std::cout << "done"; }
+  );
+  // do something else
+  future.wait();
+  @endcode
+
+  This member function is thread-safe.
+
+  @attention
+  The executor does not own the given taskflow. It is your responsibility to
+  ensure the taskflow remains alive during its execution.
+  */
+  template<typename P, typename C>
+  tf::Future<void> run_until(Taskflow& taskflow, P&& pred, C&& callable);
+
+  /**
+  @brief runs a moved taskflow and keeps running
+         it until the predicate becomes true and then invokes the callback
+
+  @param taskflow a moved tf::Taskflow
+  @param pred a boolean predicate to return @c true for stop
+  @param callable a callable object to be invoked after this run completes
+
+  @return a tf::Future that holds the result of the execution
+
+  This member function executes a moved taskflow multiple times until
+  the predicate returns @c true and then invokes the given callable when
+  the execution completes.
+  This member function returns a tf::Future object that
+  eventually holds the result of the execution.
+  The executor will take care of the lifetime of the moved taskflow.
+
+  @code{.cpp}
+  tf::Future<void> future = executor.run_until(
+    std::move(taskflow),
+    [](){ return rand()%10 == 0; }, [](){ std::cout << "done"; }
+  );
+  // do something else
+  future.wait();
+  @endcode
+
+  This member function is thread-safe.
+  */
+  template<typename P, typename C>
+  tf::Future<void> run_until(Taskflow&& taskflow, P&& pred, C&& callable);
+
+  /**
+  @brief runs a target graph and waits until it completes using 
+         an internal worker of this executor
+  
+  @tparam T target type which has `tf::Graph& T::graph()` defined
+  @param target the target task graph object
+
+  The method runs a target graph which has `tf::Graph& T::graph()` defined 
+  and waits until the execution completes.
+  Unlike the typical flow of calling `tf::Executor::run` series 
+  plus waiting on the result, this method must be called by an internal
+  worker of this executor. The caller worker will participate in
+  the work-stealing loop of the scheduler, therby avoiding potential
+  deadlock caused by blocked waiting.
+  
+  @code{.cpp}
+  tf::Executor executor(2);
+  tf::Taskflow taskflow;
+  std::array<tf::Taskflow, 1000> others;
+  
+  std::atomic<size_t> counter{0};
+  
+  for(size_t n=0; n<1000; n++) {
+    for(size_t i=0; i<1000; i++) {
+      others[n].emplace([&](){ counter++; });
+    }
+    taskflow.emplace([&executor, &tf=others[n]](){
+      executor.corun(tf);
+      //executor.run(tf).wait();  <- blocking the worker without doing anything
+      //                             will introduce deadlock
+    });
+  }
+  executor.run(taskflow).wait();
+  @endcode 
+
+  The method is thread-safe as long as the target is not concurrently
+  run by two or more threads.
+
+  @attention
+  You must call tf::Executor::corun from a worker of the calling executor
+  or an exception will be thrown.
+  */
+  template <typename T>
+  void corun(T& target);
+
+  /**
+  @brief keeps running the work-stealing loop until the predicate becomes true
+  
+  @tparam P predicate type
+  @param predicate a boolean predicate to indicate when to stop the loop
+
+  The method keeps the caller worker running in the work-stealing loop
+  until the stop predicate becomes true.
+
+  @code{.cpp}
+  taskflow.emplace([&](){
+    std::future<void> fu = std::async([](){
+      std::this_thread::sleep_for(std::chrono::seconds(100));
+    });
+    executor.corun_until([&](){
+      return fu.wait_for(std::chrono::seconds(0)) == std::future_status::ready;
+    });
+  });
+  @endcode
+
+  @attention
+  You must call tf::Executor::corun_until from a worker of the calling executor
+  or an exception will be thrown.
+  */
+  template <typename P>
+  void corun_until(P&& predicate);
+
+  /**
+  @brief waits for all tasks to complete
+
+  This member function waits until all submitted tasks
+  (e.g., taskflows, asynchronous tasks) finish.
+
+  @code{.cpp}
+  executor.run(taskflow1);
+  executor.run_n(taskflow2, 10);
+  executor.run_n(taskflow3, 100);
+  executor.wait_for_all();  // wait until the above submitted taskflows finish
+  @endcode
+  */
+  void wait_for_all();
+
+  /**
+  @brief queries the number of worker threads
+
+  Each worker represents one unique thread spawned by an executor
+  upon its construction.
+
+  @code{.cpp}
+  tf::Executor executor(4);
+  std::cout << executor.num_workers();    // 4
+  @endcode
+  */
+  size_t num_workers() const noexcept;
+
+  /**
+  @brief queries the number of running topologies at the time of this call
+
+  When a taskflow is submitted to an executor, a topology is created to store
+  runtime metadata of the running taskflow.
+  When the execution of the submitted taskflow finishes,
+  its corresponding topology will be removed from the executor.
+
+  @code{.cpp}
+  executor.run(taskflow);
+  std::cout << executor.num_topologies();  // 0 or 1 (taskflow still running)
+  @endcode
+  */
+  size_t num_topologies() const;
+
+  /**
+  @brief queries the number of running taskflows with moved ownership
+
+  @code{.cpp}
+  executor.run(std::move(taskflow));
+  std::cout << executor.num_taskflows();  // 0 or 1 (taskflow still running)
+  @endcode
+  */
+  size_t num_taskflows() const;
+  
+  /**
+  @brief queries the id of the caller thread in this executor
+
+  Each worker has a unique id in the range of @c 0 to @c N-1 associated with
+  its parent executor.
+  If the caller thread does not belong to the executor, @c -1 is returned.
+
+  @code{.cpp}
+  tf::Executor executor(4);   // 4 workers in the executor
+  executor.this_worker_id();  // -1 (main thread is not a worker)
+
+  taskflow.emplace([&](){
+    std::cout << executor.this_worker_id();  // 0, 1, 2, or 3
+  });
+  executor.run(taskflow);
+  @endcode
+  */
+  int this_worker_id() const;
+ 
+  // --------------------------------------------------------------------------
+  // Observer methods
+  // --------------------------------------------------------------------------
+
+  /**
+  @brief constructs an observer to inspect the activities of worker threads
+
+  @tparam Observer observer type derived from tf::ObserverInterface
+  @tparam ArgsT argument parameter pack
+
+  @param args arguments to forward to the constructor of the observer
+
+  @return a shared pointer to the created observer
+
+  Each executor manages a list of observers whose ownership is shared with callers.
+  For each of these observers, the two member functions,
+  tf::ObserverInterface::on_entry and tf::ObserverInterface::on_exit
+  will be called before and after the execution of a task.
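+
+  A minimal sketch, assuming tf::ChromeObserver (an observer type provided in
+  observer.hpp of this package):
+
+  @code{.cpp}
+  auto observer = executor.make_observer<tf::ChromeObserver>();
+  executor.run(taskflow).wait();
+  observer->dump(std::cout);  // dump the recorded timeline in Chrome tracing format
+  executor.remove_observer(observer);
+  @endcode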
+
+  This member function is not thread-safe.
+  */
+  template <typename Observer, typename... ArgsT>
+  std::shared_ptr<Observer> make_observer(ArgsT&&... args);
+
+  /**
+  @brief removes an observer from the executor
+
+  This member function is not thread-safe.
+  */
+  template <typename Observer>
+  void remove_observer(std::shared_ptr<Observer> observer);
+
+  /**
+  @brief queries the number of observers
+  */
+  size_t num_observers() const noexcept;
+
+  // --------------------------------------------------------------------------
+  // Async Task Methods
+  // --------------------------------------------------------------------------
+
+  /**
+  @brief runs a given function asynchronously
+
+  @tparam F callable type
+
+  @param func callable object
+
+  @return a @std_future that will hold the result of the execution
+
+  The method creates an asynchronous task to run the given function
+  and returns a @std_future object that eventually will hold the result
+  of the return value.
+
+  @code{.cpp}
+  std::future<int> future = executor.async([](){
+    std::cout << "create an asynchronous task and returns 1\n";
+    return 1;
+  });
+  future.get();
+  @endcode
+
+  This member function is thread-safe.
+  */
+  template <typename F>
+  auto async(F&& func);
+
+  /**
+  @brief runs a given function asynchronously and gives a name to this task
+
+  @tparam F callable type
+
+  @param name name of the asynchronous task
+  @param func callable object
+
+  @return a @std_future that will hold the result of the execution
+  
+  The method creates and assigns a name to an asynchronous task
+  to run the given function,
+  returning a @std_future object that eventually will hold the result
+  of the return value.
+  Assigned task names will appear in the observers of the executor.
+
+  @code{.cpp}
+  std::future<int> future = executor.async("name", [](){
+    std::cout << "create an asynchronous task with a name and returns 1\n";
+    return 1;
+  });
+  future.get();
+  @endcode
+
+  This member function is thread-safe.
+  */
+  template <typename F>
+  auto async(const std::string& name, F&& func);
+
+  /**
+  @brief similar to tf::Executor::async but does not return a future object
+  
+  @tparam F callable type
+  
+  @param func callable object
+
+  This member function is more efficient than tf::Executor::async
+  and its use is encouraged when you do not want a @std_future to
+  acquire the result or synchronize the execution.
+
+  @code{.cpp}
+  executor.silent_async([](){
+    std::cout << "create an asynchronous task with no return\n";
+  });
+  executor.wait_for_all();
+  @endcode
+
+  This member function is thread-safe.
+  */
+  template <typename F>
+  void silent_async(F&& func);
+
+  /**
+  @brief similar to tf::Executor::async but does not return a future object
+
+  @tparam F callable type
+
+  @param name assigned name to the task
+  @param func callable object
+
+  This member function is more efficient than tf::Executor::async
+  and its use is encouraged when you do not want a @std_future to
+  acquire the result or synchronize the execution.
+  Assigned task names will appear in the observers of the executor.
+
+  @code{.cpp}
+  executor.silent_async("name", [](){
+    std::cout << "create an asynchronous task with a name and no return\n";
+  });
+  executor.wait_for_all();
+  @endcode
+
+  This member function is thread-safe.
+  */
+  template <typename F>
+  void silent_async(const std::string& name, F&& func);
+
+  // --------------------------------------------------------------------------
+  // Silent Dependent Async Methods
+  // --------------------------------------------------------------------------
+  
+  /**
+  @brief runs the given function asynchronously 
+         when the given dependents finish
+
+  @tparam F callable type
+  @tparam Tasks task types convertible to tf::AsyncTask
+
+  @param func callable object
+  @param tasks asynchronous tasks on which this execution depends
+  
+  @return a tf::AsyncTask handle 
+  
+  This member function is more efficient than tf::Executor::dependent_async
+  and its use is encouraged when you do not want a @std_future to
+  acquire the result or synchronize the execution.
+  The example below creates three asynchronous tasks, @c A, @c B, and @c C,
+  in which task @c C runs after task @c A and task @c B.
+
+  @code{.cpp}
+  tf::AsyncTask A = executor.silent_dependent_async([](){ printf("A\n"); });
+  tf::AsyncTask B = executor.silent_dependent_async([](){ printf("B\n"); });
+  executor.silent_dependent_async([](){ printf("C runs after A and B\n"); }, A, B);
+  executor.wait_for_all();
+  @endcode
+
+  This member function is thread-safe.
+  */
+  template <typename F, typename... Tasks,
+    std::enable_if_t<all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>* = nullptr
+  >
+  tf::AsyncTask silent_dependent_async(F&& func, Tasks&&... tasks);
+  
+  /**
+  @brief names and runs the given function asynchronously 
+         when the given dependents finish
+  
+  @tparam F callable type
+  @tparam Tasks task types convertible to tf::AsyncTask
+
+  @param name assigned name to the task
+  @param func callable object
+  @param tasks asynchronous tasks on which this execution depends
+  
+  @return a tf::AsyncTask handle 
+  
+  This member function is more efficient than tf::Executor::dependent_async
+  and its use is encouraged when you do not want a @std_future to
+  acquire the result or synchronize the execution.
+  The example below creates three asynchronous tasks, @c A, @c B, and @c C,
+  in which task @c C runs after task @c A and task @c B.
+  Assigned task names will appear in the observers of the executor.
+
+  @code{.cpp}
+  tf::AsyncTask A = executor.silent_dependent_async("A", [](){ printf("A\n"); });
+  tf::AsyncTask B = executor.silent_dependent_async("B", [](){ printf("B\n"); });
+  executor.silent_dependent_async(
+    "C", [](){ printf("C runs after A and B\n"); }, A, B
+  );
+  executor.wait_for_all();
+  @endcode
+
+  This member function is thread-safe.
+  */
+  template <typename F, typename... Tasks,
+    std::enable_if_t<all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>* = nullptr
+  >
+  tf::AsyncTask silent_dependent_async(const std::string& name, F&& func, Tasks&&... tasks);
+  
+  /**
+  @brief runs the given function asynchronously 
+         when the given range of dependents finish
+  
+  @tparam F callable type
+  @tparam I iterator type 
+
+  @param func callable object
+  @param first iterator to the beginning (inclusive)
+  @param last iterator to the end (exclusive)
+  
+  @return a tf::AsyncTask handle 
+  
+  This member function is more efficient than tf::Executor::dependent_async
+  and its use is encouraged when you do not want a @std_future to
+  acquire the result or synchronize the execution.
+  The example below creates three asynchronous tasks, @c A, @c B, and @c C,
+  in which task @c C runs after task @c A and task @c B.
+
+  @code{.cpp}
+  std::array<tf::AsyncTask, 2> array {
+    executor.silent_dependent_async([](){ printf("A\n"); }),
+    executor.silent_dependent_async([](){ printf("B\n"); })
+  };
+  executor.silent_dependent_async(
+    [](){ printf("C runs after A and B\n"); }, array.begin(), array.end()
+  );
+  executor.wait_for_all();
+  @endcode
+
+  This member function is thread-safe.
+  */
+  template <typename F, typename I, 
+    std::enable_if_t<!std::is_same_v<std::decay_t<I>, AsyncTask>, void>* = nullptr
+  >
+  tf::AsyncTask silent_dependent_async(F&& func, I first, I last);
+  
+  /**
+  @brief names and runs the given function asynchronously 
+         when the given range of dependents finish
+  
+  @tparam F callable type
+  @tparam I iterator type 
+
+  @param name assigned name to the task
+  @param func callable object
+  @param first iterator to the beginning (inclusive)
+  @param last iterator to the end (exclusive)
+
+  @return a tf::AsyncTask handle 
+  
+  This member function is more efficient than tf::Executor::dependent_async
+  and its use is encouraged when you do not want a @std_future to
+  acquire the result or synchronize the execution.
+  The example below creates three asynchronous tasks, @c A, @c B, and @c C,
+  in which task @c C runs after task @c A and task @c B.
+  Assigned task names will appear in the observers of the executor.
+
+  @code{.cpp}
+  std::array<tf::AsyncTask, 2> array {
+    executor.silent_dependent_async("A", [](){ printf("A\n"); }),
+    executor.silent_dependent_async("B", [](){ printf("B\n"); })
+  };
+  executor.silent_dependent_async(
+    "C", [](){ printf("C runs after A and B\n"); }, array.begin(), array.end()
+  );
+  executor.wait_for_all();
+  @endcode
+
+  This member function is thread-safe.
+  */
+  template <typename F, typename I, 
+    std::enable_if_t<!std::is_same_v<std::decay_t<I>, AsyncTask>, void>* = nullptr
+  >
+  tf::AsyncTask silent_dependent_async(const std::string& name, F&& func, I first, I last);
+  
+  // --------------------------------------------------------------------------
+  // Dependent Async Methods
+  // --------------------------------------------------------------------------
+  
+  /**
+  @brief runs the given function asynchronously 
+         when the given dependents finish
+  
+  @tparam F callable type
+  @tparam Tasks task types convertible to tf::AsyncTask
+
+  @param func callable object
+  @param tasks asynchronous tasks on which this execution depends
+  
+  @return a pair of a tf::AsyncTask handle and 
+                    a @std_future that holds the result of the execution
+  
+  The example below creates three asynchronous tasks, @c A, @c B, and @c C,
+  in which task @c C runs after task @c A and task @c B.
+  Task @c C returns a pair of its tf::AsyncTask handle and a std::future<int>
+  that eventually will hold the result of the execution.
+
+  @code{.cpp}
+  tf::AsyncTask A = executor.silent_dependent_async([](){ printf("A\n"); });
+  tf::AsyncTask B = executor.silent_dependent_async([](){ printf("B\n"); });
+  auto [C, fuC] = executor.dependent_async(
+    [](){ 
+      printf("C runs after A and B\n"); 
+      return 1;
+    }, 
+    A, B
+  );
+  fuC.get();  // C finishes, which in turn means both A and B finish
+  @endcode
+
+  You can mix the use of tf::AsyncTask handles
+  returned by Executor::dependent_async and Executor::silent_dependent_async
+  when specifying task dependencies.
+
+  This member function is thread-safe.
+  */
+  template <typename F, typename... Tasks,
+    std::enable_if_t<all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>* = nullptr
+  >
+  auto dependent_async(F&& func, Tasks&&... tasks);
+  
+  /**
+  @brief names and runs the given function asynchronously
+         when the given dependents finish
+  
+  @tparam F callable type
+  @tparam Tasks task types convertible to tf::AsyncTask
+  
+  @param name assigned name to the task
+  @param func callable object
+  @param tasks asynchronous tasks on which this execution depends
+  
+  @return a pair of a tf::AsyncTask handle and 
+                    a @std_future that holds the result of the execution
+  
+  The example below creates three named asynchronous tasks, @c A, @c B, and @c C,
+  in which task @c C runs after task @c A and task @c B.
+  Task @c C returns a pair of its tf::AsyncTask handle and a std::future<int>
+  that eventually will hold the result of the execution.
+  Assigned task names will appear in the observers of the executor.
+
+  @code{.cpp}
+  tf::AsyncTask A = executor.silent_dependent_async("A", [](){ printf("A\n"); });
+  tf::AsyncTask B = executor.silent_dependent_async("B", [](){ printf("B\n"); });
+  auto [C, fuC] = executor.dependent_async(
+    "C",
+    [](){ 
+      printf("C runs after A and B\n"); 
+      return 1;
+    }, 
+    A, B
+  );
+  assert(fuC.get()==1);  // C finishes, which in turn means both A and B finish
+  @endcode
+
+  You can mix the use of tf::AsyncTask handles
+  returned by Executor::dependent_async and Executor::silent_dependent_async
+  when specifying task dependencies.
+
+  This member function is thread-safe.
+  */
+  template <typename F, typename... Tasks,
+    std::enable_if_t<all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>* = nullptr
+  >
+  auto dependent_async(const std::string& name, F&& func, Tasks&&... tasks);
+  
+  /**
+  @brief runs the given function asynchronously 
+         when the given range of dependents finish
+  
+  @tparam F callable type
+  @tparam I iterator type 
+
+  @param func callable object
+  @param first iterator to the beginning (inclusive)
+  @param last iterator to the end (exclusive)
+  
+  @return a pair of a tf::AsyncTask handle and 
+                    a @std_future that holds the result of the execution
+  
+  The example below creates three asynchronous tasks, @c A, @c B, and @c C,
+  in which task @c C runs after task @c A and task @c B.
+  Task @c C returns a pair of its tf::AsyncTask handle and a std::future<int>
+  that eventually will hold the result of the execution.
+
+  @code{.cpp}
+  std::array<tf::AsyncTask, 2> array {
+    executor.silent_dependent_async([](){ printf("A\n"); }),
+    executor.silent_dependent_async([](){ printf("B\n"); })
+  };
+  auto [C, fuC] = executor.dependent_async(
+    [](){ 
+      printf("C runs after A and B\n"); 
+      return 1;
+    }, 
+    array.begin(), array.end()
+  );
+  assert(fuC.get()==1);  // C finishes, which in turn means both A and B finish
+  @endcode
+
+  You can mix the use of tf::AsyncTask handles
+  returned by Executor::dependent_async and Executor::silent_dependent_async
+  when specifying task dependencies.
+
+  This member function is thread-safe.
+  */
+  template <typename F, typename I,
+    std::enable_if_t<!std::is_same_v<std::decay_t<I>, AsyncTask>, void>* = nullptr
+  >
+  auto dependent_async(F&& func, I first, I last);
+  
+  /**
+  @brief names and runs the given function asynchronously 
+         when the given range of dependents finish
+  
+  @tparam F callable type
+  @tparam I iterator type 
+  
+  @param name assigned name to the task
+  @param func callable object
+  @param first iterator to the beginning (inclusive)
+  @param last iterator to the end (exclusive)
+  
+  @return a pair of a tf::AsyncTask handle and 
+                    a @std_future that holds the result of the execution
+  
+  The example below creates three named asynchronous tasks, @c A, @c B, and @c C,
+  in which task @c C runs after task @c A and task @c B.
+  Task @c C returns a pair of its tf::AsyncTask handle and a std::future<int>
+  that eventually will hold the result of the execution.
+  Assigned task names will appear in the observers of the executor.
+
+  @code{.cpp}
+  std::array<tf::AsyncTask, 2> array {
+    executor.silent_dependent_async("A", [](){ printf("A\n"); }),
+    executor.silent_dependent_async("B", [](){ printf("B\n"); })
+  };
+  auto [C, fuC] = executor.dependent_async(
+    "C",
+    [](){ 
+      printf("C runs after A and B\n"); 
+      return 1;
+    }, 
+    array.begin(), array.end()
+  );
+  assert(fuC.get()==1);  // C finishes, which in turn means both A and B finish
+  @endcode
+
+  You can mix the use of tf::AsyncTask handles
+  returned by Executor::dependent_async and Executor::silent_dependent_async
+  when specifying task dependencies.
+
+  This member function is thread-safe.
+  */
+  template <typename F, typename I,
+    std::enable_if_t<!std::is_same_v<std::decay_t<I>, AsyncTask>, void>* = nullptr
+  >
+  auto dependent_async(const std::string& name, F&& func, I first, I last);
+
+  private:
+    
+  const size_t _MAX_STEALS;
+  
+  std::mutex _wsq_mutex;
+  std::mutex _taskflows_mutex;
+
+#ifdef __cpp_lib_atomic_wait
+  std::atomic<size_t> _num_topologies {0};
+  std::atomic_flag _all_spawned = ATOMIC_FLAG_INIT;
+#else
+  std::condition_variable _topology_cv;
+  std::mutex _topology_mutex;
+  size_t _num_topologies {0};
+#endif
+  
+  std::unordered_map<std::thread::id, size_t> _wids;
+  std::vector<std::thread> _threads;
+  std::vector<Worker> _workers;
+  std::list<Taskflow> _taskflows;
+
+  Notifier _notifier;
+
+  TaskQueue<Node*> _wsq;
+
+  std::atomic<bool> _done {false};
+
+  std::unordered_set<std::shared_ptr<ObserverInterface>> _observers;
+
+  Worker* _this_worker();
+  
+  Node* _tear_down_invoke(Worker&, Node*);
+
+  bool _wait_for_task(Worker&, Node*&);
+  bool _invoke_module_task_internal(Worker&, Node*);
+
+  void _observer_prologue(Worker&, Node*);
+  void _observer_epilogue(Worker&, Node*);
+  void _spawn(size_t);
+  void _exploit_task(Worker&, Node*&);
+  void _explore_task(Worker&, Node*&);
+  void _schedule(Worker&, Node*);
+  void _schedule(Node*);
+  void _schedule(Worker&, const SmallVector<Node*>&);
+  void _schedule(const SmallVector<Node*>&);
+  void _set_up_topology(Worker*, Topology*);
+  void _set_up_graph(Graph&, Node*, Topology*, int, SmallVector<Node*>&);
+  void _tear_down_topology(Worker&, Topology*);
+  void _tear_down_async(Node*);
+  void _tear_down_dependent_async(Worker&, Node*);
+  void _increment_topology();
+  void _decrement_topology();
+  void _invoke(Worker&, Node*);
+  void _invoke_static_task(Worker&, Node*);
+  void _invoke_dynamic_task(Worker&, Node*);
+  void _consume_graph(Worker&, Node*, Graph&);
+  void _detach_dynamic_task(Worker&, Node*, Graph&);
+  void _invoke_condition_task(Worker&, Node*, SmallVector<int>&);
+  void _invoke_multi_condition_task(Worker&, Node*, SmallVector<int>&);
+  void _invoke_module_task(Worker&, Node*, bool&);
+  void _invoke_async_task(Worker&, Node*);
+  void _invoke_dependent_async_task(Worker&, Node*);
+  void _process_async_dependent(Node*, tf::AsyncTask&, size_t&);
+  void _process_exception(Worker&, Node*);
+  void _schedule_async_task(Node*);
+  
+  template <typename P>
+  void _corun_until(Worker&, P&&);
+};
+
+#ifdef TF_DISABLE_EXCEPTION_HANDLING
+
+#define TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, code_block) \
+    do { code_block; } while(0)
+#else
+
+#define TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, code_block)  \
+    try {                                            \
+        code_block;                                  \
+    } catch(...) {                                   \
+        _process_exception(worker, node);            \
+    }
+#endif
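+
+// Usage sketch (illustrative, mirroring _invoke_static_task below): the handler
+// wraps the user callable so that any thrown exception is routed to
+// _process_exception instead of escaping the worker loop; `user_callable` is a
+// placeholder for the task's work, not part of the library API.
+//
+//   TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, {
+//     user_callable();  // exceptions thrown here are captured into the topology
+//   });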
+
+
+// Constructor
+inline Executor::Executor(size_t N) :
+  _MAX_STEALS {((N+1) << 1)},
+  _threads    {N},
+  _workers    {N},
+  _notifier   {N} {
+
+  if(N == 0) {
+    TF_THROW("no cpu workers to execute taskflows");
+  }
+
+  _spawn(N);
+
+  // instantiate the default observer if requested
+  if(has_env(TF_ENABLE_PROFILER)) {
+    TFProfManager::get()._manage(make_observer<TFProfObserver>());
+  }
+}
+
+// Destructor
+inline Executor::~Executor() {
+
+  // wait for all topologies to complete
+  wait_for_all();
+
+  // shut down the scheduler
+  _done = true;
+
+  _notifier.notify(true);
+
+  for(auto& t : _threads){
+    t.join();
+  }
+}
+
+// Function: num_workers
+inline size_t Executor::num_workers() const noexcept {
+  return _workers.size();
+}
+
+// Function: num_topologies
+inline size_t Executor::num_topologies() const {
+#ifdef __cpp_lib_atomic_wait
+  return _num_topologies.load(std::memory_order_relaxed);
+#else
+  return _num_topologies;
+#endif
+}
+
+// Function: num_taskflows
+inline size_t Executor::num_taskflows() const {
+  return _taskflows.size();
+}
+
+// Function: _this_worker
+inline Worker* Executor::_this_worker() {
+  auto itr = _wids.find(std::this_thread::get_id());
+  return itr == _wids.end() ? nullptr : &_workers[itr->second];
+}
+
+// Function: this_worker_id
+inline int Executor::this_worker_id() const {
+  auto i = _wids.find(std::this_thread::get_id());
+  return i == _wids.end() ? -1 : static_cast<int>(_workers[i->second]._id);
+}
+
+// Procedure: _spawn
+inline void Executor::_spawn(size_t N) {
+
+#ifdef __cpp_lib_atomic_wait
+#else
+  std::mutex mutex;
+  std::condition_variable cond;
+  size_t n=0;
+#endif
+
+  for(size_t id=0; id<N; ++id) {
+
+    _workers[id]._id = id;
+    _workers[id]._vtm = id;
+    _workers[id]._executor = this;
+    _workers[id]._waiter = &_notifier._waiters[id];
+
+    _threads[id] = std::thread([&, &w=_workers[id]] () {
+
+#ifdef __cpp_lib_atomic_wait
+      // wait for the caller thread to initialize the ID mapping
+      _all_spawned.wait(false, std::memory_order_acquire);
+      w._thread = &_threads[w._id];
+#else
+      // update the ID mapping of this thread
+      w._thread = &_threads[w._id];
+      {
+        std::scoped_lock lock(mutex);
+        _wids[std::this_thread::get_id()] = w._id;
+        if(n++; n == num_workers()) {
+          cond.notify_one();
+        }
+      }
+#endif
+
+      Node* t = nullptr;
+      
+      while(1) {
+
+        // execute the tasks.
+        _exploit_task(w, t);
+
+        // wait for tasks
+        if(_wait_for_task(w, t) == false) {
+          break;
+        }
+      }
+
+    });
+    
+    // POSIX-like system can use the following to affine threads to cores 
+    //cpu_set_t cpuset;
+    //CPU_ZERO(&cpuset);
+    //CPU_SET(id, &cpuset);
+    //pthread_setaffinity_np(
+    //  _threads[id].native_handle(), sizeof(cpu_set_t), &cpuset
+    //);
+
+#ifdef __cpp_lib_atomic_wait
+    //_wids[_threads[id].get_id()] = id;
+    _wids.emplace(std::piecewise_construct,
+      std::forward_as_tuple(_threads[id].get_id()), std::forward_as_tuple(id)
+    );
+#endif
+  }
+  
+#ifdef __cpp_lib_atomic_wait
+  _all_spawned.test_and_set(std::memory_order_release);
+  _all_spawned.notify_all();
+#else
+  std::unique_lock<std::mutex> lock(mutex);
+  cond.wait(lock, [&](){ return n==N; });
+#endif
+}
+
+// Function: _corun_until
+template <typename P>
+void Executor::_corun_until(Worker& w, P&& stop_predicate) {
+  
+  std::uniform_int_distribution<size_t> rdvtm(0, _workers.size()-1);
+
+  exploit:
+
+  while(!stop_predicate()) {
+
+    //exploit:
+
+    if(auto t = w._wsq.pop(); t) {
+      _invoke(w, t);
+    }
+    else {
+      size_t num_steals = 0;
+
+      explore:
+
+      t = (w._id == w._vtm) ? _wsq.steal() : _workers[w._vtm]._wsq.steal();
+
+      if(t) {
+        _invoke(w, t);
+        goto exploit;
+      }
+      else if(!stop_predicate()) {
+        if(num_steals++ > _MAX_STEALS) {
+          std::this_thread::yield();
+        }
+        w._vtm = rdvtm(w._rdgen);
+        goto explore;
+      }
+      else {
+        break;
+      }
+    }
+  }
+}
+
+// Function: _explore_task
+inline void Executor::_explore_task(Worker& w, Node*& t) {
+
+  //assert(_workers[w].wsq.empty());
+  //assert(!t);
+
+  size_t num_steals = 0;
+  size_t num_yields = 0;
+
+  std::uniform_int_distribution<size_t> rdvtm(0, _workers.size()-1);
+  
+  // Here, we use a do-while loop so that the worker immediately steals once
+  // from the assigned victim.
+  do {
+    t = (w._id == w._vtm) ? _wsq.steal() : _workers[w._vtm]._wsq.steal();
+
+    if(t) {
+      break;
+    }
+
+    if(num_steals++ > _MAX_STEALS) {
+      std::this_thread::yield();
+      if(num_yields++ > 100) {
+        break;
+      }
+    }
+
+    w._vtm = rdvtm(w._rdgen);
+  } while(!_done);
+
+}
+
+// Procedure: _exploit_task
+inline void Executor::_exploit_task(Worker& w, Node*& t) {
+  while(t) {
+    _invoke(w, t);
+    t = w._wsq.pop();
+  }
+}
+
+// Function: _wait_for_task
+inline bool Executor::_wait_for_task(Worker& worker, Node*& t) {
+
+  explore_task:
+
+  _explore_task(worker, t);
+  
+  // The last thief who successfully stole a task will wake up
+  // another thief worker to avoid starvation.
+  if(t) {
+    _notifier.notify(false);
+    return true;
+  }
+
+  // ---- 2PC guard ----
+  _notifier.prepare_wait(worker._waiter);
+
+  if(!_wsq.empty()) {
+    _notifier.cancel_wait(worker._waiter);
+    worker._vtm = worker._id;
+    goto explore_task;
+  }
+  
+  if(_done) {
+    _notifier.cancel_wait(worker._waiter);
+    _notifier.notify(true);
+    return false;
+  }
+  
+  // We need to use index-based scanning to avoid a data race with _spawn,
+  // which may initialize a worker at the same time.
+  for(size_t vtm=0; vtm<_workers.size(); vtm++) {
+    if(!_workers[vtm]._wsq.empty()) {
+      _notifier.cancel_wait(worker._waiter);
+      worker._vtm = vtm;
+      goto explore_task;
+    }
+  }
+  
+  // Now I really need to relinquish myself to others
+  _notifier.commit_wait(worker._waiter);
+
+  goto explore_task;
+}
+
+// Function: make_observer
+template<typename Observer, typename... ArgsT>
+std::shared_ptr<Observer> Executor::make_observer(ArgsT&&... args) {
+
+  static_assert(
+    std::is_base_of_v<ObserverInterface, Observer>,
+    "Observer must be derived from ObserverInterface"
+  );
+
+  // use a local variable to mimic the constructor
+  auto ptr = std::make_shared<Observer>(std::forward<ArgsT>(args)...);
+
+  ptr->set_up(_workers.size());
+
+  _observers.emplace(std::static_pointer_cast<ObserverInterface>(ptr));
+
+  return ptr;
+}
+
+// Procedure: remove_observer
+template <typename Observer>
+void Executor::remove_observer(std::shared_ptr<Observer> ptr) {
+
+  static_assert(
+    std::is_base_of_v<ObserverInterface, Observer>,
+    "Observer must be derived from ObserverInterface"
+  );
+
+  _observers.erase(std::static_pointer_cast<ObserverInterface>(ptr));
+}
+
+// Function: num_observers
+inline size_t Executor::num_observers() const noexcept {
+  return _observers.size();
+}
+
+// Procedure: _schedule
+inline void Executor::_schedule(Worker& worker, Node* node) {
+  
+  // We need to fetch p before the release so that the read operation is
+  // properly synchronized with other threads to avoid a data race.
+  auto p = node->_priority;
+
+  node->_state.fetch_or(Node::READY, std::memory_order_release);
+
+  // caller is a worker of this pool - starting at v3.5 we do not use
+  // any complicated notification mechanism, as the experimental results
+  // have shown no significant advantage.
+  if(worker._executor == this) {
+    worker._wsq.push(node, p);
+    _notifier.notify(false);
+    return;
+  }
+
+  {
+    std::lock_guard<std::mutex> lock(_wsq_mutex);
+    _wsq.push(node, p);
+  }
+
+  _notifier.notify(false);
+}
+
+// Procedure: _schedule
+inline void Executor::_schedule(Node* node) {
+  
+  // We need to fetch p before the release so that the read operation is
+  // properly synchronized with other threads to avoid a data race.
+  auto p = node->_priority;
+
+  node->_state.fetch_or(Node::READY, std::memory_order_release);
+
+  {
+    std::lock_guard<std::mutex> lock(_wsq_mutex);
+    _wsq.push(node, p);
+  }
+
+  _notifier.notify(false);
+}
+
+// Procedure: _schedule
+inline void Executor::_schedule(Worker& worker, const SmallVector<Node*>& nodes) {
+
+  // We need to cache the node count to avoid accessing the nodes
+  // vector after the parent topology is removed!
+  const auto num_nodes = nodes.size();
+
+  if(num_nodes == 0) {
+    return;
+  }
+
+  // caller is a worker of this pool - starting at v3.5 we do not use
+  // any complicated notification mechanism, as the experimental results
+  // have shown no significant advantage.
+  if(worker._executor == this) {
+    for(size_t i=0; i<num_nodes; ++i) {
+      // We need to fetch p before the release so that the read operation is
+      // properly synchronized with other threads to avoid a data race.
+      auto p = nodes[i]->_priority;
+      nodes[i]->_state.fetch_or(Node::READY, std::memory_order_release);
+      worker._wsq.push(nodes[i], p);
+      _notifier.notify(false);
+    }
+    return;
+  }
+
+  {
+    std::lock_guard<std::mutex> lock(_wsq_mutex);
+    for(size_t k=0; k<num_nodes; ++k) {
+      auto p = nodes[k]->_priority;
+      nodes[k]->_state.fetch_or(Node::READY, std::memory_order_release);
+      _wsq.push(nodes[k], p);
+    }
+  }
+
+  _notifier.notify_n(num_nodes);
+}
+
+// Procedure: _schedule
+inline void Executor::_schedule(const SmallVector<Node*>& nodes) {
+
+  // parent topology may be removed!
+  const auto num_nodes = nodes.size();
+
+  if(num_nodes == 0) {
+    return;
+  }
+
+  // We need to fetch p before the release so that the read operation is
+  // properly synchronized with other threads to avoid a data race.
+  {
+    std::lock_guard<std::mutex> lock(_wsq_mutex);
+    for(size_t k=0; k<num_nodes; ++k) {
+      auto p = nodes[k]->_priority;
+      nodes[k]->_state.fetch_or(Node::READY, std::memory_order_release);
+      _wsq.push(nodes[k], p);
+    }
+  }
+
+  _notifier.notify_n(num_nodes);
+}
+
+// Procedure: _invoke
+inline void Executor::_invoke(Worker& worker, Node* node) {
+
+  // synchronize all outstanding memory operations caused by reordering
+  while(!(node->_state.load(std::memory_order_acquire) & Node::READY));
+
+  begin_invoke:
+  
+  SmallVector<int> conds;
+
+  // no need to do other things if the topology is cancelled
+  if(node->_is_cancelled()) {
+    if(node = _tear_down_invoke(worker, node); node) {
+      goto invoke_successors;
+    }
+    return;
+  }
+
+  // if acquiring semaphore(s) exists, acquire them first
+  if(node->_semaphores && !node->_semaphores->to_acquire.empty()) {
+    SmallVector<Node*> nodes;
+    if(!node->_acquire_all(nodes)) {
+      _schedule(worker, nodes);
+      return;
+    }
+    node->_state.fetch_or(Node::ACQUIRED, std::memory_order_release);
+  }
+
+  // condition task
+  //int cond = -1;
+
+  // switch is faster than nested if-else due to jump table
+  switch(node->_handle.index()) {
+    // static task
+    case Node::STATIC:{
+      _invoke_static_task(worker, node);
+    }
+    break;
+
+    // dynamic task
+    case Node::DYNAMIC: {
+      _invoke_dynamic_task(worker, node);
+    }
+    break;
+
+    // condition task
+    case Node::CONDITION: {
+      _invoke_condition_task(worker, node, conds);
+    }
+    break;
+
+    // multi-condition task
+    case Node::MULTI_CONDITION: {
+      _invoke_multi_condition_task(worker, node, conds);
+    }
+    break;
+
+    // module task
+    case Node::MODULE: {
+      bool spawned;
+      _invoke_module_task(worker, node, spawned);
+      if(spawned) {
+        return;
+      }
+    }
+    break;
+
+    // async task
+    case Node::ASYNC: {
+      _invoke_async_task(worker, node);
+      _tear_down_async(node);
+      return;
+    }
+    break;
+
+    // dependent async task
+    case Node::DEPENDENT_ASYNC: {
+      _invoke_dependent_async_task(worker, node);
+      _tear_down_dependent_async(worker, node);
+      if(worker._cache) {
+        node = worker._cache;
+        goto begin_invoke;
+      }
+      return;
+    }
+    break;
+
+    // monostate (placeholder)
+    default:
+    break;
+  }
+
+  invoke_successors:
+
+  // if releasing semaphores exist, release them
+  if(node->_semaphores && !node->_semaphores->to_release.empty()) {
+    _schedule(worker, node->_release_all());
+  }
+  
+  // Reset the join counter to support the cyclic control flow.
+  // + We must do this before scheduling the successors to avoid a race
+  //   condition on _dependents.
+  // + We must use fetch_add instead of a direct assignment
+  //   because the user-space call on "invoke" may explicitly schedule
+  //   this task again (e.g., pipeline), which can access the join_counter.
+  if((node->_state.load(std::memory_order_relaxed) & Node::CONDITIONED)) {
+    node->_join_counter.fetch_add(node->num_strong_dependents(), std::memory_order_relaxed);
+  }
+  else {
+    node->_join_counter.fetch_add(node->num_dependents(), std::memory_order_relaxed);
+  }
+
+  // acquire the parent flow counter
+  auto& j = (node->_parent) ? node->_parent->_join_counter :
+                              node->_topology->_join_counter;
+
+  // Here, we want to cache the latest successor with the highest priority
+  worker._cache = nullptr;
+  auto max_p = static_cast<unsigned>(TaskPriority::MAX);
+
+  // Schedule the successors based on the corresponding task type
+  switch(node->_handle.index()) {
+
+    // condition and multi-condition tasks
+    case Node::CONDITION:
+    case Node::MULTI_CONDITION: {
+      for(auto cond : conds) {
+        if(cond >= 0 && static_cast<size_t>(cond) < node->_successors.size()) {
+          auto s = node->_successors[cond];
+          // zeroing the join counter for invariant
+          s->_join_counter.store(0, std::memory_order_relaxed);
+          j.fetch_add(1, std::memory_order_relaxed);
+          if(s->_priority <= max_p) {
+            if(worker._cache) {
+              _schedule(worker, worker._cache);
+            }
+            worker._cache = s;
+            max_p = s->_priority;
+          }
+          else {
+            _schedule(worker, s);
+          }
+        }
+      }
+    }
+    break;
+
+    // non-condition task
+    default: {
+      for(size_t i=0; i<node->_successors.size(); ++i) {
+        //if(auto s = node->_successors[i]; --(s->_join_counter) == 0) {
+        if(auto s = node->_successors[i]; 
+          s->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1) {
+          j.fetch_add(1, std::memory_order_relaxed);
+          if(s->_priority <= max_p) {
+            if(worker._cache) {
+              _schedule(worker, worker._cache);
+            }
+            worker._cache = s;
+            max_p = s->_priority;
+          }
+          else {
+            _schedule(worker, s);
+          }
+        }
+      }
+    }
+    break;
+  }
+
+  // tear_down the invoke
+  if(node = _tear_down_invoke(worker, node); node) {
+    goto invoke_successors;
+  }
+
+  // perform tail recursion elimination for the right-most child to reduce
+  // the number of expensive pop/push operations through the task queue
+  if(worker._cache) {
+    node = worker._cache;
+    //node->_state.fetch_or(Node::READY, std::memory_order_release);
+    goto begin_invoke;
+  }
+}
+
+// Procedure: _tear_down_invoke
+inline Node* Executor::_tear_down_invoke(Worker& worker, Node* node) {
+  // we must check the parent first before subtracting the join counter,
+  // or it can introduce a data race
+  if(auto parent = node->_parent; parent == nullptr) {
+    if(node->_topology->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1) {
+      _tear_down_topology(worker, node->_topology);
+    }
+  }
+  // module task
+  else {  
+    auto id = parent->_handle.index();
+    if(parent->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1) {
+      if(id == Node::MODULE) {
+        return parent;
+      }
+    }
+  }
+  return nullptr;
+}
+
+// Procedure: _observer_prologue
+inline void Executor::_observer_prologue(Worker& worker, Node* node) {
+  for(auto& observer : _observers) {
+    observer->on_entry(WorkerView(worker), TaskView(*node));
+  }
+}
+
+// Procedure: _observer_epilogue
+inline void Executor::_observer_epilogue(Worker& worker, Node* node) {
+  for(auto& observer : _observers) {
+    observer->on_exit(WorkerView(worker), TaskView(*node));
+  }
+}
+
+// Procedure: _process_exception
+inline void Executor::_process_exception(Worker&, Node* node) {
+
+  constexpr static auto flag = Topology::EXCEPTION | Topology::CANCELLED;
+
+  // multiple tasks may throw, so we only take the first thrown exception
+  if(auto tpg = node->_topology; tpg && 
+    ((tpg->_state.fetch_or(flag, std::memory_order_relaxed) & Topology::EXCEPTION) == 0)
+  ) {
+    tpg->_exception = std::current_exception();
+  }
+  // TODO: skip the exception that is not associated with any taskflows
+}
+
+// Procedure: _invoke_static_task
+inline void Executor::_invoke_static_task(Worker& worker, Node* node) {
+  _observer_prologue(worker, node);
+  TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, {
+    auto& work = std::get_if<Node::Static>(&node->_handle)->work;
+    switch(work.index()) {
+      case 0:
+        std::get_if<0>(&work)->operator()();
+      break;
+
+      case 1:
+        Runtime rt(*this, worker, node);
+        std::get_if<1>(&work)->operator()(rt);
+      break;
+    }
+  });
+  _observer_epilogue(worker, node);
+}
+
+// Procedure: _invoke_dynamic_task
+inline void Executor::_invoke_dynamic_task(Worker& w, Node* node) {
+
+  _observer_prologue(w, node);
+
+  auto handle = std::get_if<Node::Dynamic>(&node->_handle);
+
+  handle->subgraph._clear();
+
+  Subflow sf(*this, w, node, handle->subgraph);
+
+  TF_EXECUTOR_EXCEPTION_HANDLER(w, node, {
+    handle->work(sf);
+  });
+
+  if(sf._joinable) {
+    _consume_graph(w, node, handle->subgraph);
+  }
+
+  _observer_epilogue(w, node);
+}
+
+// Procedure: _detach_dynamic_task
+inline void Executor::_detach_dynamic_task(Worker& w, Node* p, Graph& g) {
+
+  // graph is empty and has no async tasks
+  if(g.empty() && p->_join_counter.load(std::memory_order_acquire) == 0) {
+    return;
+  }
+
+  SmallVector<Node*> src;
+  _set_up_graph(g, nullptr, p->_topology, Node::DETACHED, src);
+
+  {
+    std::lock_guard<std::mutex> lock(p->_topology->_taskflow._mutex);
+    p->_topology->_taskflow._graph._merge(std::move(g));
+  }
+
+  p->_topology->_join_counter.fetch_add(src.size(), std::memory_order_relaxed);
+  _schedule(w, src);
+}
+
+// Procedure: _consume_graph
+inline void Executor::_consume_graph(Worker& w, Node* p, Graph& g) {
+
+  // graph is empty and has no async tasks (subflow)
+  if(g.empty() && p->_join_counter.load(std::memory_order_acquire) == 0) {
+    return;
+  }
+
+  SmallVector<Node*> src;
+
+  _set_up_graph(g, p, p->_topology, 0, src);
+  p->_join_counter.fetch_add(src.size(), std::memory_order_relaxed);
+  
+  _schedule(w, src);
+
+  _corun_until(w, [p] () -> bool { 
+    return p->_join_counter.load(std::memory_order_acquire) == 0; }
+  );
+}
+
+// Procedure: _invoke_condition_task
+inline void Executor::_invoke_condition_task(
+  Worker& worker, Node* node, SmallVector<int>& conds
+) {
+  _observer_prologue(worker, node);
+  TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, {
+    auto& work = std::get_if<Node::Condition>(&node->_handle)->work;
+    switch(work.index()) {
+      case 0:
+        conds = { std::get_if<0>(&work)->operator()() };
+      break;
+
+      case 1:
+        Runtime rt(*this, worker, node);
+        conds = { std::get_if<1>(&work)->operator()(rt) };
+      break;
+    }
+  });
+  _observer_epilogue(worker, node);
+}
+
+// Procedure: _invoke_multi_condition_task
+inline void Executor::_invoke_multi_condition_task(
+  Worker& worker, Node* node, SmallVector<int>& conds
+) {
+  _observer_prologue(worker, node);
+  TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, {
+    auto& work = std::get_if<Node::MultiCondition>(&node->_handle)->work;
+    switch(work.index()) {
+      case 0:
+        conds = std::get_if<0>(&work)->operator()();
+      break;
+
+      case 1:
+        Runtime rt(*this, worker, node);
+        conds = std::get_if<1>(&work)->operator()(rt);
+      break;
+    }
+  });
+  _observer_epilogue(worker, node);
+}
+
+// Procedure: _invoke_module_task
+inline void Executor::_invoke_module_task(Worker& w, Node* node, bool& spawned) {
+  _observer_prologue(w, node);
+  spawned = _invoke_module_task_internal(w, node);
+  _observer_epilogue(w, node);
+}
+
+// Function: _invoke_module_task_internal
+inline bool Executor::_invoke_module_task_internal(Worker& w, Node* p) {
+  
+  // acquire the underlying graph
+  auto& g = std::get_if<Node::Module>(&p->_handle)->graph;
+
+  // no need to do anything if the graph is empty
+  if(g.empty()) {
+    return false;
+  }
+
+  SmallVector<Node*> src;
+  _set_up_graph(g, p, p->_topology, 0, src);
+  p->_join_counter.fetch_add(src.size(), std::memory_order_relaxed);
+
+  _schedule(w, src);
+  return true;
+}
+
+// Procedure: _invoke_async_task
+inline void Executor::_invoke_async_task(Worker& worker, Node* node) {
+  _observer_prologue(worker, node);
+  TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, {
+    auto& work = std::get_if<Node::Async>(&node->_handle)->work;
+    switch(work.index()) {
+      case 0:
+        std::get_if<0>(&work)->operator()();
+      break;
+
+      case 1:
+        Runtime rt(*this, worker, node);
+        std::get_if<1>(&work)->operator()(rt);
+      break;
+    }
+  });
+  _observer_epilogue(worker, node);
+}
+
+// Procedure: _invoke_dependent_async_task
+inline void Executor::_invoke_dependent_async_task(Worker& worker, Node* node) {
+  _observer_prologue(worker, node);
+  TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, {
+    auto& work = std::get_if<Node::DependentAsync>(&node->_handle)->work;
+    switch(work.index()) {
+      case 0:
+        std::get_if<0>(&work)->operator()();
+      break;
+
+      case 1:
+        Runtime rt(*this, worker, node);
+        std::get_if<1>(&work)->operator()(rt);
+      break;
+    }
+  });
+  _observer_epilogue(worker, node);
+}
+
+// Function: run
+inline tf::Future<void> Executor::run(Taskflow& f) {
+  return run_n(f, 1, [](){});
+}
+
+// Function: run
+inline tf::Future<void> Executor::run(Taskflow&& f) {
+  return run_n(std::move(f), 1, [](){});
+}
+
+// Function: run
+template <typename C>
+tf::Future<void> Executor::run(Taskflow& f, C&& c) {
+  return run_n(f, 1, std::forward<C>(c));
+}
+
+// Function: run
+template <typename C>
+tf::Future<void> Executor::run(Taskflow&& f, C&& c) {
+  return run_n(std::move(f), 1, std::forward<C>(c));
+}
+
+// Function: run_n
+inline tf::Future<void> Executor::run_n(Taskflow& f, size_t repeat) {
+  return run_n(f, repeat, [](){});
+}
+
+// Function: run_n
+inline tf::Future<void> Executor::run_n(Taskflow&& f, size_t repeat) {
+  return run_n(std::move(f), repeat, [](){});
+}
+
+// Function: run_n
+template <typename C>
+tf::Future<void> Executor::run_n(Taskflow& f, size_t repeat, C&& c) {
+  return run_until(
+    f, [repeat]() mutable { return repeat-- == 0; }, std::forward<C>(c)
+  );
+}
+
+// Function: run_n
+template <typename C>
+tf::Future<void> Executor::run_n(Taskflow&& f, size_t repeat, C&& c) {
+  return run_until(
+    std::move(f), [repeat]() mutable { return repeat-- == 0; }, std::forward<C>(c)
+  );
+}
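+
+// Usage sketch (illustrative): run the same taskflow three times and invoke the
+// callback after the final run completes; the callback runs on a worker thread:
+//
+//   executor.run_n(taskflow, 3, [](){ printf("3 runs done\n"); }).wait();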
+
+// Function: run_until
+template<typename P>
+tf::Future<void> Executor::run_until(Taskflow& f, P&& pred) {
+  return run_until(f, std::forward<P>(pred), [](){});
+}
+
+// Function: run_until
+template<typename P>
+tf::Future<void> Executor::run_until(Taskflow&& f, P&& pred) {
+  return run_until(std::move(f), std::forward<P>(pred), [](){});
+}
+
+// Function: run_until
+template <typename P, typename C>
+tf::Future<void> Executor::run_until(Taskflow& f, P&& p, C&& c) {
+
+  _increment_topology();
+
+  // Need to check emptiness under the lock since a dynamic task may
+  // define detached blocks that modify the taskflow at the same time
+  bool empty;
+  {
+    std::lock_guard<std::mutex> lock(f._mutex);
+    empty = f.empty();
+  }
+
+  // No need to create a real topology; just return a dummy future
+  if(empty || p()) {
+    c();
+    std::promise<void> promise;
+    promise.set_value();
+    _decrement_topology();
+    return tf::Future<void>(promise.get_future());
+  }
+
+  // create a topology for this run
+  auto t = std::make_shared<Topology>(f, std::forward<P>(p), std::forward<C>(c));
+
+  // need to create the future before the topology gets torn down quickly
+  tf::Future<void> future(t->_promise.get_future(), t);
+
+  // modifying topology needs to be protected under the lock
+  {
+    std::lock_guard<std::mutex> lock(f._mutex);
+    f._topologies.push(t);
+    if(f._topologies.size() == 1) {
+      _set_up_topology(_this_worker(), t.get());
+    }
+  }
+
+  return future;
+}
+
+// Function: run_until
+template <typename P, typename C>
+tf::Future<void> Executor::run_until(Taskflow&& f, P&& pred, C&& c) {
+
+  std::list<Taskflow>::iterator itr;
+
+  {
+    std::scoped_lock<std::mutex> lock(_taskflows_mutex);
+    itr = _taskflows.emplace(_taskflows.end(), std::move(f));
+    itr->_satellite = itr;
+  }
+
+  return run_until(*itr, std::forward<P>(pred), std::forward<C>(c));
+}
+
+// Function: corun
+template <typename T>
+void Executor::corun(T& target) {
+  
+  auto w = _this_worker();
+
+  if(w == nullptr) {
+    TF_THROW("corun must be called by a worker of the executor");
+  }
+
+  Node parent;  // dummy parent
+  _consume_graph(*w, &parent, target.graph());
+}
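+
+// Usage sketch (illustrative): a task running on this executor may co-run
+// another target without blocking its worker thread; `other_taskflow` stands
+// for any object that exposes tf::Graph& graph() and is assumed here:
+//
+//   taskflow.emplace([&](){ executor.corun(other_taskflow); });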
+
+// Function: corun_until
+template <typename P>
+void Executor::corun_until(P&& predicate) {
+  
+  auto w = _this_worker();
+
+  if(w == nullptr) {
+    TF_THROW("corun_until must be called by a worker of the executor");
+  }
+
+  _corun_until(*w, std::forward<P>(predicate));
+}
+
+// Procedure: _increment_topology
+inline void Executor::_increment_topology() {
+#ifdef __cpp_lib_atomic_wait
+  _num_topologies.fetch_add(1, std::memory_order_relaxed);
+#else
+  std::lock_guard<std::mutex> lock(_topology_mutex);
+  ++_num_topologies;
+#endif
+}
+
+// Procedure: _decrement_topology
+inline void Executor::_decrement_topology() {
+#ifdef __cpp_lib_atomic_wait
+  if(_num_topologies.fetch_sub(1, std::memory_order_acq_rel) == 1) {
+    _num_topologies.notify_all();
+  }
+#else
+  std::lock_guard<std::mutex> lock(_topology_mutex);
+  if(--_num_topologies == 0) {
+    _topology_cv.notify_all();
+  }
+#endif
+}
+
+// Procedure: wait_for_all
+inline void Executor::wait_for_all() {
+#ifdef __cpp_lib_atomic_wait
+  size_t n = _num_topologies.load(std::memory_order_acquire);
+  while(n != 0) {
+    _num_topologies.wait(n, std::memory_order_acquire);
+    n = _num_topologies.load(std::memory_order_acquire);
+  }
+#else
+  std::unique_lock<std::mutex> lock(_topology_mutex);
+  _topology_cv.wait(lock, [&](){ return _num_topologies == 0; });
+#endif
+}
+
+// Function: _set_up_topology
+inline void Executor::_set_up_topology(Worker* worker, Topology* tpg) {
+
+  // ---- under taskflow lock ----
+
+  tpg->_sources.clear();
+  tpg->_taskflow._graph._clear_detached();
+  _set_up_graph(tpg->_taskflow._graph, nullptr, tpg, 0, tpg->_sources);
+  tpg->_join_counter.store(tpg->_sources.size(), std::memory_order_relaxed);
+
+  if(worker) {
+    _schedule(*worker, tpg->_sources);
+  }
+  else {
+    _schedule(tpg->_sources);
+  }
+}
+
+// Function: _set_up_graph
+inline void Executor::_set_up_graph(
+  Graph& g, Node* parent, Topology* tpg, int state, SmallVector<Node*>& src
+) {
+  for(auto node : g._nodes) {
+    node->_topology = tpg;
+    node->_parent = parent;
+    node->_state.store(state, std::memory_order_relaxed);
+    if(node->num_dependents() == 0) {
+      src.push_back(node);
+    }
+    node->_set_up_join_counter();
+  }
+}
+
+// Function: _tear_down_topology
+inline void Executor::_tear_down_topology(Worker& worker, Topology* tpg) {
+
+  auto &f = tpg->_taskflow;
+
+  //assert(&tpg == &(f._topologies.front()));
+
+  // case 1: we still need to run the topology again
+  if(!tpg->_exception && 
+     !(tpg->_state.load(std::memory_order_relaxed) & Topology::CANCELLED) && 
+     !tpg->_pred()
+  ) {
+    //assert(tpg->_join_counter == 0);
+    std::lock_guard<std::mutex> lock(f._mutex);
+    tpg->_join_counter.store(tpg->_sources.size(), std::memory_order_relaxed);
+    _schedule(worker, tpg->_sources);
+  }
+  // case 2: the final run of this topology
+  else {
+
+    // TODO: if the topology is cancelled, need to release all semaphores
+    if(tpg->_call != nullptr) {
+      tpg->_call();
+    }
+
+    // If there is another run (interleave between lock)
+    if(std::unique_lock<std::mutex> lock(f._mutex); f._topologies.size()>1) {
+      //assert(tpg->_join_counter == 0);
+
+      // Set the promise
+      tpg->_promise.set_value();
+      f._topologies.pop();
+      tpg = f._topologies.front().get();
+
+      // decrement the topology count; since this is not the last one, we don't notify
+      _decrement_topology();
+
+      // setting up the topology needs to be done under the lock, or it can
+      // introduce a memory-order error with pop
+      _set_up_topology(&worker, tpg);
+    }
+    else {
+      //assert(f._topologies.size() == 1);
+
+      auto fetched_tpg {std::move(f._topologies.front())};
+      f._topologies.pop();
+      auto satellite {f._satellite};
+
+      lock.unlock();
+      
+      // Soon after we carry out the promise, there is no longer any guarantee
+      // for the lifetime of the associated taskflow.
+      fetched_tpg->_carry_out_promise();
+
+      _decrement_topology();
+
+      // remove the taskflow if it is managed by the executor
+      // TODO: in the future, we may need to synchronize on wait
+      // (which means the following code should be moved before set_value)
+      if(satellite) {
+        std::scoped_lock<std::mutex> satellite_lock(_taskflows_mutex);
+        _taskflows.erase(*satellite);
+      }
+    }
+  }
+}
+
+// ############################################################################
+// Forward Declaration: Subflow
+// ############################################################################
+
+inline void Subflow::join() {
+
+  // assert(this_worker().worker == &_worker);
+
+  if(!_joinable) {
+    TF_THROW("subflow not joinable");
+  }
+
+  // only the parent worker can join the subflow
+  _executor._consume_graph(_worker, _parent, _graph);
+  _joinable = false;
+}
+
+inline void Subflow::detach() {
+
+  // assert(this_worker().worker == &_worker);
+
+  if(!_joinable) {
+    TF_THROW("subflow already joined or detached");
+  }
+
+  // only the parent worker can detach the subflow
+  _executor._detach_dynamic_task(_worker, _parent, _graph);
+  _joinable = false;
+}
+
+// ############################################################################
+// Forward Declaration: Runtime
+// ############################################################################
+
+// Procedure: schedule
+inline void Runtime::schedule(Task task) {
+  
+  auto node = task._node;
+  // need to keep the invariant: when scheduling a task, the task must have
+  // zero dependency (join counter is 0)
+  // or we can encounter a bug when inserting a nested flow (e.g., module task)
+  node->_join_counter.store(0, std::memory_order_relaxed);
+
+  auto& j = node->_parent ? node->_parent->_join_counter :
+                            node->_topology->_join_counter;
+  j.fetch_add(1, std::memory_order_relaxed);
+  _executor._schedule(_worker, node);
+}
+
+// Procedure: corun
+template <typename T>
+void Runtime::corun(T&& target) {
+
+  // dynamic task (subflow)
+  if constexpr(is_dynamic_task_v<T>) {
+    Graph graph;
+    Subflow sf(_executor, _worker, _parent, graph);
+    target(sf);
+    if(sf._joinable) {
+      _executor._consume_graph(_worker, _parent, graph);
+    }
+  }
+  // a composable graph object with `tf::Graph& T::graph()` defined
+  else {
+    _executor._consume_graph(_worker, _parent, target.graph());
+  }
+}
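+
+// Usage sketch (illustrative): inside a task that receives a tf::Runtime& rt,
+// both dispatch forms handled above are valid; `other_taskflow` stands for any
+// object exposing tf::Graph& graph() and is assumed for this sketch:
+//
+//   rt.corun([](tf::Subflow& sf){ sf.emplace([](){}); });  // dynamic-task form
+//   rt.corun(other_taskflow);                              // composable-graph form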
+
+// Procedure: corun_until
+template <typename P>
+void Runtime::corun_until(P&& predicate) {
+  _executor._corun_until(_worker, std::forward<P>(predicate));
+}
+
+// Function: _silent_async
+template <typename F>
+void Runtime::_silent_async(Worker& w, const std::string& name, F&& f) {
+
+  _parent->_join_counter.fetch_add(1, std::memory_order_relaxed);
+
+  auto node = node_pool.animate(
+    name, 0, _parent->_topology, _parent, 0,
+    std::in_place_type_t<Node::Async>{}, std::forward<F>(f)
+  );
+
+  _executor._schedule(w, node);
+}
+
+// Function: silent_async
+template <typename F>
+void Runtime::silent_async(F&& f) {
+  _silent_async(*_executor._this_worker(), "", std::forward<F>(f));
+}
+
+// Function: silent_async
+template <typename F>
+void Runtime::silent_async(const std::string& name, F&& f) {
+  _silent_async(*_executor._this_worker(), name, std::forward<F>(f));
+}
+
+// Function: silent_async_unchecked
+template <typename F>
+void Runtime::silent_async_unchecked(const std::string& name, F&& f) {
+  _silent_async(_worker, name, std::forward<F>(f));
+}
+
+// Function: _async
+template <typename F>
+auto Runtime::_async(Worker& w, const std::string& name, F&& f) {
+
+  _parent->_join_counter.fetch_add(1, std::memory_order_relaxed);
+
+  using R = std::invoke_result_t<std::decay_t<F>>;
+
+  std::packaged_task<R()> p(std::forward<F>(f));
+  auto fu{p.get_future()};
+
+  auto node = node_pool.animate(
+    name, 0, _parent->_topology, _parent, 0, std::in_place_type_t<Node::Async>{},
+    [p=make_moc(std::move(p))] () mutable { p.object(); }
+  );
+
+  _executor._schedule(w, node);
+
+  return fu;
+}
+
+// Function: async
+template <typename F>
+auto Runtime::async(F&& f) {
+  return _async(*_executor._this_worker(), "", std::forward<F>(f));
+}
+
+// Function: async
+template <typename F>
+auto Runtime::async(const std::string& name, F&& f) {
+  return _async(*_executor._this_worker(), name, std::forward<F>(f));
+}
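+
+// Usage sketch (illustrative): from within a runtime task, asynchronous work can
+// be spawned through the functions above and joined with corun_all below:
+//
+//   rt.silent_async([](){ printf("fire and forget\n"); });
+//   auto fu = rt.async([](){ return 42; });
+//   rt.corun_all();   // co-runs until all spawned asynchronous tasks complete
+//   assert(fu.get() == 42);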
+
+// Function: corun_all
+inline void Runtime::corun_all() {
+  corun_until([this] () -> bool { 
+    return _parent->_join_counter.load(std::memory_order_acquire) == 0; 
+  });
+}
+
+// Destructor
+inline Runtime::~Runtime() {
+  if(_parent->_join_counter.load(std::memory_order_acquire)) {
+    corun_all();
+  }
+}
+
+}  // end of namespace tf -----------------------------------------------------
+
+
+
+
+
+
diff --git a/myxpcs/include/taskflow_/core/flow_builder.hpp b/myxpcs/include/taskflow_/core/flow_builder.hpp
new file mode 100644
index 0000000..f4259dc
--- /dev/null
+++ b/myxpcs/include/taskflow_/core/flow_builder.hpp
@@ -0,0 +1,1399 @@
+#pragma once
+
+#include "task.hpp"
+#include "../algorithm/partitioner.hpp"
+
+/**
+@file flow_builder.hpp
+@brief flow builder include file
+*/
+
+namespace tf {
+
+/**
+@class FlowBuilder
+
+@brief class to build a task dependency graph
+
+The class provides essential methods to construct a task dependency graph
+from which tf::Taskflow and tf::Subflow are derived.
+
+*/
+class FlowBuilder {
+
+  friend class Executor;
+
+  public:
+
+  /**
+  @brief constructs a flow builder with a graph
+  */
+  FlowBuilder(Graph& graph);
+
+  /**
+  @brief creates a static task
+
+  @tparam C callable type constructible from std::function<void()>
+
+  @param callable callable to construct a static task
+
+  @return a tf::Task handle
+
+  The following example creates a static task.
+
+  @code{.cpp}
+  tf::Task static_task = taskflow.emplace([](){});
+  @endcode
+
+  Please refer to @ref StaticTasking for details.
+  */
+  template <typename C,
+    std::enable_if_t<is_static_task_v<C>, void>* = nullptr
+  >
+  Task emplace(C&& callable);
+
+  /**
+  @brief creates a dynamic task
+
+  @tparam C callable type constructible from std::function<void(tf::Subflow&)>
+
+  @param callable callable to construct a dynamic task
+
+  @return a tf::Task handle
+
+  The following example creates a dynamic task (tf::Subflow)
+  that spawns two static tasks.
+
+  @code{.cpp}
+  tf::Task dynamic_task = taskflow.emplace([](tf::Subflow& sf){
+    tf::Task static_task1 = sf.emplace([](){});
+    tf::Task static_task2 = sf.emplace([](){});
+  });
+  @endcode
+
+  Please refer to @ref DynamicTasking for details.
+  */
+  template <typename C,
+    std::enable_if_t<is_dynamic_task_v<C>, void>* = nullptr
+  >
+  Task emplace(C&& callable);
+
+  /**
+  @brief creates a condition task
+
+  @tparam C callable type constructible from std::function<int()>
+
+  @param callable callable to construct a condition task
+
+  @return a tf::Task handle
+
+  The following example creates an if-else block using one condition task
+  and three static tasks.
+
+  @code{.cpp}
+  tf::Taskflow taskflow;
+
+  auto [init, cond, yes, no] = taskflow.emplace(
+    [] () { },
+    [] () { return 0; },
+    [] () { std::cout << "yes\n"; },
+    [] () { std::cout << "no\n"; }
+  );
+
+  // executes yes if cond returns 0, or no if cond returns 1
+  cond.precede(yes, no);
+  cond.succeed(init);
+  @endcode
+
+  Please refer to @ref ConditionalTasking for details.
+  */
+  template <typename C,
+    std::enable_if_t<is_condition_task_v<C>, void>* = nullptr
+  >
+  Task emplace(C&& callable);
+
+  /**
+  @brief creates a multi-condition task
+
+  @tparam C callable type constructible from
+          std::function<tf::SmallVector<int>()>
+
+  @param callable callable to construct a multi-condition task
+
+  @return a tf::Task handle
+
+  The following example creates a multi-condition task that selectively
+  jumps to two successor tasks.
+
+  @code{.cpp}
+  tf::Taskflow taskflow;
+
+  auto [init, cond, branch1, branch2, branch3] = taskflow.emplace(
+    [] () { },
+    [] () { return tf::SmallVector{0, 2}; },
+    [] () { std::cout << "branch1\n"; },
+    [] () { std::cout << "branch2\n"; },
+    [] () { std::cout << "branch3\n"; }
+  );
+
+  // executes branch1 and branch3 when cond returns 0 and 2
+  cond.precede(branch1, branch2, branch3);
+  cond.succeed(init);
+  @endcode
+
+  Please refer to @ref ConditionalTasking for details.
+  */
+  template <typename C,
+    std::enable_if_t<is_multi_condition_task_v<C>, void>* = nullptr
+  >
+  Task emplace(C&& callable);
+
+  /**
+  @brief creates multiple tasks from a list of callable objects
+
+  @tparam C callable types
+
+  @param callables one or multiple callable objects constructible from each task category
+
+  @return a tf::Task handle
+
+  The method returns a tuple of tasks each corresponding to the given
+  callable target. You can use structured binding to get the return tasks
+  one by one.
+  The following example creates four static tasks and assigns them to
+  @c A, @c B, @c C, and @c D using structured binding.
+
+  @code{.cpp}
+  auto [A, B, C, D] = taskflow.emplace(
+    [] () { std::cout << "A"; },
+    [] () { std::cout << "B"; },
+    [] () { std::cout << "C"; },
+    [] () { std::cout << "D"; }
+  );
+  @endcode
+  */
+  template <typename... C, std::enable_if_t<(sizeof...(C)>1), void>* = nullptr>
+  auto emplace(C&&... callables);
+
+  /**
+  @brief removes a task from a taskflow
+
+  @param task task to remove
+
+  Removes a task and its input and output dependencies from the graph
+  associated with the flow builder.
+  If the task does not belong to the graph, nothing will happen.
+
+  @code{.cpp}
+  tf::Task A = taskflow.emplace([](){ std::cout << "A"; });
+  tf::Task B = taskflow.emplace([](){ std::cout << "B"; });
+  tf::Task C = taskflow.emplace([](){ std::cout << "C"; });
+  tf::Task D = taskflow.emplace([](){ std::cout << "D"; });
+  A.precede(B, C, D);
+
+  // erase A from the taskflow and its dependencies to B, C, and D
+  taskflow.erase(A);
+  @endcode
+  */
+  void erase(Task task);
+
+  /**
+  @brief creates a module task for the target object
+
+  @tparam T target object type
+  @param object a custom object that defines the method @c T::graph()
+
+  @return a tf::Task handle
+
+  The example below demonstrates a taskflow composition using
+  the @c composed_of method.
+
+  @code{.cpp}
+  tf::Taskflow t1, t2;
+  t1.emplace([](){ std::cout << "t1"; });
+
+  // t2 is partially composed of t1
+  tf::Task comp = t2.composed_of(t1);
+  tf::Task init = t2.emplace([](){ std::cout << "t2"; });
+  init.precede(comp);
+  @endcode
+
+  The taskflow object @c t2 is composed of another taskflow object @c t1,
+  preceded by another static task @c init.
+  When taskflow @c t2 is submitted to an executor,
+  @c init will run first and then @c comp, which spawns its definition
+  in taskflow @c t1.
+
+  The target @c object being composed must define the method
+  <tt>T::graph()</tt> that returns a reference to a graph object of
+  type tf::Graph such that it can interact with the executor.
+  For example:
+
+  @code{.cpp}
+  // custom struct
+  struct MyObj {
+    tf::Graph my_graph;
+    MyObj() {
+      tf::FlowBuilder builder(my_graph);
+      tf::Task task = builder.emplace([](){
+        std::cout << "a task\n";  // static task
+      });
+    }
+    tf::Graph& graph() { return my_graph; }
+  };
+
+  MyObj obj;
+  tf::Task comp = taskflow.composed_of(obj);
+  @endcode
+
+  Please refer to @ref ComposableTasking for details.
+  */
+  template <typename T>
+  Task composed_of(T& object);
+
+  /**
+  @brief creates a placeholder task
+
+  @return a tf::Task handle
+
+  A placeholder task maps to a node in the taskflow graph, but
+  it does not have any callable work assigned yet.
+  A placeholder task is different from an empty task handle that
+  does not point to any node in a graph.
+
+  @code{.cpp}
+  // create a placeholder task with no callable target assigned
+  tf::Task placeholder = taskflow.placeholder();
+  assert(placeholder.empty() == false && placeholder.has_work() == false);
+
+  // create an empty task handle
+  tf::Task task;
+  assert(task.empty() == true);
+
+  // assign the task handle to the placeholder task
+  task = placeholder;
+  assert(task.empty() == false && task.has_work() == false);
+  @endcode
+  */
+  Task placeholder();
+
+  /**
+  @brief adds adjacent dependency links to a linear list of tasks
+
+  @param tasks a vector of tasks
+
+  This member function creates linear dependencies over a vector of tasks.
+
+  @code{.cpp}
+  tf::Task A = taskflow.emplace([](){ std::cout << "A"; });
+  tf::Task B = taskflow.emplace([](){ std::cout << "B"; });
+  tf::Task C = taskflow.emplace([](){ std::cout << "C"; });
+  tf::Task D = taskflow.emplace([](){ std::cout << "D"; });
+  std::vector<tf::Task> tasks {A, B, C, D};
+  taskflow.linearize(tasks);  // A->B->C->D
+  @endcode
+
+  */
+  void linearize(std::vector<Task>& tasks);
+
+  /**
+  @brief adds adjacent dependency links to a linear list of tasks
+
+  @param tasks an initializer list of tasks
+
+  This member function creates linear dependencies over a list of tasks.
+
+  @code{.cpp}
+  tf::Task A = taskflow.emplace([](){ std::cout << "A"; });
+  tf::Task B = taskflow.emplace([](){ std::cout << "B"; });
+  tf::Task C = taskflow.emplace([](){ std::cout << "C"; });
+  tf::Task D = taskflow.emplace([](){ std::cout << "D"; });
+  taskflow.linearize({A, B, C, D});  // A->B->C->D
+  @endcode
+  */
+  void linearize(std::initializer_list<Task> tasks);
+
+  // ------------------------------------------------------------------------
+  // parallel iterations
+  // ------------------------------------------------------------------------
+
+  /**
+  @brief constructs an STL-styled parallel-for task
+
+  @tparam B beginning iterator type
+  @tparam E ending iterator type
+  @tparam C callable type
+  @tparam P partitioner type (default tf::GuidedPartitioner)
+
+  @param first iterator to the beginning (inclusive)
+  @param last iterator to the end (exclusive)
+  @param callable callable object to apply to the dereferenced iterator
+  @param part partitioning algorithm to schedule parallel iterations
+
+  @return a tf::Task handle
+
+  The task spawns asynchronous tasks that apply the callable object to each object
+  obtained by dereferencing every iterator in the range <tt>[first, last)</tt>.
+  This method is equivalent to the parallel execution of the following loop:
+
+  @code{.cpp}
+  for(auto itr=first; itr!=last; itr++) {
+    callable(*itr);
+  }
+  @endcode
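+
+  For example, the following minimal sketch (the taskflow, executor, and vector
+  @c data are illustrative assumptions) doubles every element in parallel:
+
+  @code{.cpp}
+  std::vector<int> data(1000, 1);
+  taskflow.for_each(data.begin(), data.end(), [](int& i){ i *= 2; });
+  executor.run(taskflow).wait();
+  @endcode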
+
+  Iterators are templated to enable stateful ranges using std::reference_wrapper.
+  The callable needs to take a single argument of
+  the dereferenced iterator type.
+
+  Please refer to @ref ParallelIterations for details.
+  */
+  template <typename B, typename E, typename C, typename P = GuidedPartitioner>
+  Task for_each(B first, E last, C callable, P&& part = P());
+  
+  /**
+  @brief constructs an STL-styled index-based parallel-for task 
+
+  @tparam B beginning index type (must be integral)
+  @tparam E ending index type (must be integral)
+  @tparam S step type (must be integral)
+  @tparam C callable type
+  @tparam P partitioner type (default tf::GuidedPartitioner)
+
+  @param first index of the beginning (inclusive)
+  @param last index of the end (exclusive)
+  @param step step size
+  @param callable callable object to apply to each valid index
+  @param part partitioning algorithm to schedule parallel iterations
+
+  @return a tf::Task handle
+
+  The task spawns asynchronous tasks that apply the callable object to each index
+  in the range <tt>[first, last)</tt> with the step size.
+  This method is equivalent to the parallel execution of the following loop:
+
+  @code{.cpp}
+  // case 1: step size is positive
+  for(auto i=first; i<last; i+=step) {
+    callable(i);
+  }
+
+  // case 2: step size is negative
+  for(auto i=first; i>last; i+=step) {
+    callable(i);
+  }
+  @endcode
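+
+  For example, the following minimal sketch (the taskflow, executor, and
+  pre-sized vector @c data are illustrative assumptions) writes every even
+  index in parallel:
+
+  @code{.cpp}
+  std::vector<int> data(100);
+  taskflow.for_each_index(0, 100, 2, [&](int i){ data[i] = i; });
+  executor.run(taskflow).wait();
+  @endcode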
+
+  Iterators are templated to enable stateful ranges using std::reference_wrapper.
+  The callable needs to take a single argument of the integral index type.
+
+  Please refer to @ref ParallelIterations for details.
+  */
+  template <typename B, typename E, typename S, typename C, typename P = GuidedPartitioner>
+  Task for_each_index(
+    B first, E last, S step, C callable, P&& part = P()
+  );
+
+  // ------------------------------------------------------------------------
+  // transform
+  // ------------------------------------------------------------------------
+
+  /**
+  @brief constructs a parallel-transform task
+
+  @tparam B beginning input iterator type
+  @tparam E ending input iterator type
+  @tparam O output iterator type
+  @tparam C callable type
+  @tparam P partitioner type (default tf::GuidedPartitioner)
+
+  @param first1 iterator to the beginning of the first range
+  @param last1 iterator to the end of the first range
+  @param d_first iterator to the beginning of the output range
+  @param c a unary callable to apply to dereferenced input elements
+  @param part partitioning algorithm to schedule parallel iterations
+
+  @return a tf::Task handle
+
+  The task spawns asynchronous tasks that apply the callable object to an
+  input range and store the result in another output range.
+  This method is equivalent to the parallel execution of the following loop:
+
+  @code{.cpp}
+  while (first1 != last1) {
+    *d_first++ = c(*first1++);
+  }
+  @endcode
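+
+  For example, the following minimal sketch (the taskflow, executor, and the
+  equally sized vectors @c src and @c dst are illustrative assumptions)
+  negates each element:
+
+  @code{.cpp}
+  std::vector<int> src(100, 1), dst(100);
+  taskflow.transform(
+    src.begin(), src.end(), dst.begin(), [](int v){ return -v; }
+  );
+  executor.run(taskflow).wait();
+  @endcode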
+
+  Iterators are templated to enable stateful ranges using std::reference_wrapper.
+  The callable needs to take a single argument of the dereferenced
+  iterator type.
+  
+  Please refer to @ref ParallelTransforms for details.
+  */
+  template <
+    typename B, typename E, typename O, typename C, typename P = GuidedPartitioner,
+    std::enable_if_t<is_partitioner_v<std::decay_t<P>>, void>* = nullptr
+  >
+  Task transform(B first1, E last1, O d_first, C c, P&& part = P());
+  
+  /**
+  @brief constructs a parallel-transform task
+
+  @tparam B1 beginning input iterator type for the first input range
+  @tparam E1 ending input iterator type for the first input range
+  @tparam B2 beginning input iterator type for the second input range
+  @tparam O output iterator type
+  @tparam C callable type
+  @tparam P partitioner type (default tf::GuidedPartitioner)
+
+  @param first1 iterator to the beginning of the first input range
+  @param last1 iterator to the end of the first input range
+  @param first2 iterator to the beginning of the second input range
+  @param d_first iterator to the beginning of the output range
+  @param c a binary operator to apply to dereferenced input elements
+  @param part partitioning algorithm to schedule parallel iterations
+
+  @return a tf::Task handle
+
+  The task spawns asynchronous tasks that apply the callable object to two
+  input ranges and stores the result in another output range.
+  This method is equivalent to the parallel execution of the following loop:
+
+  @code{.cpp}
+  while (first1 != last1) {
+    *d_first++ = c(*first1++, *first2++);
+  }
+  @endcode
+
+  Iterators are templated to enable stateful range using std::reference_wrapper.
+  The callable needs to take two arguments of dereferenced elements
+  from the two input ranges.
+  
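+  For example, the sketch below (assuming @c taskflow and @c executor objects,
+  as in the other examples) adds two ranges element-wise into an output vector:
+
+  @code{.cpp}
+  std::vector<int> a = {1, 2, 3};
+  std::vector<int> b = {10, 20, 30};
+  std::vector<int> out(a.size());
+  taskflow.transform(
+    a.begin(), a.end(), b.begin(), out.begin(), std::plus<int>{}
+  );
+  executor.run(taskflow).wait();
+  // out is {11, 22, 33}
+  @endcode
+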
+  Please refer to @ref ParallelTransforms for details.
+  */
+  template <
+    typename B1, typename E1, typename B2, typename O, typename C, typename P=GuidedPartitioner,
+    std::enable_if_t<!is_partitioner_v<std::decay_t<C>>, void>* = nullptr
+  >
+  Task transform(B1 first1, E1 last1, B2 first2, O d_first, C c, P&& part = P());
+  
+  // ------------------------------------------------------------------------
+  // reduction
+  // ------------------------------------------------------------------------
+
+  /**
+  @brief constructs an STL-styled parallel-reduce task
+
+  @tparam B beginning iterator type
+  @tparam E ending iterator type
+  @tparam T result type
+  @tparam O binary reducer type
+  @tparam P partitioner type (default tf::GuidedPartitioner)
+
+  @param first iterator to the beginning (inclusive)
+  @param last iterator to the end (exclusive)
+  @param init initial value of the reduction and the storage for the reduced result
+  @param bop binary operator that will be applied
+  @param part partitioning algorithm to schedule parallel iterations
+
+  @return a tf::Task handle
+
+  The task spawns asynchronous tasks to perform parallel reduction over @c init
+  and the elements in the range <tt>[first, last)</tt>.
+  The reduced result is stored in @c init.
+  This method is equivalent to the parallel execution of the following loop:
+
+  @code{.cpp}
+  for(auto itr=first; itr!=last; itr++) {
+    init = bop(init, *itr);
+  }
+  @endcode
+
+  Iterators are templated to enable stateful range using std::reference_wrapper.
+
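+  For example, the sketch below (assuming @c taskflow and @c executor objects,
+  as in the other examples) sums a range of integers into @c sum:
+
+  @code{.cpp}
+  std::vector<int> input = {1, 2, 3, 4, 5};
+  int sum = 0;
+  taskflow.reduce(input.begin(), input.end(), sum, std::plus<int>{});
+  executor.run(taskflow).wait();
+  assert(sum == 15);
+  @endcode
+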
+  Please refer to @ref ParallelReduction for details.
+  */
+  template <typename B, typename E, typename T, typename O, typename P = GuidedPartitioner>
+  Task reduce(B first, E last, T& init, O bop, P&& part = P());
+  
+  // ------------------------------------------------------------------------
+  // transform and reduction
+  // ------------------------------------------------------------------------
+
+  /**
+  @brief constructs an STL-styled parallel transform-reduce task
+
+  @tparam B beginning iterator type
+  @tparam E ending iterator type
+  @tparam T result type
+  @tparam BOP binary reducer type
+  @tparam UOP unary transformation type
+  @tparam P partitioner type (default tf::GuidedPartitioner)
+
+  @param first iterator to the beginning (inclusive)
+  @param last iterator to the end (exclusive)
+  @param init initial value of the reduction and the storage for the reduced result
+  @param bop binary operator that will be applied in unspecified order to the results of @c uop
+  @param uop unary operator that will be applied to transform each element in the range to the result type
+  @param part partitioning algorithm to schedule parallel iterations
+
+  @return a tf::Task handle
+
+  The task spawns asynchronous tasks to perform parallel reduction over @c init and
+  the transformed elements in the range <tt>[first, last)</tt>.
+  The reduced result is stored in @c init.
+  This method is equivalent to the parallel execution of the following loop:
+
+  @code{.cpp}
+  for(auto itr=first; itr!=last; itr++) {
+    init = bop(init, uop(*itr));
+  }
+  @endcode
+
+  Iterators are templated to enable stateful range using std::reference_wrapper.
+
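+  For example, the sketch below (assuming @c taskflow and @c executor objects,
+  as in the other examples) sums the squares of a range of integers:
+
+  @code{.cpp}
+  std::vector<int> input = {1, 2, 3, 4, 5};
+  int sum = 0;
+  taskflow.transform_reduce(
+    input.begin(), input.end(), sum,
+    std::plus<int>{}, [](int i){ return i*i; }
+  );
+  executor.run(taskflow).wait();
+  assert(sum == 55);  // 1 + 4 + 9 + 16 + 25
+  @endcode
+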
+  Please refer to @ref ParallelReduction for details.
+  */
+  template <
+    typename B, typename E, typename T, typename BOP, typename UOP, typename P = GuidedPartitioner,
+    std::enable_if_t<is_partitioner_v<std::decay_t<P>>, void>* = nullptr
+  >
+  Task transform_reduce(B first, E last, T& init, BOP bop, UOP uop, P&& part = P());
+
+  /**
+  @brief constructs an STL-styled parallel transform-reduce task
+  @tparam B1 first beginning iterator type
+  @tparam E1 first ending iterator type
+  @tparam B2 second beginning iterator type
+  @tparam T result type
+  @tparam BOP_R binary reducer type
+  @tparam BOP_T binary transformation type
+  @tparam P partitioner type (default tf::GuidedPartitioner)
+ 
+  @param first1 iterator to the beginning of the first range (inclusive)
+  @param last1 iterator to the end of the first range (exclusive)
+  @param first2 iterator to the beginning of the second range
+  @param init initial value of the reduction and the storage for the reduced result
+  @param bop_r binary operator that will be applied in unspecified order to the results of @c bop_t
+  @param bop_t binary operator that will be applied to transform each element in the range to the result type
+  @param part partitioning algorithm to schedule parallel iterations
+ 
+  @return a tf::Task handle
+ 
+  The task spawns asynchronous tasks to perform parallel reduction over @c init and
+  the transformed pairs of elements from the range <tt>[first1, last1)</tt> and the
+  range starting at @c first2.
+  The reduced result is stored in @c init.
+  This method is equivalent to the parallel execution of the following loop:
+ 
+  @code{.cpp}
+  for(auto itr1=first1, itr2=first2; itr1!=last1; itr1++, itr2++) {
+    init = bop_r(init, bop_t(*itr1, *itr2));
+  }
+  @endcode
+ 
+  Iterators are templated to enable stateful range using std::reference_wrapper.
+
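+  For example, the sketch below (assuming @c taskflow and @c executor objects,
+  as in the other examples) computes the dot product of two integer ranges:
+
+  @code{.cpp}
+  std::vector<int> a = {1, 2, 3};
+  std::vector<int> b = {4, 5, 6};
+  int dot = 0;
+  taskflow.transform_reduce(
+    a.begin(), a.end(), b.begin(), dot,
+    std::plus<int>{}, std::multiplies<int>{}
+  );
+  executor.run(taskflow).wait();
+  assert(dot == 32);  // 1*4 + 2*5 + 3*6
+  @endcode
+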
+  Please refer to @ref ParallelReduction for details.
+  */
+  
+  template <
+    typename B1, typename E1, typename B2, typename T, typename BOP_R, typename BOP_T, 
+    typename P = GuidedPartitioner,
+    std::enable_if_t<!is_partitioner_v<std::decay_t<BOP_T>>, void>* = nullptr
+  >
+  Task transform_reduce(
+    B1 first1, E1 last1, B2 first2, T& init, BOP_R bop_r, BOP_T bop_t, P&& part = P()
+  );
+
+  // ------------------------------------------------------------------------
+  // scan
+  // ------------------------------------------------------------------------
+  
+  /**
+  @brief creates an STL-styled parallel inclusive-scan task
+
+  @tparam B beginning iterator type
+  @tparam E ending iterator type
+  @tparam D destination iterator type
+  @tparam BOP summation operator type
+
+  @param first start of input range
+  @param last end of input range
+  @param d_first start of output range (may be the same as input range)
+  @param bop function to perform summation
+
+  Performs the cumulative sum (aka prefix sum, aka scan) of the input range
+  and writes the result to the output range. 
+  Each element of the output range contains the
+  running total of all earlier elements using the given binary operator
+  for summation.
+  
+  This function generates an @em inclusive scan, meaning that the N-th element
+  of the output range is the sum of the first N input elements,
+  so the N-th input element is included.
+
+  @code{.cpp}
+  std::vector<int> input = {1, 2, 3, 4, 5};
+  taskflow.inclusive_scan(
+    input.begin(), input.end(), input.begin(), std::plus<int>{}
+  );
+  executor.run(taskflow).wait();
+  
+  // input is {1, 3, 6, 10, 15}
+  @endcode
+  
+  Iterators are templated to enable stateful range using std::reference_wrapper.
+  
+  Please refer to @ref ParallelScan for details.
+  */
+  template <typename B, typename E, typename D, typename BOP>
+  Task inclusive_scan(B first, E last, D d_first, BOP bop);
+  
+  /**
+  @brief creates an STL-styled parallel inclusive-scan task with an initial value
+
+  @tparam B beginning iterator type
+  @tparam E ending iterator type
+  @tparam D destination iterator type
+  @tparam BOP summation operator type
+  @tparam T initial value type
+
+  @param first start of input range
+  @param last end of input range
+  @param d_first start of output range (may be the same as input range)
+  @param bop function to perform summation
+  @param init initial value
+
+  Performs the cumulative sum (aka prefix sum, aka scan) of the input range
+  and writes the result to the output range. 
+  Each element of the output range contains the
+  running total of all earlier elements (and the initial value)
+  using the given binary operator for summation.
+  
+  This function generates an @em inclusive scan, meaning the N-th element
+  of the output range is the sum of the first N input elements,
+  so the N-th input element is included.
+
+  @code{.cpp}
+  std::vector<int> input = {1, 2, 3, 4, 5};
+  taskflow.inclusive_scan(
+    input.begin(), input.end(), input.begin(), std::plus<int>{}, -1
+  );
+  executor.run(taskflow).wait();
+  
+  // input is {0, 2, 5, 9, 14}
+  @endcode
+  
+  Iterators are templated to enable stateful range using std::reference_wrapper.
+ 
+  Please refer to @ref ParallelScan for details.
+
+  */
+  template <typename B, typename E, typename D, typename BOP, typename T>
+  Task inclusive_scan(B first, E last, D d_first, BOP bop, T init);
+  
+  /**
+  @brief creates an STL-styled parallel exclusive-scan task
+
+  @tparam B beginning iterator type
+  @tparam E ending iterator type
+  @tparam D destination iterator type
+  @tparam T initial value type
+  @tparam BOP summation operator type
+
+  @param first start of input range
+  @param last end of input range
+  @param d_first start of output range (may be the same as input range)
+  @param init initial value
+  @param bop function to perform summation
+
+  Performs the cumulative sum (aka prefix sum, aka scan) of the input range
+  and writes the result to the output range. 
+  Each element of the output range contains the
+  running total of all earlier elements (and the initial value)
+  using the given binary operator for summation.
+  
+  This function generates an @em exclusive scan, meaning the N-th element
+  of the output range is the sum of the first N-1 input elements,
+  so the N-th input element is not included.
+
+  @code{.cpp}
+  std::vector<int> input = {1, 2, 3, 4, 5};
+  taskflow.exclusive_scan(
+    input.begin(), input.end(), input.begin(), -1, std::plus<int>{}
+  );
+  executor.run(taskflow).wait();
+  
+  // input is {-1, 0, 2, 5, 9}
+  @endcode
+  
+  Iterators are templated to enable stateful range using std::reference_wrapper.
+  
+  Please refer to @ref ParallelScan for details.
+  */
+  template <typename B, typename E, typename D, typename T, typename BOP>
+  Task exclusive_scan(B first, E last, D d_first, T init, BOP bop);
+  
+  // ------------------------------------------------------------------------
+  // transform scan
+  // ------------------------------------------------------------------------
+  
+  /**
+  @brief creates an STL-styled parallel transform-inclusive scan task
+
+  @tparam B beginning iterator type
+  @tparam E ending iterator type
+  @tparam D destination iterator type
+  @tparam BOP summation operator type
+  @tparam UOP transform operator type
+
+  @param first start of input range
+  @param last end of input range
+  @param d_first start of output range (may be the same as input range)
+  @param bop function to perform summation
+  @param uop function to transform elements of the input range
+
+  Write the cumulative sum (aka prefix sum, aka scan) of the input range
+  to the output range. Each element of the output range contains the
+  running total of all earlier elements
+  using @c uop to transform the input elements
+  and using @c bop for summation.
+  
+  This function generates an @em inclusive scan, meaning the Nth element
+  of the output range is the sum of the first N input elements,
+  so the Nth input element is included.
+
+  @code{.cpp}
+  std::vector<int> input = {1, 2, 3, 4, 5};
+  taskflow.transform_inclusive_scan(
+    input.begin(), input.end(), input.begin(), std::plus<int>{}, 
+    [] (int item) { return -item; }
+  );
+  executor.run(taskflow).wait();
+  
+  // input is {-1, -3, -6, -10, -15}
+  @endcode
+  
+  Iterators are templated to enable stateful range using std::reference_wrapper.
+  
+  Please refer to @ref ParallelScan for details.
+  */
+  template <typename B, typename E, typename D, typename BOP, typename UOP>
+  Task transform_inclusive_scan(B first, E last, D d_first, BOP bop, UOP uop);
+  
+  /**
+  @brief creates an STL-styled parallel transform-inclusive scan task
+
+  @tparam B beginning iterator type
+  @tparam E ending iterator type
+  @tparam D destination iterator type
+  @tparam BOP summation operator type
+  @tparam UOP transform operator type
+  @tparam T initial value type
+
+  @param first start of input range
+  @param last end of input range
+  @param d_first start of output range (may be the same as input range)
+  @param bop function to perform summation
+  @param uop function to transform elements of the input range
+  @param init initial value
+
+  Write the cumulative sum (aka prefix sum, aka scan) of the input range
+  to the output range. Each element of the output range contains the
+  running total of all earlier elements (including an initial value)
+  using @c uop to transform the input elements
+  and using @c bop for summation.
+  
+  This function generates an @em inclusive scan, meaning the Nth element
+  of the output range is the sum of the first N input elements,
+  so the Nth input element is included.
+
+  @code{.cpp}
+  std::vector<int> input = {1, 2, 3, 4, 5};
+  taskflow.transform_inclusive_scan(
+    input.begin(), input.end(), input.begin(), std::plus<int>{}, 
+    [] (int item) { return -item; },
+    -1
+  );
+  executor.run(taskflow).wait();
+  
+  // input is {-2, -4, -7, -11, -16}
+  @endcode
+  
+  Iterators are templated to enable stateful range using std::reference_wrapper.
+  
+  Please refer to @ref ParallelScan for details.
+  */
+  template <typename B, typename E, typename D, typename BOP, typename UOP, typename T>
+  Task transform_inclusive_scan(B first, E last, D d_first, BOP bop, UOP uop, T init);
+  
+  /**
+  @brief creates an STL-styled parallel transform-exclusive scan task
+
+  @tparam B beginning iterator type
+  @tparam E ending iterator type
+  @tparam D destination iterator type
+  @tparam BOP summation operator type
+  @tparam UOP transform operator type
+  @tparam T initial value type
+
+  @param first start of input range
+  @param last end of input range
+  @param d_first start of output range (may be the same as input range)
+  @param bop function to perform summation
+  @param uop function to transform elements of the input range
+  @param init initial value
+
+  Write the cumulative sum (aka prefix sum, aka scan) of the input range
+  to the output range. Each element of the output range contains the
+  running total of all earlier elements (including an initial value)
+  using @c uop to transform the input elements
+  and using @c bop for summation.
+  
+  This function generates an @em exclusive scan, meaning the Nth element
+  of the output range is the sum of the first N-1 input elements,
+  so the Nth input element is not included.
+
+  @code{.cpp}
+  std::vector<int> input = {1, 2, 3, 4, 5};
+  taskflow.transform_exclusive_scan(
+    input.begin(), input.end(), input.begin(), -1, std::plus<int>{},
+    [](int item) { return -item; }
+  );
+  executor.run(taskflow).wait();
+  
+  // input is {-1, -2, -4, -7, -11}
+  @endcode
+  
+  Iterators are templated to enable stateful range using std::reference_wrapper.
+  
+  Please refer to @ref ParallelScan for details.
+  */
+  template <typename B, typename E, typename D, typename T, typename BOP, typename UOP>
+  Task transform_exclusive_scan(B first, E last, D d_first, T init, BOP bop, UOP uop);
+
+  // ------------------------------------------------------------------------
+  // find
+  // ------------------------------------------------------------------------
+ 
+  /**
+  @brief constructs a task to perform STL-styled find-if algorithm
+
+  @tparam B beginning iterator type
+  @tparam E ending iterator type
+  @tparam T resulting iterator type
+  @tparam UOP unary predicate type
+  @tparam P partitioner type
+  
+  @param first start of the input range
+  @param last end of the input range
+  @param result resulting iterator to the found element in the input range
+  @param predicate unary predicate which returns @c true for the required element
+  @param part partitioning algorithm (default tf::GuidedPartitioner)
+
+  Returns an iterator to the first element in the range <tt>[first, last)</tt> 
+  that satisfies the given criteria (or last if there is no such iterator).
+  This method is equivalent to the parallel execution of the following loop:
+
+  @code{.cpp}
+  auto find_if(InputIt first, InputIt last, UnaryPredicate predicate) {
+    for (; first != last; ++first) {
+      if (predicate(*first)){
+        return first;
+      }
+    }
+    return last;
+  }
+  @endcode
+
+  For example, the code below finds the element that satisfies the given 
+  criteria (value plus one is equal to 23) from an input range of 10 elements:
+
+  @code{.cpp}
+  std::vector<int> input = {1, 6, 9, 10, 22, 5, 7, 8, 9, 11};
+  std::vector<int>::iterator result;
+  taskflow.find_if(
+    input.begin(), input.end(), result, [](int i){ return i+1 == 23; }
+  );
+  executor.run(taskflow).wait();
+  assert(*result == 22);
+  @endcode
+  
+  Iterators are templated to enable stateful range using std::reference_wrapper.
+  */
+  template <typename B, typename E, typename T, typename UOP, typename P = GuidedPartitioner>
+  Task find_if(B first, E last, T& result, UOP predicate, P&& part = P());
+  
+  /**
+  @brief constructs a task to perform STL-styled find-if-not algorithm
+
+  @tparam B beginning iterator type
+  @tparam E ending iterator type
+  @tparam T resulting iterator type
+  @tparam UOP unary predicate type
+  @tparam P partitioner type
+  
+  @param first start of the input range
+  @param last end of the input range
+  @param result resulting iterator to the found element in the input range
+  @param predicate unary predicate which returns @c false for the required element
+  @param part partitioning algorithm (default tf::GuidedPartitioner)
+
+  Returns an iterator to the first element in the range <tt>[first, last)</tt>
+  for which the given predicate returns @c false (or last if there is no such element).
+  This method is equivalent to the parallel execution of the following loop:
+
+  @code{.cpp}
+  auto find_if_not(InputIt first, InputIt last, UnaryPredicate predicate) {
+    for (; first != last; ++first) {
+      if (!predicate(*first)){
+        return first;
+      }
+    }
+    return last;
+  }
+  @endcode
+
+  For example, the code below finds the element that satisfies the given 
+  criteria (value is not equal to 1) from an input range of 10 elements:
+
+  @code{.cpp}
+  std::vector<int> input = {1, 1, 1, 1, 22, 1, 1, 1, 1, 1};
+  std::vector<int>::iterator result;
+  taskflow.find_if_not(
+    input.begin(), input.end(), result, [](int i){ return i == 1; }
+  );
+  executor.run(taskflow).wait();
+  assert(*result == 22);
+  @endcode
+  
+  Iterators are templated to enable stateful range using std::reference_wrapper.
+  */
+  template <typename B, typename E, typename T, typename UOP,typename P = GuidedPartitioner>
+  Task find_if_not(B first, E last, T& result, UOP predicate, P&& part = P());
+
+  /**
+  @brief constructs a task to perform STL-styled min-element algorithm
+
+  @tparam B beginning iterator type
+  @tparam E ending iterator type
+  @tparam T resulting iterator type
+  @tparam C comparator type
+  @tparam P partitioner type
+  
+  @param first start of the input range
+  @param last end of the input range
+  @param result resulting iterator to the found element in the input range
+  @param comp comparison function object
+  @param part partitioning algorithm (e.g., tf::GuidedPartitioner)
+
+  Finds the smallest element in the range <tt>[first, last)</tt>
+  using the given comparison function object.
+  The iterator to that smallest element is stored in @c result.
+  This method is equivalent to the parallel execution of the following loop:
+
+  @code{.cpp}
+  if (first == last) {
+    return last;
+  }
+  auto smallest = first;
+  ++first;
+  for (; first != last; ++first) {
+    if (comp(*first, *smallest)) {
+      smallest = first;
+    }
+  }
+  return smallest;
+  @endcode
+
+  For example, the code below finds the smallest element from an input
+  range of 10 elements.
+
+  @code{.cpp}
+  std::vector<int> input = {1, 1, 1, 1, 1, -1, 1, 1, 1, 1};
+  std::vector<int>::iterator result;
+  taskflow.min_element(
+    input.begin(), input.end(), result, std::less<int>(), tf::GuidedPartitioner()
+  );
+  executor.run(taskflow).wait();
+  assert(*result == -1);
+  @endcode
+  
+  Iterators are templated to enable stateful range using std::reference_wrapper.
+  */
+  template <typename B, typename E, typename T, typename C, typename P>
+  Task min_element(B first, E last, T& result, C comp, P&& part);
+  
+  /**
+  @brief constructs a task to perform STL-styled max-element algorithm
+
+  @tparam B beginning iterator type
+  @tparam E ending iterator type
+  @tparam T resulting iterator type
+  @tparam C comparator type
+  @tparam P partitioner type
+  
+  @param first start of the input range
+  @param last end of the input range
+  @param result resulting iterator to the found element in the input range
+  @param comp comparison function object
+  @param part partitioning algorithm (e.g., tf::GuidedPartitioner)
+
+  Finds the largest element in the range <tt>[first, last)</tt>
+  using the given comparison function object.
+  The iterator to that largest element is stored in @c result.
+  This method is equivalent to the parallel execution of the following loop:
+
+  @code{.cpp}
+  if (first == last){
+    return last;
+  }
+  auto largest = first;
+  ++first;
+  for (; first != last; ++first) {
+    if (comp(*largest, *first)) {
+      largest = first;
+    }
+  }
+  return largest;
+  @endcode
+
+  For example, the code below finds the largest element from an input
+  range of 10 elements.
+
+  @code{.cpp}
+  std::vector<int> input = {1, 1, 1, 1, 1, 2, 1, 1, 1, 1};
+  std::vector<int>::iterator result;
+  taskflow.max_element(
+    input.begin(), input.end(), result, std::less<int>(), tf::GuidedPartitioner()
+  );
+  executor.run(taskflow).wait();
+  assert(*result == 2);
+  @endcode
+  
+  Iterators are templated to enable stateful range using std::reference_wrapper.
+  */
+  template <typename B, typename E, typename T, typename C, typename P>
+  Task max_element(B first, E last, T& result, C comp, P&& part);
+
+  // ------------------------------------------------------------------------
+  // sort
+  // ------------------------------------------------------------------------
+
+  /**
+  @brief constructs a dynamic task to perform STL-styled parallel sort
+
+  @tparam B beginning iterator type (random-accessible)
+  @tparam E ending iterator type (random-accessible)
+  @tparam C comparator type
+
+  @param first iterator to the beginning (inclusive)
+  @param last iterator to the end (exclusive)
+  @param cmp comparison operator
+
+  The task spawns asynchronous tasks to sort elements in the range
+  <tt>[first, last)</tt> in parallel.
+
+  Iterators are templated to enable stateful range using std::reference_wrapper.
+
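+  For example, the sketch below (assuming @c taskflow and @c executor objects,
+  as in the other examples) sorts a vector in descending order:
+
+  @code{.cpp}
+  std::vector<int> data = {3, 1, 4, 1, 5};
+  taskflow.sort(data.begin(), data.end(), std::greater<int>{});
+  executor.run(taskflow).wait();
+  // data is {5, 4, 3, 1, 1}
+  @endcode
+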
+  Please refer to @ref ParallelSort for details.
+  */
+  template <typename B, typename E, typename C>
+  Task sort(B first, E last, C cmp);
+
+  /**
+  @brief constructs a dynamic task to perform STL-styled parallel sort using
+         the @c std::less<T> comparator, where @c T is the element type
+
+  @tparam B beginning iterator type (random-accessible)
+  @tparam E ending iterator type (random-accessible)
+
+  @param first iterator to the beginning (inclusive)
+  @param last iterator to the end (exclusive)
+
+  The task spawns asynchronous tasks to sort elements in the range
+  <tt>[first, last)</tt> in parallel using the @c std::less<T> comparator,
+  where @c T is the dereferenced iterator type.
+
+  Iterators are templated to enable stateful range using std::reference_wrapper.
+
+  Please refer to @ref ParallelSort for details.
+   */
+  template <typename B, typename E>
+  Task sort(B first, E last);
+
+  protected:
+
+  /**
+  @brief associated graph object
+  */
+  Graph& _graph;
+
+  private:
+
+  template <typename L>
+  void _linearize(L&);
+};
+
+// Constructor
+inline FlowBuilder::FlowBuilder(Graph& graph) :
+  _graph {graph} {
+}
+
+// Function: emplace
+template <typename C, std::enable_if_t<is_static_task_v<C>, void>*>
+Task FlowBuilder::emplace(C&& c) {
+  return Task(_graph._emplace_back("", 0, nullptr, nullptr, 0,
+    std::in_place_type_t<Node::Static>{}, std::forward<C>(c)
+  ));
+}
+
+// Function: emplace
+template <typename C, std::enable_if_t<is_dynamic_task_v<C>, void>*>
+Task FlowBuilder::emplace(C&& c) {
+  return Task(_graph._emplace_back("", 0, nullptr, nullptr, 0,
+    std::in_place_type_t<Node::Dynamic>{}, std::forward<C>(c)
+  ));
+}
+
+// Function: emplace
+template <typename C, std::enable_if_t<is_condition_task_v<C>, void>*>
+Task FlowBuilder::emplace(C&& c) {
+  return Task(_graph._emplace_back("", 0, nullptr, nullptr, 0,
+    std::in_place_type_t<Node::Condition>{}, std::forward<C>(c)
+  ));
+}
+
+// Function: emplace
+template <typename C, std::enable_if_t<is_multi_condition_task_v<C>, void>*>
+Task FlowBuilder::emplace(C&& c) {
+  return Task(_graph._emplace_back("", 0, nullptr, nullptr, 0,
+    std::in_place_type_t<Node::MultiCondition>{}, std::forward<C>(c)
+  ));
+}
+
+// Function: emplace
+template <typename... C, std::enable_if_t<(sizeof...(C)>1), void>*>
+auto FlowBuilder::emplace(C&&... cs) {
+  return std::make_tuple(emplace(std::forward<C>(cs))...);
+}
+
+// Function: erase
+inline void FlowBuilder::erase(Task task) {
+
+  if (!task._node) {
+    return;
+  }
+
+  task.for_each_dependent([&] (Task dependent) {
+    auto& S = dependent._node->_successors;
+    if(auto I = std::find(S.begin(), S.end(), task._node); I != S.end()) {
+      S.erase(I);
+    }
+  });
+
+  task.for_each_successor([&] (Task successor) {
+    auto& D = successor._node->_dependents;
+    if(auto I = std::find(D.begin(), D.end(), task._node); I != D.end()) {
+      D.erase(I);
+    }
+  });
+
+  _graph._erase(task._node);
+}
+
+// Function: composed_of
+template <typename T>
+Task FlowBuilder::composed_of(T& object) {
+  auto node = _graph._emplace_back("", 0, nullptr, nullptr, 0,
+    std::in_place_type_t<Node::Module>{}, object
+  );
+  return Task(node);
+}
+
+// Function: placeholder
+inline Task FlowBuilder::placeholder() {
+  auto node = _graph._emplace_back("", 0, nullptr, nullptr, 0,
+    std::in_place_type_t<Node::Placeholder>{}
+  );
+  return Task(node);
+}
+
+// Procedure: _linearize
+template <typename L>
+void FlowBuilder::_linearize(L& keys) {
+
+  auto itr = keys.begin();
+  auto end = keys.end();
+
+  if(itr == end) {
+    return;
+  }
+
+  auto nxt = itr;
+
+  for(++nxt; nxt != end; ++nxt, ++itr) {
+    itr->_node->_precede(nxt->_node);
+  }
+}
+
+// Procedure: linearize
+inline void FlowBuilder::linearize(std::vector<Task>& keys) {
+  _linearize(keys);
+}
+
+// Procedure: linearize
+inline void FlowBuilder::linearize(std::initializer_list<Task> keys) {
+  _linearize(keys);
+}
+
+// ----------------------------------------------------------------------------
+
+/**
+@class Subflow
+
+@brief class to construct a subflow graph from the execution of a dynamic task
+
+tf::Subflow is a derived class from tf::Runtime with a specialized mechanism
+to manage the execution of a child graph.
+By default, a subflow automatically @em joins its parent node.
+You may explicitly join or detach a subflow by calling tf::Subflow::join
+or tf::Subflow::detach, respectively.
+The following example creates a taskflow graph that spawns a subflow from
+the execution of task @c B, and the subflow contains three tasks, @c B1,
+@c B2, and @c B3, where @c B3 runs after @c B1 and @c B2.
+
+@code{.cpp}
+// create three static tasks
+tf::Task A = taskflow.emplace([](){}).name("A");
+tf::Task C = taskflow.emplace([](){}).name("C");
+tf::Task D = taskflow.emplace([](){}).name("D");
+
+// create a subflow graph (dynamic tasking)
+tf::Task B = taskflow.emplace([] (tf::Subflow& subflow) {
+  tf::Task B1 = subflow.emplace([](){}).name("B1");
+  tf::Task B2 = subflow.emplace([](){}).name("B2");
+  tf::Task B3 = subflow.emplace([](){}).name("B3");
+  B1.precede(B3);
+  B2.precede(B3);
+}).name("B");
+
+A.precede(B);  // B runs after A
+A.precede(C);  // C runs after A
+B.precede(D);  // D runs after B
+C.precede(D);  // D runs after C
+@endcode
+
+*/
+class Subflow : public FlowBuilder,
+                public Runtime {
+
+  friend class Executor;
+  friend class FlowBuilder;
+  friend class Runtime;
+
+  public:
+
+    /**
+    @brief enables the subflow to join its parent task
+
+    Performs an immediate action to join the subflow. Once the subflow is joined,
+    it is considered finished and you may not modify the subflow anymore.
+
+    @code{.cpp}
+    taskflow.emplace([](tf::Subflow& sf){
+      sf.emplace([](){});
+      sf.join();  // join the subflow of one task
+    });
+    @endcode
+
+    Only the worker that spawns this subflow can join it.
+    */
+    void join();
+
+    /**
+    @brief enables the subflow to detach from its parent task
+
+    Performs an immediate action to detach the subflow. Once the subflow is detached,
+    it is considered finished and you may not modify the subflow anymore.
+
+    @code{.cpp}
+    taskflow.emplace([](tf::Subflow& sf){
+      sf.emplace([](){});
+      sf.detach();
+    });
+    @endcode
+
+    Only the worker that spawns this subflow can detach it.
+    */
+    void detach();
+
+    /**
+    @brief resets the subflow to a joinable state
+
+    @param clear_graph specifies whether to clear the associated graph (default @c true)
+
+    Clears the underlying task graph if @c clear_graph is @c true (the default)
+    and then restores the subflow to a joinable state.
+    */
+    void reset(bool clear_graph = true);
+
+    /**
+    @brief queries if the subflow is joinable
+
+    This member function queries if the subflow is joinable.
+    When a subflow is joined or detached, it becomes not joinable.
+
+    @code{.cpp}
+    taskflow.emplace([](tf::Subflow& sf){
+      sf.emplace([](){});
+      std::cout << sf.joinable() << '\n';  // true
+      sf.join();
+      std::cout << sf.joinable() << '\n';  // false
+    });
+    @endcode
+    */
+    bool joinable() const noexcept;
+
+  private:
+
+    bool _joinable {true};
+
+    Subflow(Executor&, Worker&, Node*, Graph&);
+};
+
+// Constructor
+inline Subflow::Subflow(
+  Executor& executor, Worker& worker, Node* parent, Graph& graph
+) :
+  FlowBuilder {graph},
+  Runtime {executor, worker, parent} {
+  // assert(_parent != nullptr);
+}
+
+// Function: joinable
+inline bool Subflow::joinable() const noexcept {
+  return _joinable;
+}
+
+// Procedure: reset
+inline void Subflow::reset(bool clear_graph) {
+  if(clear_graph) {
+    _graph._clear();
+  }
+  _joinable = true;
+}
+
+}  // end of namespace tf. ---------------------------------------------------
+
+
+
+
+
+
+
+
+
+
diff --git a/myxpcs/include/taskflow_/core/graph.hpp b/myxpcs/include/taskflow_/core/graph.hpp
new file mode 100644
index 0000000..f7af3e9
--- /dev/null
+++ b/myxpcs/include/taskflow_/core/graph.hpp
@@ -0,0 +1,1017 @@
+#pragma once
+
+#include "../utility/traits.hpp"
+#include "../utility/iterator.hpp"
+#include "../utility/object_pool.hpp"
+#include "../utility/os.hpp"
+#include "../utility/math.hpp"
+#include "../utility/small_vector.hpp"
+#include "../utility/serializer.hpp"
+#include "error.hpp"
+#include "declarations.hpp"
+#include "semaphore.hpp"
+#include "environment.hpp"
+#include "topology.hpp"
+#include "tsq.hpp"
+
+/**
+@file graph.hpp
+@brief graph include file
+*/
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// Class: Graph
+// ----------------------------------------------------------------------------
+
+/**
+@class Graph
+
+@brief class to create a graph object
+
+A graph is the ultimate storage for a task dependency graph and is the main
+gateway to interact with an executor.
+A graph manages a set of nodes in a global object pool that animates and
+recycles node objects efficiently without going through repetitive and
+expensive memory allocations and deallocations.
+This class is mainly used for creating an opaque graph object in a custom
+class to interact with the executor through taskflow composition.
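+
+For example, a custom class (here a hypothetical @c MyModule) can own a
+tf::Graph, populate it through a tf::FlowBuilder, and expose it via a
+@c graph() method so that a taskflow can compose it with
+tf::FlowBuilder::composed_of:
+
+@code{.cpp}
+struct MyModule {
+  tf::Graph graph_;
+  MyModule() {
+    tf::FlowBuilder builder(graph_);
+    builder.emplace([](){ std::cout << "task inside MyModule\n"; });
+  }
+  tf::Graph& graph() { return graph_; }
+};
+
+tf::Taskflow taskflow;
+MyModule module;
+taskflow.composed_of(module).name("module");
+@endcode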
+
+A graph object is move-only.
+*/
+class Graph {
+
+  friend class Node;
+  friend class FlowBuilder;
+  friend class Subflow;
+  friend class Taskflow;
+  friend class Executor;
+
+  public:
+
+    /**
+    @brief constructs a graph object
+    */
+    Graph() = default;
+
+    /**
+    @brief disabled copy constructor
+    */
+    Graph(const Graph&) = delete;
+
+    /**
+    @brief constructs a graph using move semantics
+    */
+    Graph(Graph&&);
+
+    /**
+    @brief destructs the graph object
+    */
+    ~Graph();
+
+    /**
+    @brief disabled copy assignment operator
+    */
+    Graph& operator = (const Graph&) = delete;
+
+    /**
+    @brief assigns a graph using move semantics
+    */
+    Graph& operator = (Graph&&);
+
+    /**
+    @brief queries if the graph is empty
+    */
+    bool empty() const;
+
+    /**
+    @brief queries the number of nodes in the graph
+    */
+    size_t size() const;
+
+    /**
+    @brief clears the graph
+    */
+    void clear();
+
+  private:
+
+    std::vector<Node*> _nodes;
+
+    void _clear();
+    void _clear_detached();
+    void _merge(Graph&&);
+    void _erase(Node*);
+    
+    /**
+    @private
+    */
+    template <typename ...ArgsT>
+    Node* _emplace_back(ArgsT&&...);
+};
+
+// ----------------------------------------------------------------------------
+
+/**
+@class Runtime
+
+@brief class to include a runtime object in a task
+
+A runtime object allows users to interact with the
+scheduling runtime inside a task, such as scheduling an active task,
+spawning a subflow, and so on.
+
+@code{.cpp}
+tf::Task A, B, C, D;
+std::tie(A, B, C, D) = taskflow.emplace(
+  [] () { return 0; },
+  [&C] (tf::Runtime& rt) {  // C must be captured by reference
+    std::cout << "B\n";
+    rt.schedule(C);
+  },
+  [] () { std::cout << "C\n"; },
+  [] () { std::cout << "D\n"; }
+);
+A.precede(B, C, D);
+executor.run(taskflow).wait();
+@endcode
+
+A runtime object is associated with the worker and the executor
+that runs the task.
+
+*/
+class Runtime {
+
+  friend class Executor;
+  friend class FlowBuilder;
+
+  public:
+  
+  /**
+  @brief destroys the runtime object
+
+  Issues a tf::Runtime::corun_all to finish all spawned asynchronous tasks
+  and then destroys the runtime object.
+  */
+  ~Runtime();
+
+  /**
+  @brief obtains the running executor
+
+  The running executor of a runtime task is the executor that runs
+  the parent taskflow of that runtime task.
+
+  @code{.cpp}
+  tf::Executor executor;
+  tf::Taskflow taskflow;
+  taskflow.emplace([&](tf::Runtime& rt){
+    assert(&(rt.executor()) == &executor);
+  });
+  executor.run(taskflow).wait();
+  @endcode
+  */
+  Executor& executor();
+
+  /**
+  @brief schedules an active task immediately to the worker's queue
+
+  @param task the given active task to schedule immediately
+
+  This member function immediately schedules an active task to the
+  task queue of the associated worker in the runtime task.
+  An active task is a task in a running taskflow.
+  The task may or may not be running, and scheduling that task
+  will immediately put the task into the task queue of the worker
+  that is running the runtime task.
+  Consider the following example:
+
+  @code{.cpp}
+  tf::Task A, B, C, D;
+  std::tie(A, B, C, D) = taskflow.emplace(
+    [] () { return 0; },
+    [&C] (tf::Runtime& rt) {  // C must be captured by reference
+      std::cout << "B\n";
+      rt.schedule(C);
+    },
+    [] () { std::cout << "C\n"; },
+    [] () { std::cout << "D\n"; }
+  );
+  A.precede(B, C, D);
+  executor.run(taskflow).wait();
+  @endcode
+
+  The executor will first run the condition task @c A which returns @c 0
+  to inform the scheduler to go to the runtime task @c B.
+  During the execution of @c B, it directly schedules task @c C without
+  going through the normal taskflow graph scheduling process.
+  At this moment, task @c C is active because its parent taskflow is running.
+  When the taskflow finishes, we will see both @c B and @c C in the output.
+  */
+  void schedule(Task task);
+  
+  /**
+  @brief runs the given callable asynchronously
+
+  @tparam F callable type
+  @param f callable object
+    
+  The method creates an asynchronous task that launches the given callable.
+  The difference from tf::Executor::async is that the created asynchronous task
+  pertains to the runtime object.
+  Applications can explicitly issue tf::Runtime::corun_all
+  to wait for all spawned asynchronous tasks to finish.
+  For example:
+
+  @code{.cpp}
+  std::atomic<int> counter(0);
+  taskflow.emplace([&](tf::Runtime& rt){
+    auto fu1 = rt.async([&](){ counter++; });
+    auto fu2 = rt.async([&](){ counter++; });
+    fu1.get();
+    fu2.get();
+    assert(counter == 2);
+    
+    // spawn 100 asynchronous tasks from the worker of the runtime
+    for(int i=0; i<100; i++) {
+      rt.async([&](){ counter++; });
+    }
+    
+    // wait for the 100 asynchronous tasks to finish
+    rt.corun_all();
+    assert(counter == 102);
+  });
+  @endcode
+
+  This method is thread-safe and can be called by multiple workers
+  that hold the reference to the runtime.
+  For example, the code below spawns 100 tasks from the worker of
+  a runtime, and each of the 100 tasks spawns another task
+  that will be run by another worker.
+  
+  @code{.cpp}
+  std::atomic<int> counter(0);
+  taskflow.emplace([&](tf::Runtime& rt){
+    // worker of the runtime spawns 100 tasks each spawning another task
+    // that will be run by another worker
+    for(int i=0; i<100; i++) {
+      rt.async([&](){ 
+        counter++; 
+        rt.async([&](){ counter++; });
+      });
+    }
+    
+    // wait for the 200 asynchronous tasks to finish
+    rt.corun_all();
+    assert(counter == 200);
+  });
+  @endcode
+  */
+  template <typename F>
+  auto async(F&& f);
+  
+  /**
+  @brief similar to tf::Runtime::async but assigns the task a name
+
+  @tparam F callable type
+
+  @param name assigned name to the task
+  @param f callable
+
+  @code{.cpp}
+  taskflow.emplace([&](tf::Runtime& rt){
+    auto future = rt.async("my task", [](){});
+    future.get();
+  });
+  @endcode
+
+  */
+  template <typename F>
+  auto async(const std::string& name, F&& f);
+
+  /**
+  @brief runs the given function asynchronously without returning any future object
+
+  @tparam F callable type
+  @param f callable
+
+  This member function is more efficient than tf::Runtime::async
+  and should be preferred when the result of the callable is not needed.
+
+  @code{.cpp}
+  std::atomic<int> counter(0);
+  taskflow.emplace([&](tf::Runtime& rt){
+    for(int i=0; i<100; i++) {
+      rt.silent_async([&](){ counter++; });
+    }
+    rt.corun_all();
+    assert(counter == 100);
+  });
+  @endcode
+
+  This member function is thread-safe.
+  */
+  template <typename F>
+  void silent_async(F&& f);
+  
+  /**
+  @brief similar to tf::Runtime::silent_async but assigns the task a name
+
+  @tparam F callable type
+  @param name assigned name to the task
+  @param f callable
+  
+  @code{.cpp}
+  taskflow.emplace([&](tf::Runtime& rt){
+    rt.silent_async("my task", [](){});
+    rt.corun_all();
+  });
+  @endcode
+  */
+  template <typename F>
+  void silent_async(const std::string& name, F&& f);
+  
+  /**
+  @brief similar to tf::Runtime::silent_async but the caller must be the worker of the runtime
+
+  @tparam F callable type
+
+  @param name assigned name to the task
+  @param f callable
+
+  The method bypasses the caller-worker check performed by the executor
+  and thus can only be called by the worker of this runtime.
+
+  @code{.cpp}
+  taskflow.emplace([&](tf::Runtime& rt){
+    // running by the worker of this runtime
+    rt.silent_async_unchecked("my task", [](){});
+    rt.corun_all();
+  });
+  @endcode
+  */
+  template <typename F>
+  void silent_async_unchecked(const std::string& name, F&& f);
+
+  /**
+  @brief co-runs the given target and waits until it completes
+  
+  A target can be one of the following forms:
+    + a dynamic task to spawn a subflow or
+    + a composable graph object with `tf::Graph& T::graph()` defined
+
+  @code{.cpp}
+  // co-run a subflow and wait until all tasks complete
+  taskflow.emplace([](tf::Runtime& rt){
+    rt.corun([](tf::Subflow& sf){
+      tf::Task A = sf.emplace([](){});
+      tf::Task B = sf.emplace([](){});
+    }); 
+  });
+  
+  // co-run a taskflow and wait until all tasks complete
+  tf::Taskflow taskflow1, taskflow2;
+  taskflow1.emplace([](){ std::cout << "running taskflow1\n"; });
+  taskflow2.emplace([&](tf::Runtime& rt){
+    std::cout << "running taskflow2\n";
+    rt.corun(taskflow1);
+  });
+  executor.run(taskflow2).wait();
+  @endcode
+
+  Although tf::Runtime::corun blocks until the operation completes, 
+  the caller thread (worker) is not blocked (e.g., sleeping or holding any lock). 
+  Instead, the caller thread joins the work-stealing loop of the executor 
+  and returns when all tasks in the target complete.
+  
+  @attention
+  Only the worker of this tf::Runtime can issue corun.
+  */
+  template <typename T>
+  void corun(T&& target);
+
+  /**
+  @brief keeps running the work-stealing loop until the predicate becomes true
+  
+  @tparam P predicate type
+  @param predicate a boolean predicate to indicate when to stop the loop
+
+  The method keeps the caller worker running in the work-stealing loop
+  until the stop predicate becomes true.
+  
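+  For example, the sketch below (assuming @c taskflow and @c executor objects,
+  as in the other examples) keeps the calling worker inside the work-stealing
+  loop until a flag set by an asynchronous task becomes @c true:
+
+  @code{.cpp}
+  std::atomic<bool> ready{false};
+  taskflow.emplace([&](tf::Runtime& rt){
+    rt.silent_async([&](){ ready = true; });
+    rt.corun_until([&](){ return ready.load(); });
+  });
+  executor.run(taskflow).wait();
+  @endcode
+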
+  @attention
+  Only the worker of this tf::Runtime can issue corun.
+  */
+  template <typename P>
+  void corun_until(P&& predicate);
+  
+  /**
+  @brief corun all asynchronous tasks spawned by this runtime with other workers
+
+  Coruns all asynchronous tasks (tf::Runtime::async,
+  tf::Runtime::silent_async) with other workers until all those 
+  asynchronous tasks finish.
+    
+  @code{.cpp}
+  std::atomic<size_t> counter{0};
+  taskflow.emplace([&](tf::Runtime& rt){
+    // spawn 100 async tasks and wait
+    for(int i=0; i<100; i++) {
+      rt.silent_async([&](){ counter++; });
+    }
+    rt.corun_all();
+    assert(counter == 100);
+    
+    // spawn another 100 async tasks and wait
+    for(int i=0; i<100; i++) {
+      rt.silent_async([&](){ counter++; });
+    }
+    rt.corun_all();
+    assert(counter == 200);
+  });
+  @endcode
+
+  @attention
+  Only the worker of this tf::Runtime can issue tf::Runtime::corun_all.
+  */
+  inline void corun_all();
+
+  /**
+  @brief acquires a reference to the underlying worker
+  */
+  inline Worker& worker();
+
+  protected:
+  
+  /**
+  @private
+  */
+  explicit Runtime(Executor&, Worker&, Node*);
+  
+  /**
+  @private
+  */
+  Executor& _executor;
+  
+  /**
+  @private
+  */
+  Worker& _worker;
+  
+  /**
+  @private
+  */
+  Node* _parent;
+
+  /**
+  @private
+  */
+  template <typename F>
+  auto _async(Worker& w, const std::string& name, F&& f);
+  
+  /**
+  @private
+  */
+  template <typename F>
+  void _silent_async(Worker& w, const std::string& name, F&& f);
+};
+
+// constructor
+inline Runtime::Runtime(Executor& e, Worker& w, Node* p) :
+  _executor{e},
+  _worker  {w},
+  _parent  {p}{
+}
+
+// Function: executor
+inline Executor& Runtime::executor() {
+  return _executor;
+}
+
+// Function: worker
+inline Worker& Runtime::worker() {
+  return _worker;
+}
+
+// ----------------------------------------------------------------------------
+// Node
+// ----------------------------------------------------------------------------
+
+/**
+@private
+*/
+class Node {
+
+  friend class Graph;
+  friend class Task;
+  friend class AsyncTask;
+  friend class TaskView;
+  friend class Taskflow;
+  friend class Executor;
+  friend class FlowBuilder;
+  friend class Subflow;
+  friend class Runtime;
+
+  enum class AsyncState : int {
+    UNFINISHED = 0,
+    LOCKED = 1,
+    FINISHED = 2
+  };
+
+  TF_ENABLE_POOLABLE_ON_THIS;
+
+  // state bit flag
+  constexpr static int CONDITIONED = 1;
+  constexpr static int DETACHED    = 2;
+  constexpr static int ACQUIRED    = 4;
+  constexpr static int READY       = 8;
+
+  using Placeholder = std::monostate;
+
+  // static work handle
+  struct Static {
+
+    template <typename C>
+    Static(C&&);
+
+    std::variant<
+      std::function<void()>, std::function<void(Runtime&)>
+    > work;
+  };
+
+  // dynamic work handle
+  struct Dynamic {
+
+    template <typename C>
+    Dynamic(C&&);
+
+    std::function<void(Subflow&)> work;
+    Graph subgraph;
+  };
+
+  // condition work handle
+  struct Condition {
+
+    template <typename C>
+    Condition(C&&);
+    
+    std::variant<
+      std::function<int()>, std::function<int(Runtime&)>
+    > work;
+  };
+
+  // multi-condition work handle
+  struct MultiCondition {
+
+    template <typename C>
+    MultiCondition(C&&);
+
+    std::variant<
+      std::function<SmallVector<int>()>, std::function<SmallVector<int>(Runtime&)>
+    > work;
+  };
+
+  // module work handle
+  struct Module {
+
+    template <typename T>
+    Module(T&);
+
+    Graph& graph;
+  };
+
+  // Async work
+  struct Async {
+
+    template <typename T>
+    Async(T&&);
+
+    std::variant<
+      std::function<void()>, std::function<void(Runtime&)>
+    > work;
+  };
+  
+  // silent dependent async
+  struct DependentAsync {
+    
+    template <typename C>
+    DependentAsync(C&&);
+    
+    std::variant<
+      std::function<void()>, std::function<void(Runtime&)>
+    > work;
+   
+    std::atomic<size_t> use_count {1};
+    std::atomic<AsyncState> state {AsyncState::UNFINISHED};
+  };
+
+  using handle_t = std::variant<
+    Placeholder,      // placeholder
+    Static,           // static tasking
+    Dynamic,          // dynamic tasking
+    Condition,        // conditional tasking
+    MultiCondition,   // multi-conditional tasking
+    Module,           // composable tasking
+    Async,            // async tasking
+    DependentAsync    // dependent async tasking
+  >;
+
+  struct Semaphores {
+    SmallVector<Semaphore*> to_acquire;
+    SmallVector<Semaphore*> to_release;
+  };
+
+  public:
+
+  // variant index
+  constexpr static auto PLACEHOLDER     = get_index_v<Placeholder, handle_t>;
+  constexpr static auto STATIC          = get_index_v<Static, handle_t>;
+  constexpr static auto DYNAMIC         = get_index_v<Dynamic, handle_t>;
+  constexpr static auto CONDITION       = get_index_v<Condition, handle_t>;
+  constexpr static auto MULTI_CONDITION = get_index_v<MultiCondition, handle_t>;
+  constexpr static auto MODULE          = get_index_v<Module, handle_t>;
+  constexpr static auto ASYNC           = get_index_v<Async, handle_t>;
+  constexpr static auto DEPENDENT_ASYNC = get_index_v<DependentAsync, handle_t>;
+
+  Node() = default;
+
+  template <typename... Args>
+  Node(const std::string&, unsigned, Topology*, Node*, size_t, Args&&... args);
+
+  ~Node();
+
+  size_t num_successors() const;
+  size_t num_dependents() const;
+  size_t num_strong_dependents() const;
+  size_t num_weak_dependents() const;
+
+  const std::string& name() const;
+
+  private:
+
+  std::string _name;
+  
+  unsigned _priority {0};
+  
+  Topology* _topology {nullptr};
+  Node* _parent {nullptr};
+
+  void* _data {nullptr};
+
+  SmallVector<Node*> _successors;
+  SmallVector<Node*> _dependents;
+
+  std::atomic<int> _state {0};
+  std::atomic<size_t> _join_counter {0};
+
+  std::unique_ptr<Semaphores> _semaphores;
+  
+  handle_t _handle;
+
+  void _precede(Node*);
+  void _set_up_join_counter();
+
+  bool _is_cancelled() const;
+  bool _is_conditioner() const;
+  bool _acquire_all(SmallVector<Node*>&);
+
+  SmallVector<Node*> _release_all();
+};
+
+// ----------------------------------------------------------------------------
+// Node Object Pool
+// ----------------------------------------------------------------------------
+
+/**
+@private
+*/
+inline ObjectPool<Node> node_pool;
+
+// ----------------------------------------------------------------------------
+// Definition for Node::Static
+// ----------------------------------------------------------------------------
+
+// Constructor
+template <typename C>
+Node::Static::Static(C&& c) : work {std::forward<C>(c)} {
+}
+
+// ----------------------------------------------------------------------------
+// Definition for Node::Dynamic
+// ----------------------------------------------------------------------------
+
+// Constructor
+template <typename C>
+Node::Dynamic::Dynamic(C&& c) : work {std::forward<C>(c)} {
+}
+
+// ----------------------------------------------------------------------------
+// Definition for Node::Condition
+// ----------------------------------------------------------------------------
+
+// Constructor
+template <typename C>
+Node::Condition::Condition(C&& c) : work {std::forward<C>(c)} {
+}                                        
+
+// ----------------------------------------------------------------------------
+// Definition for Node::MultiCondition
+// ----------------------------------------------------------------------------
+
+// Constructor
+template <typename C>
+Node::MultiCondition::MultiCondition(C&& c) : work {std::forward<C>(c)} {
+}
+
+// ----------------------------------------------------------------------------
+// Definition for Node::Module
+// ----------------------------------------------------------------------------
+
+// Constructor
+template <typename T>
+inline Node::Module::Module(T& obj) : graph{ obj.graph() } {
+}
+
+// ----------------------------------------------------------------------------
+// Definition for Node::Async
+// ----------------------------------------------------------------------------
+
+// Constructor
+template <typename C>
+Node::Async::Async(C&& c) : work {std::forward<C>(c)} {
+}
+
+// ----------------------------------------------------------------------------
+// Definition for Node::DependentAsync
+// ----------------------------------------------------------------------------
+
+// Constructor
+template <typename C>
+Node::DependentAsync::DependentAsync(C&& c) : work {std::forward<C>(c)} {
+}
+
+// ----------------------------------------------------------------------------
+// Definition for Node
+// ----------------------------------------------------------------------------
+
+// Constructor
+template <typename... Args>
+Node::Node(
+  const std::string& name, 
+  unsigned priority,
+  Topology* topology, 
+  Node* parent, 
+  size_t join_counter,
+  Args&&... args
+) :
+  _name         {name},
+  _priority     {priority},
+  _topology     {topology},
+  _parent       {parent},
+  _join_counter {join_counter},
+  _handle       {std::forward<Args>(args)...} {
+}
+
+// Destructor
+inline Node::~Node() {
+  // this is to avoid stack overflow
+
+  if(_handle.index() == DYNAMIC) {
+    // using std::get_if instead of std::get makes this compatible
+    // with older macOS versions
+    // the result of std::get_if is guaranteed to be non-null
+    // due to the index check above
+    auto& subgraph = std::get_if<Dynamic>(&_handle)->subgraph;
+    std::vector<Node*> nodes;
+    nodes.reserve(subgraph.size());
+
+    std::move(
+      subgraph._nodes.begin(), subgraph._nodes.end(), std::back_inserter(nodes)
+    );
+    subgraph._nodes.clear();
+
+    size_t i = 0;
+
+    while(i < nodes.size()) {
+
+      if(nodes[i]->_handle.index() == DYNAMIC) {
+        auto& sbg = std::get_if<Dynamic>(&(nodes[i]->_handle))->subgraph;
+        std::move(
+          sbg._nodes.begin(), sbg._nodes.end(), std::back_inserter(nodes)
+        );
+        sbg._nodes.clear();
+      }
+
+      ++i;
+    }
+
+    //auto& np = Graph::_node_pool();
+    for(i=0; i<nodes.size(); ++i) {
+      node_pool.recycle(nodes[i]);
+    }
+  }
+}
+
+// Procedure: _precede
+inline void Node::_precede(Node* v) {
+  _successors.push_back(v);
+  v->_dependents.push_back(this);
+}
+
+// Function: num_successors
+inline size_t Node::num_successors() const {
+  return _successors.size();
+}
+
+// Function: dependents
+inline size_t Node::num_dependents() const {
+  return _dependents.size();
+}
+
+// Function: num_weak_dependents
+inline size_t Node::num_weak_dependents() const {
+  size_t n = 0;
+  for(size_t i=0; i<_dependents.size(); i++) {
+    //if(_dependents[i]->_handle.index() == Node::CONDITION) {
+    if(_dependents[i]->_is_conditioner()) {
+      n++;
+    }
+  }
+  return n;
+}
+
+// Function: num_strong_dependents
+inline size_t Node::num_strong_dependents() const {
+  size_t n = 0;
+  for(size_t i=0; i<_dependents.size(); i++) {
+    //if(_dependents[i]->_handle.index() != Node::CONDITION) {
+    if(!_dependents[i]->_is_conditioner()) {
+      n++;
+    }
+  }
+  return n;
+}
+
+// Function: name
+inline const std::string& Node::name() const {
+  return _name;
+}
+
+// Function: _is_conditioner
+inline bool Node::_is_conditioner() const {
+  return _handle.index() == Node::CONDITION ||
+         _handle.index() == Node::MULTI_CONDITION;
+}
+
+// Function: _is_cancelled
+// we currently only support cancellation of taskflow (no async task)
+inline bool Node::_is_cancelled() const {
+  //return _topology && _topology->_is_cancelled.load(std::memory_order_relaxed);
+  return _topology &&
+         (_topology->_state.load(std::memory_order_relaxed) & Topology::CANCELLED);
+}
+
+// Procedure: _set_up_join_counter
+inline void Node::_set_up_join_counter() {
+  size_t c = 0;
+  for(auto p : _dependents) {
+    //if(p->_handle.index() == Node::CONDITION) {
+    if(p->_is_conditioner()) {
+      _state.fetch_or(Node::CONDITIONED, std::memory_order_relaxed);
+    }
+    else {
+      c++;
+    }
+  }
+  _join_counter.store(c, std::memory_order_relaxed);
+}
+
+
+// Function: _acquire_all
+inline bool Node::_acquire_all(SmallVector<Node*>& nodes) {
+
+  auto& to_acquire = _semaphores->to_acquire;
+
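+  // Try to acquire every semaphore in order. On the first failed acquisition,
+  // roll back by releasing the semaphores already acquired (in reverse order)
+  // and collect the nodes woken up by those releases so the caller can
+  // reschedule them.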
+  for(size_t i = 0; i < to_acquire.size(); ++i) {
+    if(!to_acquire[i]->_try_acquire_or_wait(this)) {
+      for(size_t j = 1; j <= i; ++j) {
+        auto r = to_acquire[i-j]->_release();
+        nodes.insert(std::end(nodes), std::begin(r), std::end(r));
+      }
+      return false;
+    }
+  }
+  return true;
+}
+
+// Function: _release_all
+inline SmallVector<Node*> Node::_release_all() {
+
+  auto& to_release = _semaphores->to_release;
+
+  SmallVector<Node*> nodes;
+  for(const auto& sem : to_release) {
+    auto r = sem->_release();
+    nodes.insert(std::end(nodes), std::begin(r), std::end(r));
+  }
+
+  return nodes;
+}
+
+// ----------------------------------------------------------------------------
+// Node Deleter
+// ----------------------------------------------------------------------------
+
+/**
+@private
+*/
+struct NodeDeleter {
+  void operator ()(Node* ptr) {
+    node_pool.recycle(ptr);
+  }
+};
+
+// ----------------------------------------------------------------------------
+// Graph definition
+// ----------------------------------------------------------------------------
+
+// Destructor
+inline Graph::~Graph() {
+  _clear();
+}
+
+// Move constructor
+inline Graph::Graph(Graph&& other) :
+  _nodes {std::move(other._nodes)} {
+}
+
+// Move assignment
+inline Graph& Graph::operator = (Graph&& other) {
+  _clear();
+  _nodes = std::move(other._nodes);
+  return *this;
+}
+
+// Procedure: clear
+inline void Graph::clear() {
+  _clear();
+}
+
+// Procedure: clear
+inline void Graph::_clear() {
+  for(auto node : _nodes) {
+    node_pool.recycle(node);
+  }
+  _nodes.clear();
+}
+
+// Procedure: clear_detached
+inline void Graph::_clear_detached() {
+
+  auto mid = std::partition(_nodes.begin(), _nodes.end(), [] (Node* node) {
+    return !(node->_state.load(std::memory_order_relaxed) & Node::DETACHED);
+  });
+
+  for(auto itr = mid; itr != _nodes.end(); ++itr) {
+    node_pool.recycle(*itr);
+  }
+  _nodes.resize(std::distance(_nodes.begin(), mid));
+}
+
+// Procedure: merge
+inline void Graph::_merge(Graph&& g) {
+  for(auto n : g._nodes) {
+    _nodes.push_back(n);
+  }
+  g._nodes.clear();
+}
+
+// Function: erase
+inline void Graph::_erase(Node* node) {
+  if(auto I = std::find(_nodes.begin(), _nodes.end(), node); I != _nodes.end()) {
+    _nodes.erase(I);
+    node_pool.recycle(node);
+  }
+}
+
+// Function: size
+inline size_t Graph::size() const {
+  return _nodes.size();
+}
+
+// Function: empty
+inline bool Graph::empty() const {
+  return _nodes.empty();
+}
+
+/**
+@private
+*/
+template <typename ...ArgsT>
+Node* Graph::_emplace_back(ArgsT&&... args) {
+  _nodes.push_back(node_pool.animate(std::forward<ArgsT>(args)...));
+  return _nodes.back();
+}
+
+}  // end of namespace tf. ---------------------------------------------------
diff --git a/myxpcs/include/taskflow_/core/notifier.hpp b/myxpcs/include/taskflow_/core/notifier.hpp
new file mode 100644
index 0000000..6bec325
--- /dev/null
+++ b/myxpcs/include/taskflow_/core/notifier.hpp
@@ -0,0 +1,295 @@
+// 2019/02/09 - created by Tsung-Wei Huang
+//  - modified the event count from Eigen
+
+#pragma once
+
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+#include <cstdio>
+#include <atomic>
+#include <memory>
+#include <deque>
+#include <mutex>
+#include <condition_variable>
+#include <thread>
+#include <algorithm>
+#include <numeric>
+#include <cassert>
+
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+namespace tf {
+
+// Notifier allows waiting for arbitrary predicates in non-blocking
+// algorithms. Think of a condition variable, but the wait predicate does not
+// need to be protected by a mutex. Usage:
+// Waiting thread does:
+//
+//   if (predicate)
+//     return act();
+//   Notifier::Waiter& w = waiters[my_index];
+//   ec.prepare_wait(&w);
+//   if (predicate) {
+//     ec.cancel_wait(&w);
+//     return act();
+//   }
+//   ec.commit_wait(&w);
+//
+// Notifying thread does:
+//
+//   predicate = true;
+//   ec.notify(true);
+//
+// notify is cheap if there are no waiting threads. prepare_wait/commit_wait are not
+// cheap, but they are executed only if the preceding predicate check has
+// failed.
+//
+// Algorithm outline:
+// There are two main variables: predicate (managed by the user) and _state.
+// The operation closely resembles Dekker's mutual exclusion algorithm:
+// https://en.wikipedia.org/wiki/Dekker%27s_algorithm
+// The waiting thread sets _state and then checks the predicate; the notifying
+// thread sets the predicate and then checks _state. Due to the seq_cst fences
+// between these operations, it is guaranteed that either the waiter sees the
+// predicate change and does not block, or the notifying thread sees the _state
+// change and unblocks the waiter, or both. But it cannot happen that both
+// threads miss each other's changes, which would lead to deadlock.
+class Notifier {
+
+  friend class Executor;
+
+  public:
+
+  struct Waiter {
+    std::atomic<Waiter*> next;
+    uint64_t epoch;
+    enum : unsigned {
+      kNotSignaled = 0,
+      kWaiting,
+      kSignaled,
+    };
+
+#ifdef __cpp_lib_atomic_wait
+    std::atomic<unsigned> state {0};
+#else
+    std::mutex mu;
+    std::condition_variable cv;
+    unsigned state;
+#endif
+  };
+
+  explicit Notifier(size_t N) : _waiters{N} {
+    assert(_waiters.size() < (1 << kWaiterBits) - 1);
+    // Initialize epoch to something close to overflow to test overflow.
+    _state = kStackMask | (kEpochMask - kEpochInc * _waiters.size() * 2);
+  }
+
+  ~Notifier() {
+    // Ensure there are no waiters.
+    assert((_state.load() & (kStackMask | kWaiterMask)) == kStackMask);
+  }
+
+  // prepare_wait prepares for waiting.
+  // After calling this function the thread must re-check the wait predicate
+  // and call either cancel_wait or commit_wait passing the same Waiter object.
+  void prepare_wait(Waiter* w) {
+    w->epoch = _state.fetch_add(kWaiterInc, std::memory_order_relaxed);
+    std::atomic_thread_fence(std::memory_order_seq_cst);
+  }
+
+  // commit_wait commits waiting.
+  void commit_wait(Waiter* w) {
+#ifdef __cpp_lib_atomic_wait
+    w->state.store(Waiter::kNotSignaled, std::memory_order_relaxed);
+#else
+    w->state = Waiter::kNotSignaled;
+#endif
+    // Modification epoch of this waiter.
+    uint64_t epoch =
+        (w->epoch & kEpochMask) +
+        (((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift);
+    uint64_t state = _state.load(std::memory_order_seq_cst);
+    for (;;) {
+      if (int64_t((state & kEpochMask) - epoch) < 0) {
+        // The preceding waiter has not decided on its fate. Wait until it
+        // calls either cancel_wait or commit_wait, or is notified.
+        std::this_thread::yield();
+        state = _state.load(std::memory_order_seq_cst);
+        continue;
+      }
+      // We've already been notified.
+      if (int64_t((state & kEpochMask) - epoch) > 0) return;
+      // Remove this thread from prewait counter and add it to the waiter list.
+      assert((state & kWaiterMask) != 0);
+      uint64_t newstate = state - kWaiterInc + kEpochInc;
+      //newstate = (newstate & ~kStackMask) | (w - &_waiters[0]);
+      newstate = static_cast<uint64_t>((newstate & ~kStackMask) | static_cast<uint64_t>(w - &_waiters[0]));
+      if ((state & kStackMask) == kStackMask)
+        w->next.store(nullptr, std::memory_order_relaxed);
+      else
+        w->next.store(&_waiters[state & kStackMask], std::memory_order_relaxed);
+      if (_state.compare_exchange_weak(state, newstate,
+                                       std::memory_order_release))
+        break;
+    }
+    _park(w);
+  }
+
+  // cancel_wait cancels effects of the previous prepare_wait call.
+  void cancel_wait(Waiter* w) {
+    uint64_t epoch =
+        (w->epoch & kEpochMask) +
+        (((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift);
+    uint64_t state = _state.load(std::memory_order_relaxed);
+    for (;;) {
+      if (int64_t((state & kEpochMask) - epoch) < 0) {
+        // The preceding waiter has not decided on its fate. Wait until it
+        // calls either cancel_wait or commit_wait, or is notified.
+        std::this_thread::yield();
+        state = _state.load(std::memory_order_relaxed);
+        continue;
+      }
+      // We've already been notified.
+      if (int64_t((state & kEpochMask) - epoch) > 0) return;
+      // Remove this thread from prewait counter.
+      assert((state & kWaiterMask) != 0);
+      if (_state.compare_exchange_weak(state, state - kWaiterInc + kEpochInc,
+                                       std::memory_order_relaxed))
+        return;
+    }
+  }
+
+  // notify wakes one or all waiting threads.
+  // Must be called after changing the associated wait predicate.
+  void notify(bool all) {
+    std::atomic_thread_fence(std::memory_order_seq_cst);
+    uint64_t state = _state.load(std::memory_order_acquire);
+    for (;;) {
+      // Easy case: no waiters.
+      if ((state & kStackMask) == kStackMask && (state & kWaiterMask) == 0)
+        return;
+      uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
+      uint64_t newstate;
+      if (all) {
+        // Reset prewait counter and empty wait list.
+        newstate = (state & kEpochMask) + (kEpochInc * waiters) + kStackMask;
+      } else if (waiters) {
+        // There is a thread in pre-wait state, unblock it.
+        newstate = state + kEpochInc - kWaiterInc;
+      } else {
+        // Pop a waiter from list and unpark it.
+        Waiter* w = &_waiters[state & kStackMask];
+        Waiter* wnext = w->next.load(std::memory_order_relaxed);
+        uint64_t next = kStackMask;
+        //if (wnext != nullptr) next = wnext - &_waiters[0];
+        if (wnext != nullptr) next = static_cast<uint64_t>(wnext - &_waiters[0]);
+        // Note: we don't add kEpochInc here. ABA problem on the lock-free stack
+        // can't happen because a waiter is re-pushed onto the stack only after
+        // it was in the pre-wait state which inevitably leads to epoch
+        // increment.
+        newstate = (state & kEpochMask) + next;
+      }
+      if (_state.compare_exchange_weak(state, newstate,
+                                       std::memory_order_acquire)) {
+        if (!all && waiters) return;  // unblocked pre-wait thread
+        if ((state & kStackMask) == kStackMask) return;
+        Waiter* w = &_waiters[state & kStackMask];
+        if (!all) w->next.store(nullptr, std::memory_order_relaxed);
+        _unpark(w);
+        return;
+      }
+    }
+  }
+
+  // notify n workers
+  void notify_n(size_t n) {
+    if(n >= _waiters.size()) {
+      notify(true);
+    }
+    else {
+      for(size_t k=0; k<n; ++k) {
+        notify(false);
+      }
+    }
+  }
+
+  size_t size() const {
+    return _waiters.size();
+  }
+
+ private:
+
+  // _state layout:
+  // - the low kStackBits bits hold the index of the top waiter on the stack of
+  //   waiters that have committed their wait (kStackMask means an empty stack).
+  // - the next kWaiterBits bits count the waiters in the prewait state.
+  // - the next kEpochBits bits form the modification counter (epoch).
+  static const uint64_t kStackBits = 16;
+  static const uint64_t kStackMask = (1ull << kStackBits) - 1;
+  static const uint64_t kWaiterBits = 16;
+  static const uint64_t kWaiterShift = 16;
+  static const uint64_t kWaiterMask = ((1ull << kWaiterBits) - 1)
+                                      << kWaiterShift;
+  static const uint64_t kWaiterInc = 1ull << kWaiterBits;
+  static const uint64_t kEpochBits = 32;
+  static const uint64_t kEpochShift = 32;
+  static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
+  static const uint64_t kEpochInc = 1ull << kEpochShift;
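+  // A worked decode of the layout above (illustrative only, not used by the
+  // code): with kStackBits = kWaiterBits = 16 and kEpochBits = 32, the value
+  // 0x0000000200010003 encodes epoch counter 2 (upper 32 bits), one thread in
+  // the prewait state (middle 16 bits), and waiter index 3 on top of the
+  // committed-wait stack (low 16 bits); a low 16-bit field equal to kStackMask
+  // (0xffff) would instead mean the committed-wait stack is empty.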
+  std::atomic<uint64_t> _state;
+  std::vector<Waiter> _waiters;
+
+  void _park(Waiter* w) {
+#ifdef __cpp_lib_atomic_wait
+    unsigned target = Waiter::kNotSignaled;
+    if(w->state.compare_exchange_strong(target, Waiter::kWaiting,
+                                        std::memory_order_relaxed,
+                                        std::memory_order_relaxed)) {
+      w->state.wait(Waiter::kWaiting, std::memory_order_relaxed);
+    }
+#else
+    std::unique_lock<std::mutex> lock(w->mu);
+    while (w->state != Waiter::kSignaled) {
+      w->state = Waiter::kWaiting;
+      w->cv.wait(lock);
+    }
+#endif
+  }
+
+  void _unpark(Waiter* waiters) {
+    Waiter* next = nullptr;
+    for (Waiter* w = waiters; w; w = next) {
+      next = w->next.load(std::memory_order_relaxed);
+#ifdef __cpp_lib_atomic_wait
+      // We only notify if the other thread is waiting - this is why we use a
+      // tri-state variable instead of a binary-state one (e.g., atomic_flag).
+      // Performance is about 0.1% faster.
+      if(w->state.exchange(Waiter::kSignaled, std::memory_order_relaxed) == 
+         Waiter::kWaiting) {
+        w->state.notify_one();
+      }
+#else
+      unsigned state;
+      {
+        std::unique_lock<std::mutex> lock(w->mu);
+        state = w->state;
+        w->state = Waiter::kSignaled;
+      }
+      // Avoid notifying if it wasn't waiting.
+      if (state == Waiter::kWaiting) w->cv.notify_one();
+#endif
+    }
+  }
+
+};
+
+
+
+}  // namespace tf ------------------------------------------------------------
+
diff --git a/myxpcs/include/taskflow_/core/observer.hpp b/myxpcs/include/taskflow_/core/observer.hpp
new file mode 100644
index 0000000..3c1873e
--- /dev/null
+++ b/myxpcs/include/taskflow_/core/observer.hpp
@@ -0,0 +1,1046 @@
+#pragma once
+
+#include "task.hpp"
+#include "worker.hpp"
+
+/** 
+@file observer.hpp
+@brief observer include file
+*/
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// timeline data structure
+// ----------------------------------------------------------------------------
+
+/**
+@brief default time point type of observers
+*/
+using observer_stamp_t = std::chrono::time_point<std::chrono::steady_clock>;
+
+/**
+@private
+*/
+struct Segment {
+
+  std::string name;
+  TaskType type;
+
+  observer_stamp_t beg;
+  observer_stamp_t end;
+
+  template <typename Archiver>
+  auto save(Archiver& ar) const {
+    return ar(name, type, beg, end);
+  }
+
+  template <typename Archiver>
+  auto load(Archiver& ar) {
+    return ar(name, type, beg, end);
+  }
+
+  Segment() = default;
+
+  Segment(
+    const std::string& n, TaskType t, observer_stamp_t b, observer_stamp_t e
+  ) : name {n}, type {t}, beg {b}, end {e} {
+  }
+
+  auto span() const {
+    return end-beg;
+  } 
+};
+
+/**
+@private
+*/
+struct Timeline {
+
+  size_t uid;
+
+  observer_stamp_t origin;
+  std::vector<std::vector<std::vector<Segment>>> segments;
+
+  Timeline() = default;
+
+  Timeline(const Timeline& rhs) = delete;
+  Timeline(Timeline&& rhs) = default;
+
+  Timeline& operator = (const Timeline& rhs) = delete;
+  Timeline& operator = (Timeline&& rhs) = default;
+
+  template <typename Archiver>
+  auto save(Archiver& ar) const {
+    return ar(uid, origin, segments);
+  }
+
+  template <typename Archiver>
+  auto load(Archiver& ar) {
+    return ar(uid, origin, segments);
+  }
+};  
+
+/**
+@private
+ */
+struct ProfileData {
+
+  std::vector<Timeline> timelines;
+
+  ProfileData() = default;
+
+  ProfileData(const ProfileData& rhs) = delete;
+  ProfileData(ProfileData&& rhs) = default;
+
+  ProfileData& operator = (const ProfileData& rhs) = delete;
+  ProfileData& operator = (ProfileData&&) = default;
+  
+  template <typename Archiver>
+  auto save(Archiver& ar) const {
+    return ar(timelines);
+  }
+
+  template <typename Archiver>
+  auto load(Archiver& ar) {
+    return ar(timelines);
+  }
+};
+
+// ----------------------------------------------------------------------------
+// observer interface 
+// ----------------------------------------------------------------------------
+
+/**
+@class: ObserverInterface
+
+@brief class to derive an executor observer 
+
+The tf::ObserverInterface class allows users to define custom methods to monitor 
+the behaviors of an executor. This is particularly useful when you want to 
+inspect the performance of an executor and visualize when each thread 
+participates in the execution of a task.
+To prevent users from directly accessing the internal threads and tasks,
+tf::ObserverInterface provides immutable wrappers,
+tf::WorkerView and tf::TaskView, over workers and tasks.
+
+Please refer to tf::WorkerView and tf::TaskView for details.
+
+Example usage:
+
+@code{.cpp}
+
+struct MyObserver : public tf::ObserverInterface {
+
+  MyObserver(const std::string& name) {
+    std::cout << "constructing observer " << name << '\n';
+  }
+
+  void set_up(size_t num_workers) override final {
+    std::cout << "setting up observer with " << num_workers << " workers\n";
+  }
+
+  void on_entry(WorkerView w, tf::TaskView tv) override final {
+    std::ostringstream oss;
+    oss << "worker " << w.id() << " ready to run " << tv.name() << '\n';
+    std::cout << oss.str();
+  }
+
+  void on_exit(WorkerView w, tf::TaskView tv) override final {
+    std::ostringstream oss;
+    oss << "worker " << w.id() << " finished running " << tv.name() << '\n';
+    std::cout << oss.str();
+  }
+};
+  
+tf::Taskflow taskflow;
+tf::Executor executor;
+
+// insert tasks into taskflow
+// ...
+  
+// create a custom observer
+std::shared_ptr<MyObserver> observer = executor.make_observer<MyObserver>("MyObserver");
+
+// run the taskflow
+executor.run(taskflow).wait();
+@endcode
+*/
+class ObserverInterface {
+
+  public:
+
+  /**
+  @brief virtual destructor
+  */
+  virtual ~ObserverInterface() = default;
+  
+  /**
+  @brief constructor-like method to call when the executor observer is fully created
+  @param num_workers the number of the worker threads in the executor
+  */
+  virtual void set_up(size_t num_workers) = 0;
+  
+  /**
+  @brief method to call before a worker thread executes a closure 
+  @param wv an immutable view of this worker thread 
+  @param task_view a constant wrapper object to the task 
+  */
+  virtual void on_entry(WorkerView wv, TaskView task_view) = 0;
+  
+  /**
+  @brief method to call after a worker thread executed a closure
+  @param wv an immutable view of this worker thread
+  @param task_view a constant wrapper object to the task
+  */
+  virtual void on_exit(WorkerView wv, TaskView task_view) = 0;
+};
+
+// ----------------------------------------------------------------------------
+// ChromeObserver definition
+// ----------------------------------------------------------------------------
+
+/**
+@class: ChromeObserver
+
+@brief class to create an observer based on Chrome tracing format
+
+A tf::ChromeObserver inherits tf::ObserverInterface and defines methods to dump
+the observed thread activities into a format that can be visualized through
+@ChromeTracing.
+
+@code{.cpp}
+tf::Taskflow taskflow;
+tf::Executor executor;
+
+// insert tasks into taskflow
+// ...
+  
+// create a custom observer
+std::shared_ptr<tf::ChromeObserver> observer = executor.make_observer<tf::ChromeObserver>();
+
+// run the taskflow
+executor.run(taskflow).wait();
+
+// dump the thread activities to a chrome-tracing format.
+observer->dump(std::cout);
+@endcode
+*/
+class ChromeObserver : public ObserverInterface {
+
+  friend class Executor;
+  
+  // data structure to record each task execution
+  struct Segment {
+
+    std::string name;
+
+    observer_stamp_t beg;
+    observer_stamp_t end;
+
+    Segment(
+      const std::string& n,
+      observer_stamp_t b,
+      observer_stamp_t e
+    );
+  };
+  
+  // data structure to store the entire execution timeline
+  struct Timeline {
+    observer_stamp_t origin;
+    std::vector<std::vector<Segment>> segments;
+    std::vector<std::stack<observer_stamp_t>> stacks;
+  };  
+
+  public:
+
+    /**
+    @brief dumps the timelines into a @ChromeTracing format through 
+           an output stream 
+    */
+    void dump(std::ostream& ostream) const;
+
+    /**
+    @brief dumps the timelines into a @ChromeTracing format
+    */
+    inline std::string dump() const;
+
+    /**
+    @brief clears the timeline data
+    */
+    inline void clear();
+
+    /**
+    @brief queries the number of tasks observed
+    */
+    inline size_t num_tasks() const;
+
+  private:
+    
+    inline void set_up(size_t num_workers) override final;
+    inline void on_entry(WorkerView w, TaskView task_view) override final;
+    inline void on_exit(WorkerView w, TaskView task_view) override final;
+
+    Timeline _timeline;
+};  
+    
+// constructor
+inline ChromeObserver::Segment::Segment(
+  const std::string& n, observer_stamp_t b, observer_stamp_t e
+) :
+  name {n}, beg {b}, end {e} {
+}
+
+// Procedure: set_up
+inline void ChromeObserver::set_up(size_t num_workers) {
+  _timeline.segments.resize(num_workers);
+  _timeline.stacks.resize(num_workers);
+
+  for(size_t w=0; w<num_workers; ++w) {
+    _timeline.segments[w].reserve(32);
+  }
+  
+  _timeline.origin = observer_stamp_t::clock::now();
+}
+
+// Procedure: on_entry
+inline void ChromeObserver::on_entry(WorkerView wv, TaskView) {
+  _timeline.stacks[wv.id()].push(observer_stamp_t::clock::now());
+}
+
+// Procedure: on_exit
+inline void ChromeObserver::on_exit(WorkerView wv, TaskView tv) {
+
+  size_t w = wv.id();
+
+  assert(!_timeline.stacks[w].empty());
+
+  auto beg = _timeline.stacks[w].top();
+  _timeline.stacks[w].pop();
+
+  _timeline.segments[w].emplace_back(
+    tv.name(), beg, observer_stamp_t::clock::now()
+  );
+}
+
+// Function: clear
+inline void ChromeObserver::clear() {
+  for(size_t w=0; w<_timeline.segments.size(); ++w) {
+    _timeline.segments[w].clear();
+    while(!_timeline.stacks[w].empty()) {
+      _timeline.stacks[w].pop();
+    }
+  }
+}
+
+// Procedure: dump
+inline void ChromeObserver::dump(std::ostream& os) const {
+
+  using namespace std::chrono;
+
+  size_t first;
+
+  for(first = 0; first<_timeline.segments.size(); ++first) {
+    if(_timeline.segments[first].size() > 0) { 
+      break; 
+    }
+  }
+
+  os << '[';
+
+  for(size_t w=first; w<_timeline.segments.size(); w++) {
+
+    if(w != first && _timeline.segments[w].size() > 0) {
+      os << ',';
+    }
+
+    for(size_t i=0; i<_timeline.segments[w].size(); i++) {
+
+      os << '{'<< "\"cat\":\"ChromeObserver\",";
+
+      // name field
+      os << "\"name\":\"";
+      if(_timeline.segments[w][i].name.empty()) {
+        os << w << '_' << i;
+      }
+      else {
+        os << _timeline.segments[w][i].name;
+      }
+      os << "\",";
+      
+      // segment field
+      os << "\"ph\":\"X\","
+         << "\"pid\":1,"
+         << "\"tid\":" << w << ','
+         << "\"ts\":" << duration_cast<microseconds>(
+                           _timeline.segments[w][i].beg - _timeline.origin
+                         ).count() << ','
+         << "\"dur\":" << duration_cast<microseconds>(
+                           _timeline.segments[w][i].end - _timeline.segments[w][i].beg
+                         ).count();
+
+      if(i != _timeline.segments[w].size() - 1) {
+        os << "},";
+      }
+      else {
+        os << '}';
+      }
+    }
+  }
+  os << "]\n";
+}
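+
+// For reference, a single dumped segment looks roughly like the following
+// (an illustrative sample with made-up name and timing values):
+//   {"cat":"ChromeObserver","name":"A","ph":"X","pid":1,"tid":0,"ts":12,"dur":34}
+// which chrome://tracing renders as one bar on worker 0's lane, with ts/dur
+// given in microseconds relative to the observer's origin time point.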
+
+// Function: dump
+inline std::string ChromeObserver::dump() const {
+  std::ostringstream oss;
+  dump(oss);
+  return oss.str();
+}
+
+// Function: num_tasks
+inline size_t ChromeObserver::num_tasks() const {
+  return std::accumulate(
+    _timeline.segments.begin(), _timeline.segments.end(), size_t{0}, 
+    [](size_t sum, const auto& exe){ 
+      return sum + exe.size(); 
+    }
+  );
+}
+
+// ----------------------------------------------------------------------------
+// TFProfObserver definition
+// ----------------------------------------------------------------------------
+
+/**
+@class TFProfObserver
+
+@brief class to create an observer based on the built-in taskflow profiler format
+
+A tf::TFProfObserver inherits tf::ObserverInterface and defines methods to dump
+the observed thread activities into a format that can be visualized through
+@TFProf.
+
+@code{.cpp}
+tf::Taskflow taskflow;
+tf::Executor executor;
+
+// insert tasks into taskflow
+// ...
+  
+// create a custom observer
+std::shared_ptr<tf::TFProfObserver> observer = executor.make_observer<tf::TFProfObserver>();
+
+// run the taskflow
+executor.run(taskflow).wait();
+
+// dump the thread activities to Taskflow Profiler format.
+observer->dump(std::cout);
+@endcode
+
+*/
+class TFProfObserver : public ObserverInterface {
+
+  friend class Executor;
+  friend class TFProfManager;
+
+  /** @private overall task summary */
+  struct TaskSummary {
+    size_t count {0};
+    size_t total_span {0};
+    size_t min_span;
+    size_t max_span;
+    
+    float avg_span() const { return total_span * 1.0f / count; }
+  };
+
+  /** @private worker summary at a level */
+  struct WorkerSummary {
+
+    size_t id;
+    size_t level;
+    size_t count {0};
+    size_t total_span {0};
+    size_t min_span{0};
+    size_t max_span{0};
+
+    std::array<TaskSummary, TASK_TYPES.size()> tsum;
+
+    float avg_span() const { return total_span * 1.0f / count; }
+    //return count < 2 ? 0.0f : total_delay * 1.0f / (count-1); 
+  };
+  
+  /** @private */
+  struct Summary {
+    std::array<TaskSummary, TASK_TYPES.size()> tsum;
+    std::vector<WorkerSummary> wsum;
+    
+    void dump_tsum(std::ostream&) const;
+    void dump_wsum(std::ostream&) const;
+    void dump(std::ostream&) const;
+  };
+
+  public:
+
+    /**
+    @brief dumps the timelines into a @TFProf format through 
+           an output stream
+    */
+    void dump(std::ostream& ostream) const;
+
+    /**
+    @brief dumps the timelines into a JSON string
+    */
+    std::string dump() const;
+
+    /**
+    @brief shows the summary report through an output stream
+    */
+    void summary(std::ostream& ostream) const;
+
+    /**
+    @brief returns the summary report in a string
+    */
+    std::string summary() const;
+
+    /**
+    @brief clears the timeline data
+    */
+    void clear();
+
+    /**
+    @brief queries the number of tasks observed
+    */
+    size_t num_tasks() const;
+    
+    /**
+    @brief queries the number of observed workers
+    */
+    size_t num_workers() const;
+
+  private:
+    
+    Timeline _timeline;
+  
+    std::vector<std::stack<observer_stamp_t>> _stacks;
+    
+    inline void set_up(size_t num_workers) override final;
+    inline void on_entry(WorkerView, TaskView) override final;
+    inline void on_exit(WorkerView, TaskView) override final;
+};  
+
+
+// dump the task summary
+inline void TFProfObserver::Summary::dump_tsum(std::ostream& os) const {
+
+  // task summary
+  size_t type_w{10}, count_w{5}, time_w{9}, avg_w{8}, min_w{8}, max_w{8};
+
+  std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){
+    if(i.count == 0) return;
+    count_w = std::max(count_w, std::to_string(i.count).size());
+  });
+  
+  std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){
+    if(i.count == 0) return;
+    time_w = std::max(time_w, std::to_string(i.total_span).size());
+  });
+  
+  std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){
+    if(i.count == 0) return;
+    avg_w = std::max(avg_w, std::to_string(i.avg_span()).size());
+  });
+  
+  std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){
+    if(i.count == 0) return;
+    min_w = std::max(min_w, std::to_string(i.min_span).size());
+  });
+  
+  std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){
+    if(i.count == 0) return;
+    max_w = std::max(max_w, std::to_string(i.max_span).size());
+  });
+
+  os << std::setw(type_w) << "-Task-" 
+     << std::setw(count_w+2) << "Count"
+     << std::setw(time_w+2) << "Time (us)"
+     << std::setw(avg_w+2) << "Avg (us)"
+     << std::setw(min_w+2) << "Min (us)"
+     << std::setw(max_w+2) << "Max (us)"
+     << '\n';
+
+  for(size_t i=0; i<TASK_TYPES.size(); i++) {
+    if(tsum[i].count == 0) {
+      continue;
+    }
+    os << std::setw(type_w) << to_string(TASK_TYPES[i])
+       << std::setw(count_w+2) << tsum[i].count
+       << std::setw(time_w+2) << tsum[i].total_span
+       << std::setw(avg_w+2) << std::to_string(tsum[i].avg_span())
+       << std::setw(min_w+2) << tsum[i].min_span
+       << std::setw(max_w+2) << tsum[i].max_span
+       << '\n';
+  }
+}
+
+// dump the worker summary
+inline void TFProfObserver::Summary::dump_wsum(std::ostream& os) const {
+  
+  // worker summary
+  size_t w_w{10}, t_w{10}, l_w{5}, c_w{5}, d_w{9}, avg_w{8}, min_w{8}, max_w{8};
+
+  std::for_each(wsum.begin(), wsum.end(), [&](const auto& i){
+    if(i.count == 0) return;
+    l_w = std::max(l_w, std::to_string(i.level).size());
+  });
+  
+  std::for_each(wsum.begin(), wsum.end(), [&](const auto& i){
+    if(i.count == 0) return;
+    c_w = std::max(c_w, std::to_string(i.count).size());
+  });
+  
+  std::for_each(wsum.begin(), wsum.end(), [&](const auto& i){
+    if(i.count == 0) return;
+    d_w = std::max(d_w, std::to_string(i.total_span).size());
+  });
+  
+  std::for_each(wsum.begin(), wsum.end(), [&](const auto& i){
+    if(i.count == 0) return;
+    avg_w = std::max(avg_w, std::to_string(i.avg_span()).size());
+  });
+  
+  std::for_each(wsum.begin(), wsum.end(), [&](const auto& i){
+    if(i.count == 0) return;
+    min_w = std::max(min_w, std::to_string(i.min_span).size());
+  });
+  
+  std::for_each(wsum.begin(), wsum.end(), [&](const auto& i){
+    if(i.count == 0) return;
+    max_w = std::max(max_w, std::to_string(i.max_span).size());
+  });
+  
+  os << std::setw(w_w) << "-Worker-" 
+     << std::setw(l_w+2) << "Level"
+     << std::setw(t_w) << "Task"
+     << std::setw(c_w+2) << "Count"
+     << std::setw(d_w+2) << "Time (us)"
+     << std::setw(avg_w+2) << "Avg (us)"
+     << std::setw(min_w+2) << "Min (us)"
+     << std::setw(max_w+2) << "Max (us)"
+     << '\n';
+
+  for(const auto& ws : wsum) {
+
+    if(ws.count == 0) {
+      continue;
+    }
+
+    os << std::setw(w_w) << ws.id
+       << std::setw(l_w+2) << ws.level;
+    
+    bool first = true;
+    for(size_t i=0; i<TASK_TYPES.size(); i++) {
+
+      if(ws.tsum[i].count == 0) {
+        continue;
+      }
+
+      os << (first ? std::setw(t_w) : std::setw(w_w + l_w + 2 + t_w));
+      first = false;
+
+      os << to_string(TASK_TYPES[i])
+         << std::setw(c_w+2) << ws.tsum[i].count
+         << std::setw(d_w+2) << ws.tsum[i].total_span
+         << std::setw(avg_w+2) << std::to_string(ws.tsum[i].avg_span())
+         << std::setw(min_w+2) << ws.tsum[i].min_span
+         << std::setw(max_w+2) << ws.tsum[i].max_span
+         << '\n';
+    }
+
+    // per-worker summary
+    os << std::setw(w_w + l_w + t_w + c_w + 4) << ws.count
+       << std::setw(d_w+2) << ws.total_span
+       << std::setw(avg_w+2) << std::to_string(ws.avg_span())
+       << std::setw(min_w+2) << ws.min_span
+       << std::setw(max_w+2) << ws.max_span
+       << '\n';
+    
+    //for(size_t j=0; j<w_w+l_w+t_w+4; j++) os << ' ';
+    //for(size_t j=0; j<c_w+d_w+avg_w+min_w+max_w+8; j++) os << '-';
+    //os <<'\n';
+  }
+}
+
+// dump the summary report through an ostream
+inline void TFProfObserver::Summary::dump(std::ostream& os) const {
+  dump_tsum(os);
+  os << '\n';
+  dump_wsum(os);
+}
+
+// Procedure: set_up
+inline void TFProfObserver::set_up(size_t num_workers) {
+  _timeline.uid = unique_id<size_t>();
+  _timeline.origin = observer_stamp_t::clock::now();
+  _timeline.segments.resize(num_workers);
+  _stacks.resize(num_workers);
+}
+
+// Procedure: on_entry
+inline void TFProfObserver::on_entry(WorkerView wv, TaskView) {
+  _stacks[wv.id()].push(observer_stamp_t::clock::now());
+}
+
+// Procedure: on_exit
+inline void TFProfObserver::on_exit(WorkerView wv, TaskView tv) {
+
+  size_t w = wv.id();
+
+  assert(!_stacks[w].empty());
+  
+  if(_stacks[w].size() > _timeline.segments[w].size()) {
+    _timeline.segments[w].resize(_stacks[w].size());
+  }
+
+  auto beg = _stacks[w].top();
+  _stacks[w].pop();
+
+  _timeline.segments[w][_stacks[w].size()].emplace_back(
+    tv.name(), tv.type(), beg, observer_stamp_t::clock::now()
+  );
+}
+
+// Function: clear
+inline void TFProfObserver::clear() {
+  for(size_t w=0; w<_timeline.segments.size(); ++w) {
+    for(size_t l=0; l<_timeline.segments[w].size(); ++l) {
+      _timeline.segments[w][l].clear();
+    }
+    while(!_stacks[w].empty()) {
+      _stacks[w].pop();
+    }
+  }
+}
+
+// Procedure: dump
+inline void TFProfObserver::dump(std::ostream& os) const {
+
+  using namespace std::chrono;
+
+  size_t first;
+
+  for(first = 0; first<_timeline.segments.size(); ++first) {
+    if(_timeline.segments[first].size() > 0) { 
+      break; 
+    }
+  }
+  
+  // no timeline data to dump
+  if(first == _timeline.segments.size()) {
+    os << "{}\n";
+    return;
+  }
+
+  os << "{\"executor\":\"" << _timeline.uid << "\",\"data\":[";
+
+  bool comma = false;
+
+  for(size_t w=first; w<_timeline.segments.size(); w++) {
+    for(size_t l=0; l<_timeline.segments[w].size(); l++) {
+
+      if(_timeline.segments[w][l].empty()) {
+        continue;
+      }
+
+      if(comma) {
+        os << ',';
+      }
+      else {
+        comma = true;
+      }
+
+      os << "{\"worker\":" << w << ",\"level\":" << l << ",\"data\":[";
+      for(size_t i=0; i<_timeline.segments[w][l].size(); ++i) {
+
+        const auto& s = _timeline.segments[w][l][i];
+
+        if(i) os << ',';
+        
+        // span 
+        os << "{\"span\":[" 
+           << duration_cast<microseconds>(s.beg - _timeline.origin).count() 
+           << ","
+           << duration_cast<microseconds>(s.end - _timeline.origin).count() 
+           << "],";
+        
+        // name
+        os << "\"name\":\""; 
+        if(s.name.empty()) {
+          os << w << '_' << i;
+        }
+        else {
+          os << s.name;
+        }
+        os << "\",";
+    
+        // e.g., category "type": "Condition Task"
+        os << "\"type\":\"" << to_string(s.type) << "\"";
+
+        os << "}";
+      }
+      os << "]}";
+    }
+  }
+
+  os << "]}\n";
+}
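+
+// For reference, the dumped JSON has roughly the following shape
+// (an illustrative sample with made-up values):
+//   {"executor":"1","data":[{"worker":0,"level":0,"data":[
+//     {"span":[12,34],"name":"A","type":"static"}]}]}
+// where each span is [begin, end] in microseconds relative to the observer's
+// origin time point.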
+
+// Function: dump
+inline std::string TFProfObserver::dump() const {
+  std::ostringstream oss;
+  dump(oss);
+  return oss.str();
+}
+
+// Procedure: summary
+inline void TFProfObserver::summary(std::ostream& os) const {
+
+  using namespace std::chrono;
+  
+  Summary summary;
+  std::optional<observer_stamp_t> view_beg, view_end;
+
+  // find the first non-empty worker
+  size_t first;
+  for(first = 0; first<_timeline.segments.size(); ++first) {
+    if(_timeline.segments[first].size() > 0) { 
+      break; 
+    }
+  }
+  
+  // no timeline data to dump
+  if(first == _timeline.segments.size()) {
+    goto end_of_summary;
+  }
+
+  for(size_t w=first; w<_timeline.segments.size(); w++) {
+    for(size_t l=0; l<_timeline.segments[w].size(); l++) {
+
+      if(_timeline.segments[w][l].empty()) {
+        continue;
+      }
+
+      // worker w at level l
+      WorkerSummary ws;
+      ws.id = w;
+      ws.level = l;
+      ws.count = _timeline.segments[w][l].size();
+      
+      // scan all tasks at level l
+      for(size_t i=0; i<_timeline.segments[w][l].size(); ++i) {
+        
+        // update the entire span
+        auto& s = _timeline.segments[w][l][i];
+        view_beg = view_beg ? std::min(*view_beg, s.beg) : s.beg;
+        view_end = view_end ? std::max(*view_end, s.end) : s.end;
+        
+        // update the task summary
+        size_t t = duration_cast<microseconds>(s.end - s.beg).count();
+
+        auto& x = summary.tsum[static_cast<int>(s.type)];
+        x.count += 1;
+        x.total_span += t;
+        x.min_span = (x.count == 1) ? t : std::min(t, x.min_span);
+        x.max_span = (x.count == 1) ? t : std::max(t, x.max_span);
+
+        // update the worker summary
+        ws.total_span += t;
+        ws.min_span = (i == 0) ? t : std::min(t, ws.min_span);
+        ws.max_span = (i == 0) ? t : std::max(t, ws.max_span);
+
+        auto& y = ws.tsum[static_cast<int>(s.type)];
+        y.count += 1;
+        y.total_span += t;
+        y.min_span = (y.count == 1) ? t : std::min(t, y.min_span);
+        y.max_span = (y.count == 1) ? t : std::max(t, y.max_span);
+        
+        // update the delay
+        //if(i) {
+        //  size_t d = duration_cast<nanoseconds>(
+        //    s.beg - _timeline.segments[w][l][i-1].end
+        //  ).count();
+        //  ws.total_delay += d;
+        //  ws.min_delay = (i == 1) ? d : std::min(ws.min_delay, d);
+        //  ws.max_delay = (i == 1) ? d : std::max(ws.max_delay, d);
+        //}
+      }
+      summary.wsum.push_back(ws);
+    }
+  }
+
+  end_of_summary:
+
+  size_t view = 0;
+  if(view_beg && view_end) {
+    view = duration_cast<microseconds>(*view_end - *view_beg).count();
+  }
+
+  os << "==Observer " << _timeline.uid << ": "
+     << num_workers() << " workers completed "
+     << num_tasks() << " tasks in "
+     << view << " us\n";
+
+  summary.dump(os);
+}
+
+// Procedure: summary
+inline std::string TFProfObserver::summary() const {
+  std::ostringstream oss;
+  summary(oss);
+  return oss.str();
+}
+
+// Function: num_tasks
+inline size_t TFProfObserver::num_tasks() const {
+  size_t s = 0;
+  for(size_t w=0; w<_timeline.segments.size(); ++w) {
+    for(size_t l=0; l<_timeline.segments[w].size(); ++l) {
+      s += _timeline.segments[w][l].size();
+    }
+  }
+  return s;
+}
+  
+// Function: num_workers
+inline size_t TFProfObserver::num_workers() const {
+  size_t w = 0;
+  for(size_t i=0; i<_timeline.segments.size(); ++i) {
+    w += (!_timeline.segments[i].empty());
+  }
+  return w;
+}
+
+
+// ----------------------------------------------------------------------------
+// TFProfManager
+// ----------------------------------------------------------------------------
+
+/**
+@private
+*/
+class TFProfManager {
+
+  friend class Executor;
+
+  public:
+    
+    ~TFProfManager();
+    
+    TFProfManager(const TFProfManager&) = delete;
+    TFProfManager& operator=(const TFProfManager&) = delete;
+
+    static TFProfManager& get();
+
+    void dump(std::ostream& ostream) const;
+
+  private:
+    
+    const std::string _fpath;
+
+    std::mutex _mutex;
+    std::vector<std::shared_ptr<TFProfObserver>> _observers;
+    
+    TFProfManager();
+
+    void _manage(std::shared_ptr<TFProfObserver> observer);
+};
+
+// constructor
+inline TFProfManager::TFProfManager() :
+  _fpath {get_env(TF_ENABLE_PROFILER)} {
+
+}
+
+// Procedure: manage
+inline void TFProfManager::_manage(std::shared_ptr<TFProfObserver> observer) {
+  std::lock_guard lock(_mutex);
+  _observers.push_back(std::move(observer));
+}
+
+// Procedure: dump
+inline void TFProfManager::dump(std::ostream& os) const {
+  for(size_t i=0; i<_observers.size(); ++i) {
+    if(i) os << ',';
+    _observers[i]->dump(os); 
+  }
+}
+
+// Destructor
+inline TFProfManager::~TFProfManager() {
+  std::ofstream ofs(_fpath);
+  if(ofs) {
+    // .tfp
+    if(_fpath.rfind(".tfp") != std::string::npos) {
+      ProfileData data;
+      data.timelines.reserve(_observers.size());
+      for(size_t i=0; i<_observers.size(); ++i) {
+        data.timelines.push_back(std::move(_observers[i]->_timeline));
+      }
+      Serializer<std::ofstream> serializer(ofs); 
+      serializer(data);
+    }
+    // .json
+    else { // if(_fpath.rfind(".json") != std::string::npos) {
+      ofs << "[\n";
+      for(size_t i=0; i<_observers.size(); ++i) {
+        if(i) ofs << ',';
+        _observers[i]->dump(ofs);
+      }
+      ofs << "]\n";
+    }
+  }
+  // do a summary report in stderr for each observer
+  else {
+    std::ostringstream oss;
+    for(size_t i=0; i<_observers.size(); ++i) {
+      _observers[i]->summary(oss);
+    }
+    fprintf(stderr, "%s", oss.str().c_str());
+  }
+}
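+
+// Usage note (a sketch inferred from the destructor above, not an extra API,
+// and assuming TF_ENABLE_PROFILER expands to the environment variable of the
+// same name): profiling data is written when this singleton is destroyed at
+// program exit. For example,
+//   TF_ENABLE_PROFILER=profile.json ./my_app    (my_app is a placeholder)
+// serializes every managed TFProfObserver timeline to profile.json, while a
+// path ending in ".tfp" selects the binary ProfileData archive; if the file
+// cannot be opened (e.g., the variable is unset), each observer's summary is
+// printed to stderr instead.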
+    
+// Function: get
+inline TFProfManager& TFProfManager::get() {
+  static TFProfManager mgr;
+  return mgr;
+}
+
+// ----------------------------------------------------------------------------
+// Identifier for Each Built-in Observer
+// ----------------------------------------------------------------------------
+
+/** @enum ObserverType
+
+@brief enumeration of all observer types
+
+*/
+enum class ObserverType : int {
+  TFPROF = 0,
+  CHROME,
+  UNDEFINED
+};
+
+/**
+@brief convert an observer type to a human-readable string
+*/
+inline const char* to_string(ObserverType type) {
+  switch(type) {
+    case ObserverType::TFPROF: return "tfprof";
+    case ObserverType::CHROME: return "chrome";
+    default:                   return "undefined";
+  }
+}
+
+
+}  // end of namespace tf -----------------------------------------------------
+
+
diff --git a/myxpcs/include/taskflow_/core/semaphore.hpp b/myxpcs/include/taskflow_/core/semaphore.hpp
new file mode 100644
index 0000000..12d6069
--- /dev/null
+++ b/myxpcs/include/taskflow_/core/semaphore.hpp
@@ -0,0 +1,132 @@
+#pragma once
+
+#include <vector>
+#include <mutex>
+
+#include "declarations.hpp"
+
+/**
+@file semaphore.hpp
+@brief semaphore include file
+*/
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// Semaphore
+// ----------------------------------------------------------------------------
+
+/**
+@class Semaphore
+
+@brief class to create a semaphore object for building a concurrency constraint
+
+A semaphore creates a constraint that limits the maximum concurrency,
+i.e., the number of workers, in a set of tasks.
+You can let a task acquire/release one or multiple semaphores before/after
+executing its work.
+A task can acquire and release a semaphore,
+or just acquire or just release it.
+A tf::Semaphore object starts with an initial count.
+As long as that count is above 0, tasks can acquire the semaphore and do
+their work.
+If the count is 0, a task trying to acquire the semaphore will not run;
+instead, it goes to the waiting list of that semaphore.
+When the semaphore is released by another task,
+it reschedules all tasks on that waiting list.
+
+@code{.cpp}
+tf::Executor executor(8);   // create an executor of 8 workers
+tf::Taskflow taskflow;
+
+tf::Semaphore semaphore(1); // create a semaphore with initial count 1
+
+std::vector<tf::Task> tasks {
+  taskflow.emplace([](){ std::cout << "A" << std::endl; }),
+  taskflow.emplace([](){ std::cout << "B" << std::endl; }),
+  taskflow.emplace([](){ std::cout << "C" << std::endl; }),
+  taskflow.emplace([](){ std::cout << "D" << std::endl; }),
+  taskflow.emplace([](){ std::cout << "E" << std::endl; })
+};
+
+for(auto & task : tasks) {  // each task acquires and releases the semaphore
+  task.acquire(semaphore);
+  task.release(semaphore);
+}
+
+executor.run(taskflow).wait();
+@endcode
+
+The above example creates five tasks with no dependencies between them.
+Under normal circumstances, the five tasks would be executed concurrently.
+However, this example has a semaphore with initial count 1,
+and all tasks need to acquire that semaphore before running and release that
+semaphore after they are done.
+This arrangement limits the number of concurrently running tasks to only one.
+
+*/
+class Semaphore {
+
+  friend class Node;
+
+  public:
+
+    /**
+    @brief constructs a semaphore with the given counter
+
+    A semaphore creates a constraint that limits the maximum concurrency,
+    i.e., the number of workers, in a set of tasks.
+
+    @code{.cpp}
+    tf::Semaphore semaphore(4);  // concurrency constraint of 4 workers
+    @endcode
+    */
+    explicit Semaphore(size_t max_workers);
+
+    /**
+    @brief queries the counter value (not thread-safe during the run)
+    */
+    size_t count() const;
+
+  private:
+
+    std::mutex _mtx;
+
+    size_t _counter;
+
+    std::vector<Node*> _waiters;
+
+    bool _try_acquire_or_wait(Node*);
+
+    std::vector<Node*> _release();
+};
+
+inline Semaphore::Semaphore(size_t max_workers) :
+  _counter(max_workers) {
+}
+
+inline bool Semaphore::_try_acquire_or_wait(Node* me) {
+  std::lock_guard<std::mutex> lock(_mtx);
+  if(_counter > 0) {
+    --_counter;
+    return true;
+  }
+  else {
+    _waiters.push_back(me);
+    return false;
+  }
+}
+
+inline std::vector<Node*> Semaphore::_release() {
+  std::lock_guard<std::mutex> lock(_mtx);
+  ++_counter;
+  std::vector<Node*> r{std::move(_waiters)};
+  return r;
+}
+
+inline size_t Semaphore::count() const {
+  return _counter;
+}
+
+}  // end of namespace tf. ---------------------------------------------------
+
diff --git a/myxpcs/include/taskflow_/core/task.hpp b/myxpcs/include/taskflow_/core/task.hpp
new file mode 100644
index 0000000..f69d9a6
--- /dev/null
+++ b/myxpcs/include/taskflow_/core/task.hpp
@@ -0,0 +1,776 @@
+#pragma once
+
+#include "graph.hpp"
+
+/**
+@file task.hpp
+@brief task include file
+*/
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// Task Types
+// ----------------------------------------------------------------------------
+
+/**
+@enum TaskType
+
+@brief enumeration of all task types
+*/
+enum class TaskType : int {
+  /** @brief placeholder task type */
+  PLACEHOLDER = 0,
+  /** @brief static task type */
+  STATIC,
+  /** @brief dynamic (subflow) task type */
+  DYNAMIC,
+  /** @brief condition task type */
+  CONDITION,
+  /** @brief module task type */
+  MODULE,
+  /** @brief asynchronous task type */
+  ASYNC,
+  /** @brief undefined task type (for internal use only) */
+  UNDEFINED
+};
+
+/**
+@private
+@brief array of all task types (used for iterating task types)
+*/
+inline constexpr std::array<TaskType, 6> TASK_TYPES = {
+  TaskType::PLACEHOLDER,
+  TaskType::STATIC,
+  TaskType::DYNAMIC,
+  TaskType::CONDITION,
+  TaskType::MODULE,
+  TaskType::ASYNC,
+};
+
+/**
+@brief convert a task type to a human-readable string
+
+Each task type maps to a human-readable, lower-case string as follows:
+
+@code{.cpp}
+TaskType::PLACEHOLDER     ->  "placeholder"
+TaskType::STATIC          ->  "static"
+TaskType::DYNAMIC         ->  "subflow"
+TaskType::CONDITION       ->  "condition"
+TaskType::MODULE          ->  "module"
+TaskType::ASYNC           ->  "async"
+@endcode
+*/
+inline const char* to_string(TaskType type) {
+
+  const char* val;
+
+  switch(type) {
+    case TaskType::PLACEHOLDER:      val = "placeholder";     break;
+    case TaskType::STATIC:           val = "static";          break;
+    case TaskType::DYNAMIC:          val = "subflow";         break;
+    case TaskType::CONDITION:        val = "condition";       break;
+    case TaskType::MODULE:           val = "module";          break;
+    case TaskType::ASYNC:            val = "async";           break;
+    default:                         val = "undefined";       break;
+  }
+
+  return val;
+}
+
+// ----------------------------------------------------------------------------
+// Task Traits
+// ----------------------------------------------------------------------------
+
+/**
+@brief determines if a callable is a dynamic task
+
+A dynamic task is a callable object constructible from std::function<void(Subflow&)>.
+*/
+template <typename C>
+constexpr bool is_dynamic_task_v = 
+  std::is_invocable_r_v<void, C, Subflow&> &&
+  !std::is_invocable_r_v<void, C, Runtime&>;
+
+/**
+@brief determines if a callable is a condition task
+
+A condition task is a callable object constructible from std::function<int()>
+or std::function<int(tf::Runtime&)>.
+*/
+template <typename C>
+constexpr bool is_condition_task_v = 
+  (std::is_invocable_r_v<int, C> || std::is_invocable_r_v<int, C, Runtime&>) &&
+  !is_dynamic_task_v<C>;
+
+/**
+@brief determines if a callable is a multi-condition task
+
+A multi-condition task is a callable object constructible from
+std::function<tf::SmallVector<int>()> or
+std::function<tf::SmallVector<int>(tf::Runtime&)>.
+*/
+template <typename C>
+constexpr bool is_multi_condition_task_v =
+  (std::is_invocable_r_v<SmallVector<int>, C> ||
+  std::is_invocable_r_v<SmallVector<int>, C, Runtime&>) &&
+  !is_dynamic_task_v<C>;
+
+/**
+@brief determines if a callable is a static task
+
+A static task is a callable object constructible from std::function<void()>
+or std::function<void(tf::Runtime&)>.
+*/
+template <typename C>
+constexpr bool is_static_task_v =
+  (std::is_invocable_r_v<void, C> || std::is_invocable_r_v<void, C, Runtime&>) &&
+  !is_condition_task_v<C> &&
+  !is_multi_condition_task_v<C> &&
+  !is_dynamic_task_v<C>;
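+
+// A brief illustration (not part of the original header) of how example
+// callables map to the traits above, assuming the usual tf::Runtime,
+// tf::Subflow, and tf::SmallVector types:
+//
+//   [](){}                                        -> static task
+//   [](tf::Runtime&){}                            -> static task
+//   [](tf::Subflow&){}                            -> dynamic (subflow) task
+//   []() -> int { return 0; }                     -> condition task
+//   []() -> tf::SmallVector<int> { return {0}; }  -> multi-condition task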
+
+// ----------------------------------------------------------------------------
+// Task
+// ----------------------------------------------------------------------------
+
+/**
+@class Task
+
+@brief class to create a task handle over a node in a taskflow graph
+
+A task is a wrapper over a node in a taskflow graph.
+It provides a set of methods for users to access and modify the attributes of
+the associated node in the taskflow graph.
+A task is a very lightweight object (i.e., it only stores a node pointer) that
+can be trivially copied around,
+and it does not own the lifetime of the associated node.
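+
+A minimal usage sketch (assuming the usual tf::Taskflow builder interface):
+
+@code{.cpp}
+tf::Taskflow taskflow;
+tf::Task A = taskflow.emplace([](){}).name("A");
+tf::Task B = taskflow.emplace([](){}).name("B");
+A.precede(B);  // task A runs before task B; the taskflow owns both nodes
+@endcode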
+*/
+class Task {
+
+  friend class FlowBuilder;
+  friend class Runtime;
+  friend class Taskflow;
+  friend class TaskView;
+  friend class Executor;
+
+  public:
+
+    /**
+    @brief constructs an empty task
+    */
+    Task() = default;
+
+    /**
+    @brief constructs the task with the copy of the other task
+    */
+    Task(const Task& other);
+
+    /**
+    @brief replaces the contents with a copy of the other task
+    */
+    Task& operator = (const Task&);
+
+    /**
+    @brief replaces the contents with a null pointer
+    */
+    Task& operator = (std::nullptr_t);
+
+    /**
+    @brief compares if two tasks are associated with the same graph node
+    */
+    bool operator == (const Task& rhs) const;
+
+    /**
+    @brief compares if two tasks are not associated with the same graph node
+    */
+    bool operator != (const Task& rhs) const;
+
+    /**
+    @brief queries the name of the task
+    */
+    const std::string& name() const;
+
+    /**
+    @brief queries the number of successors of the task
+    */
+    size_t num_successors() const;
+
+    /**
+    @brief queries the number of predecessors of the task
+    */
+    size_t num_dependents() const;
+
+    /**
+    @brief queries the number of strong dependents of the task
+    */
+    size_t num_strong_dependents() const;
+
+    /**
+    @brief queries the number of weak dependents of the task
+    */
+    size_t num_weak_dependents() const;
+
+    /**
+    @brief assigns a name to the task
+
+    @param name a @std_string acceptable string
+
+    @return @c *this
+    */
+    Task& name(const std::string& name);
+
+    /**
+    @brief assigns a callable
+
+    @tparam C callable type
+
+    @param callable callable to construct a task
+
+    @return @c *this
+    */
+    template <typename C>
+    Task& work(C&& callable);
+
+    /**
+    @brief creates a module task from a taskflow
+
+    @tparam T object type
+    @param object a custom object that defines @c T::graph() method
+
+    @return @c *this
+    */
+    template <typename T>
+    Task& composed_of(T& object);
+
+    /**
+    @brief adds precedence links from this to other tasks
+
+    @tparam Ts parameter pack
+
+    @param tasks one or multiple tasks
+
+    @return @c *this
+    */
+    template <typename... Ts>
+    Task& precede(Ts&&... tasks);
+
+    /**
+    @brief adds precedence links from other tasks to this
+
+    @tparam Ts parameter pack
+
+    @param tasks one or multiple tasks
+
+    @return @c *this
+    */
+    template <typename... Ts>
+    Task& succeed(Ts&&... tasks);
+
+    /**
+    @brief makes the task release this semaphore
+    */
+    Task& release(Semaphore& semaphore);
+
+    /**
+    @brief makes the task acquire this semaphore
+    */
+    Task& acquire(Semaphore& semaphore);
+
+    /**
+    @brief assigns pointer to user data
+
+    @param data pointer to user data
+
+    The following example shows how to attach user data to a task and
+    run the task iteratively while changing the data value:
+
+    @code{.cpp}
+    tf::Executor executor;
+    tf::Taskflow taskflow("attach data to a task");
+
+    int data;
+
+    // create a task and attach the data to it
+    auto A = taskflow.placeholder();
+    A.data(&data).work([A](){
+      auto d = *static_cast<int*>(A.data());
+      std::cout << "data is " << d << std::endl;
+    });
+
+    // run the taskflow iteratively with changing data
+    for(data = 0; data<10; data++){
+      executor.run(taskflow).wait();
+    }
+    @endcode
+
+    @return @c *this
+    */
+    Task& data(void* data);
+    
+    /**
+    @brief assigns a priority value to the task
+
+    A priority value can be one of the following three levels, 
+    tf::TaskPriority::HIGH (numerically equivalent to 0),
+    tf::TaskPriority::NORMAL (numerically equivalent to 1), and
+    tf::TaskPriority::LOW (numerically equivalent to 2).
+    The smaller the priority value, the higher the priority.
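+
+    A hedged example (assuming an existing tf::Taskflow object named
+    @c taskflow):
+
+    @code{.cpp}
+    auto [A, B, C] = taskflow.emplace([](){}, [](){}, [](){});
+    A.priority(tf::TaskPriority::HIGH);
+    B.priority(tf::TaskPriority::NORMAL);
+    C.priority(tf::TaskPriority::LOW);
+    @endcode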
+    */
+    Task& priority(TaskPriority p);
+    
+    /**
+    @brief queries the priority value of the task
+    */
+    TaskPriority priority() const;
+
+    /**
+    @brief resets the task handle to null
+    */
+    void reset();
+
+    /**
+    @brief resets the associated work to a placeholder
+    */
+    void reset_work();
+
+    /**
+    @brief queries if the task handle is empty (i.e., refers to no task node)
+    */
+    bool empty() const;
+
+    /**
+    @brief queries if the task has a work assigned
+    */
+    bool has_work() const;
+
+    /**
+    @brief applies a visitor callable to each successor of the task
+    */
+    template <typename V>
+    void for_each_successor(V&& visitor) const;
+
+    /**
+    @brief applies a visitor callable to each dependent of the task
+    */
+    template <typename V>
+    void for_each_dependent(V&& visitor) const;
+
+    /**
+    @brief obtains a hash value of the underlying node
+    */
+    size_t hash_value() const;
+
+    /**
+    @brief returns the task type
+    */
+    TaskType type() const;
+
+    /**
+    @brief dumps the task through an output stream
+    */
+    void dump(std::ostream& ostream) const;
+
+    /**
+    @brief queries pointer to user data
+    */
+    void* data() const;
+
+
+  private:
+
+    Task(Node*);
+
+    Node* _node {nullptr};
+};
+
+// Constructor
+inline Task::Task(Node* node) : _node {node} {
+}
+
+// Constructor
+inline Task::Task(const Task& rhs) : _node {rhs._node} {
+}
+
+// Function: precede
+template <typename... Ts>
+Task& Task::precede(Ts&&... tasks) {
+  (_node->_precede(tasks._node), ...);
+  //_precede(std::forward<Ts>(tasks)...);
+  return *this;
+}
+
+// Function: succeed
+template <typename... Ts>
+Task& Task::succeed(Ts&&... tasks) {
+  (tasks._node->_precede(_node), ...);
+  //_succeed(std::forward<Ts>(tasks)...);
+  return *this;
+}
+
+// Function: composed_of
+template <typename T>
+Task& Task::composed_of(T& object) {
+  _node->_handle.emplace<Node::Module>(object);
+  return *this;
+}
+
+// Operator =
+inline Task& Task::operator = (const Task& rhs) {
+  _node = rhs._node;
+  return *this;
+}
+
+// Operator =
+inline Task& Task::operator = (std::nullptr_t ptr) {
+  _node = ptr;
+  return *this;
+}
+
+// Operator ==
+inline bool Task::operator == (const Task& rhs) const {
+  return _node == rhs._node;
+}
+
+// Operator !=
+inline bool Task::operator != (const Task& rhs) const {
+  return _node != rhs._node;
+}
+
+// Function: name
+inline Task& Task::name(const std::string& name) {
+  _node->_name = name;
+  return *this;
+}
+
+// Function: acquire
+inline Task& Task::acquire(Semaphore& s) {
+  if(!_node->_semaphores) {
+    _node->_semaphores = std::make_unique<Node::Semaphores>();
+  }
+  _node->_semaphores->to_acquire.push_back(&s);
+  return *this;
+}
+
+// Function: release
+inline Task& Task::release(Semaphore& s) {
+  if(!_node->_semaphores) {
+    //_node->_semaphores.emplace();
+    _node->_semaphores = std::make_unique<Node::Semaphores>();
+  }
+  _node->_semaphores->to_release.push_back(&s);
+  return *this;
+}
+
+// Procedure: reset
+inline void Task::reset() {
+  _node = nullptr;
+}
+
+// Procedure: reset_work
+inline void Task::reset_work() {
+  _node->_handle.emplace<std::monostate>();
+}
+
+// Function: name
+inline const std::string& Task::name() const {
+  return _node->_name;
+}
+
+// Function: num_dependents
+inline size_t Task::num_dependents() const {
+  return _node->num_dependents();
+}
+
+// Function: num_strong_dependents
+inline size_t Task::num_strong_dependents() const {
+  return _node->num_strong_dependents();
+}
+
+// Function: num_weak_dependents
+inline size_t Task::num_weak_dependents() const {
+  return _node->num_weak_dependents();
+}
+
+// Function: num_successors
+inline size_t Task::num_successors() const {
+  return _node->num_successors();
+}
+
+// Function: empty
+inline bool Task::empty() const {
+  return _node == nullptr;
+}
+
+// Function: has_work
+inline bool Task::has_work() const {
+  return _node ? _node->_handle.index() != 0 : false;
+}
+
+// Function: type
+inline TaskType Task::type() const {
+  switch(_node->_handle.index()) {
+    case Node::PLACEHOLDER:     return TaskType::PLACEHOLDER;
+    case Node::STATIC:          return TaskType::STATIC;
+    case Node::DYNAMIC:         return TaskType::DYNAMIC;
+    case Node::CONDITION:       return TaskType::CONDITION;
+    case Node::MULTI_CONDITION: return TaskType::CONDITION;
+    case Node::MODULE:          return TaskType::MODULE;
+    case Node::ASYNC:           return TaskType::ASYNC;
+    case Node::DEPENDENT_ASYNC: return TaskType::ASYNC;
+    default:                    return TaskType::UNDEFINED;
+  }
+}
+
+// Function: for_each_successor
+template <typename V>
+void Task::for_each_successor(V&& visitor) const {
+  for(size_t i=0; i<_node->_successors.size(); ++i) {
+    visitor(Task(_node->_successors[i]));
+  }
+}
+
+// Function: for_each_dependent
+template <typename V>
+void Task::for_each_dependent(V&& visitor) const {
+  for(size_t i=0; i<_node->_dependents.size(); ++i) {
+    visitor(Task(_node->_dependents[i]));
+  }
+}
+
+// Function: hash_value
+inline size_t Task::hash_value() const {
+  return std::hash<Node*>{}(_node);
+}
+
+// Procedure: dump
+inline void Task::dump(std::ostream& os) const {
+  os << "task ";
+  if(name().empty()) os << _node;
+  else os << name();
+  os << " [type=" << to_string(type()) << ']';
+}
+
+// Function: work
+template <typename C>
+Task& Task::work(C&& c) {
+
+  if constexpr(is_static_task_v<C>) {
+    _node->_handle.emplace<Node::Static>(std::forward<C>(c));
+  }
+  else if constexpr(is_dynamic_task_v<C>) {
+    _node->_handle.emplace<Node::Dynamic>(std::forward<C>(c));
+  }
+  else if constexpr(is_condition_task_v<C>) {
+    _node->_handle.emplace<Node::Condition>(std::forward<C>(c));
+  }
+  else if constexpr(is_multi_condition_task_v<C>) {
+    _node->_handle.emplace<Node::MultiCondition>(std::forward<C>(c));
+  }
+  else {
+    static_assert(dependent_false_v<C>, "invalid task callable");
+  }
+  return *this;
+}
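+
+/*
+A short sketch of assigning work to a placeholder task; the callable type
+selects the task type at compile time (static, dynamic, condition, or
+multi-condition), mirroring the branches above.
+
+@code{.cpp}
+tf::Taskflow taskflow;
+tf::Task task = taskflow.placeholder();        // TaskType::PLACEHOLDER
+task.work([](){ std::cout << "static\n"; });   // now TaskType::STATIC
+task.work([](tf::Subflow& sf){                 // now TaskType::DYNAMIC
+  sf.emplace([](){ std::cout << "child\n"; });
+});
+@endcode
+*/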
+
+// Function: data
+inline void* Task::data() const {
+  return _node->_data;
+}
+
+// Function: data
+inline Task& Task::data(void* data) {
+  _node->_data = data;
+  return *this;
+}
+
+// Function: priority
+inline Task& Task::priority(TaskPriority p) {
+  _node->_priority = static_cast<unsigned>(p);
+  return *this;
+}
+
+// Function: priority
+inline TaskPriority Task::priority() const {
+  return static_cast<TaskPriority>(_node->_priority);
+}
+
+// ----------------------------------------------------------------------------
+// global ostream
+// ----------------------------------------------------------------------------
+
+/**
+@brief overload of ostream inserter operator for Task
+*/
+inline std::ostream& operator << (std::ostream& os, const Task& task) {
+  task.dump(os);
+  return os;
+}
+
+// ----------------------------------------------------------------------------
+// Task View
+// ----------------------------------------------------------------------------
+
+/**
+@class TaskView
+
+@brief class to access task information from the observer interface
+*/
+class TaskView {
+
+  friend class Executor;
+
+  public:
+
+    /**
+    @brief queries the name of the task
+    */
+    const std::string& name() const;
+
+    /**
+    @brief queries the number of successors of the task
+    */
+    size_t num_successors() const;
+
+    /**
+    @brief queries the number of predecessors of the task
+    */
+    size_t num_dependents() const;
+
+    /**
+    @brief queries the number of strong dependents of the task
+    */
+    size_t num_strong_dependents() const;
+
+    /**
+    @brief queries the number of weak dependents of the task
+    */
+    size_t num_weak_dependents() const;
+
+    /**
+    @brief applies a visitor callable to each successor of the task
+    */
+    template <typename V>
+    void for_each_successor(V&& visitor) const;
+
+    /**
+    @brief applies a visitor callable to each dependent of the task
+    */
+    template <typename V>
+    void for_each_dependent(V&& visitor) const;
+
+    /**
+    @brief queries the task type
+    */
+    TaskType type() const;
+
+    /**
+    @brief obtains a hash value of the underlying node
+    */
+    size_t hash_value() const;
+
+  private:
+
+    TaskView(const Node&);
+    TaskView(const TaskView&) = default;
+
+    const Node& _node;
+};
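+
+/*
+A sketch of where TaskView objects come from, assuming the
+tf::ObserverInterface and tf::Executor::make_observer APIs from the
+accompanying observer and executor headers: the executor hands a TaskView to
+the observer callbacks around each task execution.
+
+@code{.cpp}
+struct MyObserver : public tf::ObserverInterface {
+  void set_up(size_t) override {}
+  void on_entry(tf::WorkerView, tf::TaskView tv) override {
+    std::cout << "begin " << tv.name() << '\n';
+  }
+  void on_exit(tf::WorkerView, tf::TaskView tv) override {
+    std::cout << "end   " << tv.name() << '\n';
+  }
+};
+
+tf::Executor executor;
+auto observer = executor.make_observer<MyObserver>();
+@endcode
+*/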
+
+// Constructor
+inline TaskView::TaskView(const Node& node) : _node {node} {
+}
+
+// Function: name
+inline const std::string& TaskView::name() const {
+  return _node._name;
+}
+
+// Function: num_dependents
+inline size_t TaskView::num_dependents() const {
+  return _node.num_dependents();
+}
+
+// Function: num_strong_dependents
+inline size_t TaskView::num_strong_dependents() const {
+  return _node.num_strong_dependents();
+}
+
+// Function: num_weak_dependents
+inline size_t TaskView::num_weak_dependents() const {
+  return _node.num_weak_dependents();
+}
+
+// Function: num_successors
+inline size_t TaskView::num_successors() const {
+  return _node.num_successors();
+}
+
+// Function: type
+inline TaskType TaskView::type() const {
+  switch(_node._handle.index()) {
+    case Node::PLACEHOLDER:     return TaskType::PLACEHOLDER;
+    case Node::STATIC:          return TaskType::STATIC;
+    case Node::DYNAMIC:         return TaskType::DYNAMIC;
+    case Node::CONDITION:       return TaskType::CONDITION;
+    case Node::MULTI_CONDITION: return TaskType::CONDITION;
+    case Node::MODULE:          return TaskType::MODULE;
+    case Node::ASYNC:           return TaskType::ASYNC;
+    case Node::DEPENDENT_ASYNC: return TaskType::ASYNC;
+    default:                    return TaskType::UNDEFINED;
+  }
+}
+
+// Function: hash_value
+inline size_t TaskView::hash_value() const {
+  return std::hash<const Node*>{}(&_node);
+}
+
+// Function: for_each_successor
+template <typename V>
+void TaskView::for_each_successor(V&& visitor) const {
+  for(size_t i=0; i<_node._successors.size(); ++i) {
+    visitor(TaskView(*_node._successors[i]));
+  }
+}
+
+// Function: for_each_dependent
+template <typename V>
+void TaskView::for_each_dependent(V&& visitor) const {
+  for(size_t i=0; i<_node._dependents.size(); ++i) {
+    visitor(TaskView(*_node._dependents[i]));
+  }
+}
+
+}  // end of namespace tf. ---------------------------------------------------
+
+namespace std {
+
+/**
+@struct hash
+
+@brief hash specialization for std::hash<tf::Task>
+*/
+template <>
+struct hash<tf::Task> {
+  auto operator() (const tf::Task& task) const noexcept {
+    return task.hash_value();
+  }
+};
+
+/**
+@struct hash
+
+@brief hash specialization for std::hash<tf::TaskView>
+*/
+template <>
+struct hash<tf::TaskView> {
+  auto operator() (const tf::TaskView& task_view) const noexcept {
+    return task_view.hash_value();
+  }
+};
+
+}  // end of namespace std ----------------------------------------------------
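+
+/*
+A small sketch of what these specializations enable: tasks and task views can
+be used directly as keys in unordered containers (requires <unordered_set> or
+<unordered_map>).
+
+@code{.cpp}
+std::unordered_set<tf::Task> visited;
+tf::Taskflow taskflow;
+visited.insert(taskflow.emplace([](){}));   // hashed via std::hash<tf::Task>
+@endcode
+*/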
+
+
+
diff --git a/myxpcs/include/taskflow_/core/taskflow.hpp b/myxpcs/include/taskflow_/core/taskflow.hpp
new file mode 100644
index 0000000..b34381d
--- /dev/null
+++ b/myxpcs/include/taskflow_/core/taskflow.hpp
@@ -0,0 +1,643 @@
+#pragma once
+
+#include "flow_builder.hpp"
+
+/**
+@file taskflow/core/taskflow.hpp
+@brief taskflow include file
+*/
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+
+/**
+@class Taskflow
+
+@brief class to create a taskflow object
+
+A %taskflow manages a task dependency graph where each task represents a
+callable object (e.g., @std_lambda, @std_function) and an edge represents a
+dependency between two tasks. A task is one of the following types:
+
+  1. static task         : the callable constructible from
+                           @c std::function<void()>
+  2. dynamic task        : the callable constructible from
+                           @c std::function<void(tf::Subflow&)>
+  3. condition task      : the callable constructible from
+                           @c std::function<int()>
+  4. multi-condition task: the callable constructible from
+                           @c %std::function<tf::SmallVector<int>()>
+  5. module task         : the task constructed from tf::Taskflow::composed_of
+                           to run another taskflow as a module
+
+Each task is a basic computation unit and is run by one worker thread
+from an executor.
+The following example creates a simple taskflow graph of four static tasks,
+@c A, @c B, @c C, and @c D, where
+@c A runs before @c B and @c C and
+@c D runs after  @c B and @c C.
+
+@code{.cpp}
+tf::Executor executor;
+tf::Taskflow taskflow("simple");
+
+tf::Task A = taskflow.emplace([](){ std::cout << "TaskA\n"; });
+tf::Task B = taskflow.emplace([](){ std::cout << "TaskB\n"; });
+tf::Task C = taskflow.emplace([](){ std::cout << "TaskC\n"; });
+tf::Task D = taskflow.emplace([](){ std::cout << "TaskD\n"; });
+
+A.precede(B, C);  // A runs before B and C
+D.succeed(B, C);  // D runs after  B and C
+
+executor.run(taskflow).wait();
+@endcode
+
+The taskflow object itself is NOT thread-safe. You should not
+modify the graph while it is running, for example by adding
+new tasks, adding new dependencies, or moving the taskflow
+to another object.
+To minimize the overhead of task creation,
+our runtime leverages a global object pool to recycle
+tasks in a thread-safe manner.
+
+Please refer to @ref Cookbook to learn more about each task type
+and how to submit a taskflow to an executor.
+*/
+class Taskflow : public FlowBuilder {
+
+  friend class Topology;
+  friend class Executor;
+  friend class FlowBuilder;
+
+  struct Dumper {
+    size_t id;
+    std::stack<std::pair<const Node*, const Graph*>> stack;
+    std::unordered_map<const Graph*, size_t> visited;
+  };
+
+  public:
+
+    /**
+    @brief constructs a taskflow with the given name
+
+    @code{.cpp}
+    tf::Taskflow taskflow("My Taskflow");
+    std::cout << taskflow.name();         // "My Taskflow"
+    @endcode
+    */
+    Taskflow(const std::string& name);
+
+    /**
+    @brief constructs a taskflow
+    */
+    Taskflow();
+
+    /**
+    @brief constructs a taskflow from a moved taskflow
+
+    Constructing a taskflow @c taskflow1 from a moved taskflow @c taskflow2 will
+    migrate the graph of @c taskflow2 to @c taskflow1.
+    After the move, @c taskflow2 will become empty.
+
+    @code{.cpp}
+    tf::Taskflow taskflow1(std::move(taskflow2));
+    assert(taskflow2.empty());
+    @endcode
+
+    Notice that @c taskflow2 should not be running in an executor
+    during the move operation, or the behavior is undefined.
+    */
+    Taskflow(Taskflow&& rhs);
+
+    /**
+    @brief move assignment operator
+
+    Moving a taskflow @c taskflow2 to another taskflow @c taskflow1 will destroy
+    the existing graph of @c taskflow1 and assign it the graph of @c taskflow2.
+    After the move, @c taskflow2 will become empty.
+
+    @code{.cpp}
+    taskflow1 = std::move(taskflow2);
+    assert(taskflow2.empty());
+    @endcode
+
+    Notice that both @c taskflow1 and @c taskflow2 should not be running
+    in an executor during the move operation, or the behavior is undefined.
+    */
+    Taskflow& operator = (Taskflow&& rhs);
+
+    /**
+    @brief default destructor
+
+    When the destructor is called, all tasks and their associated data
+    (e.g., captured data) will be destroyed.
+    It is your responsibility to ensure that all submitted executions of this
+    taskflow have completed before destroying it.
+    For instance, the following code results in undefined behavior
+    since the executor may still be running the taskflow when it is
+    destroyed at the end of the block.
+
+    @code{.cpp}
+    {
+      tf::Taskflow taskflow;
+      executor.run(taskflow);
+    }
+    @endcode
+
+    To fix the problem, we must wait for the execution to complete
+    before destroying the taskflow.
+
+    @code{.cpp}
+    {
+      tf::Taskflow taskflow;
+      executor.run(taskflow).wait();
+    }
+    @endcode
+    */
+    ~Taskflow() = default;
+
+    /**
+    @brief dumps the taskflow to a DOT format through a std::ostream target
+
+    @code{.cpp}
+    taskflow.dump(std::cout);  // dump the graph to the standard output
+
+    std::ofstream ofs("output.dot");
+    taskflow.dump(ofs);        // dump the graph to the file output.dot
+    @endcode
+
+    For dynamically spawned tasks, such as module tasks, subflow tasks,
+    and GPU tasks, you need to run the taskflow first before you can
+    dump the entire graph.
+
+    @code{.cpp}
+    tf::Task parent = taskflow.emplace([](tf::Subflow& sf){
+      sf.emplace([](){ std::cout << "child\n"; });
+    });
+    taskflow.dump(std::cout);      // this dumps only the parent tasks
+    executor.run(taskflow).wait();
+    taskflow.dump(std::cout);      // this dumps both parent and child tasks
+    @endcode
+    */
+    void dump(std::ostream& ostream) const;
+
+    /**
+    @brief dumps the taskflow to a std::string of DOT format
+
+    This method is similar to tf::Taskflow::dump(std::ostream& ostream),
+    but returns the graph in DOT format as a std::string.
+    */
+    std::string dump() const;
+
+    /**
+    @brief queries the number of tasks
+    */
+    size_t num_tasks() const;
+
+    /**
+    @brief queries the emptiness of the taskflow
+
+    An empty taskflow has no tasks, i.e., the return value of
+    tf::Taskflow::num_tasks is zero.
+    */
+    bool empty() const;
+
+    /**
+    @brief assigns a name to the taskflow
+
+    @code{.cpp}
+    taskflow.name("assign another name");
+    @endcode
+    */
+    void name(const std::string&);
+
+    /**
+    @brief queries the name of the taskflow
+
+    @code{.cpp}
+    std::cout << "my name is: " << taskflow.name();
+    @endcode
+    */
+    const std::string& name() const;
+
+    /**
+    @brief clears the associated task dependency graph
+
+    When you clear a taskflow, all tasks and their associated data
+    (e.g., captured data in task callables) will be destroyed.
+    The behavior of clearing a running taskflow is undefined.
+    */
+    void clear();
+
+    /**
+    @brief applies a visitor to each task in the taskflow
+
+    A visitor is a callable that takes an argument of type tf::Task
+    and returns nothing. The following example iterates each task in a
+    taskflow and prints its name:
+
+    @code{.cpp}
+    taskflow.for_each_task([](tf::Task task){
+      std::cout << task.name() << '\n';
+    });
+    @endcode
+    */
+    template <typename V>
+    void for_each_task(V&& visitor) const;
+
+    /**
+    @brief removes dependencies that go from task @c from to task @c to
+
+    @param from from task (dependent)
+    @param to to task (successor)
+  
+    @code{.cpp}
+    tf::Taskflow taskflow;
+    auto a = taskflow.placeholder().name("a");
+    auto b = taskflow.placeholder().name("b");
+    auto c = taskflow.placeholder().name("c");
+    auto d = taskflow.placeholder().name("d");
+
+    a.precede(b, c, d);
+    assert(a.num_successors() == 3);
+    assert(b.num_dependents() == 1);
+    assert(c.num_dependents() == 1);
+    assert(d.num_dependents() == 1);
+  
+    taskflow.remove_dependency(a, b);
+    assert(a.num_successors() == 2);
+    assert(b.num_dependents() == 0);
+    @endcode
+    */
+    inline void remove_dependency(Task from, Task to);
+
+    /**
+    @brief returns a reference to the underlying graph object
+
+    A graph object (of type tf::Graph) is the ultimate storage for the
+    task dependency graph and should only be used as an opaque
+    data structure to interact with the executor (e.g., composition).
+    */
+    Graph& graph();
+
+  private:
+
+    mutable std::mutex _mutex;
+
+    std::string _name;
+
+    Graph _graph;
+
+    std::queue<std::shared_ptr<Topology>> _topologies;
+    std::optional<std::list<Taskflow>::iterator> _satellite;
+
+    void _dump(std::ostream&, const Graph*) const;
+    void _dump(std::ostream&, const Node*, Dumper&) const;
+    void _dump(std::ostream&, const Graph*, Dumper&) const;
+};
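+
+/*
+A brief sketch of composition, assuming tf::FlowBuilder::composed_of from
+flow_builder.hpp included above: the graph of one taskflow can be run as a
+module task inside another, which is what the opaque tf::Graph handle is for.
+
+@code{.cpp}
+tf::Executor executor;
+tf::Taskflow f1, f2;
+
+tf::Task a = f1.emplace([](){ std::cout << "f1:a\n"; });
+
+tf::Task module = f2.composed_of(f1).name("module of f1");
+tf::Task b = f2.emplace([](){ std::cout << "f2:b\n"; });
+module.precede(b);
+
+executor.run(f2).wait();
+@endcode
+*/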
+
+// Constructor
+inline Taskflow::Taskflow(const std::string& name) :
+  FlowBuilder {_graph},
+  _name       {name} {
+}
+
+// Constructor
+inline Taskflow::Taskflow() : FlowBuilder{_graph} {
+}
+
+// Move constructor
+inline Taskflow::Taskflow(Taskflow&& rhs) : FlowBuilder{_graph} {
+
+  std::scoped_lock<std::mutex> lock(rhs._mutex);
+
+  _name = std::move(rhs._name);
+  _graph = std::move(rhs._graph);
+  _topologies = std::move(rhs._topologies);
+  _satellite = rhs._satellite;
+
+  rhs._satellite.reset();
+}
+
+// Move assignment
+inline Taskflow& Taskflow::operator = (Taskflow&& rhs) {
+  if(this != &rhs) {
+    std::scoped_lock<std::mutex, std::mutex> lock(_mutex, rhs._mutex);
+    _name = std::move(rhs._name);
+    _graph = std::move(rhs._graph);
+    _topologies = std::move(rhs._topologies);
+    _satellite = rhs._satellite;
+    rhs._satellite.reset();
+  }
+  return *this;
+}
+
+// Procedure:
+inline void Taskflow::clear() {
+  _graph._clear();
+}
+
+// Function: num_tasks
+inline size_t Taskflow::num_tasks() const {
+  return _graph.size();
+}
+
+// Function: empty
+inline bool Taskflow::empty() const {
+  return _graph.empty();
+}
+
+// Function: name
+inline void Taskflow::name(const std::string &name) {
+  _name = name;
+}
+
+// Function: name
+inline const std::string& Taskflow::name() const {
+  return _name;
+}
+
+// Function: graph
+inline Graph& Taskflow::graph() {
+  return _graph;
+}
+
+// Function: for_each_task
+template <typename V>
+void Taskflow::for_each_task(V&& visitor) const {
+  for(size_t i=0; i<_graph._nodes.size(); ++i) {
+    visitor(Task(_graph._nodes[i]));
+  }
+}
+
+// Procedure: remove_dependency
+inline void Taskflow::remove_dependency(Task from, Task to) {
+  from._node->_successors.erase(std::remove_if(
+    from._node->_successors.begin(), from._node->_successors.end(), [&](Node* i){
+      return i == to._node;
+    }
+  ), from._node->_successors.end());
+  
+  to._node->_dependents.erase(std::remove_if(
+    to._node->_dependents.begin(), to._node->_dependents.end(), [&](Node* i){
+      return i == from._node;
+    }
+  ), to._node->_dependents.end());
+}
+
+// Procedure: dump
+inline std::string Taskflow::dump() const {
+  std::ostringstream oss;
+  dump(oss);
+  return oss.str();
+}
+
+// Function: dump
+inline void Taskflow::dump(std::ostream& os) const {
+  os << "digraph Taskflow {\n";
+  _dump(os, &_graph);
+  os << "}\n";
+}
+
+// Procedure: _dump
+inline void Taskflow::_dump(std::ostream& os, const Graph* top) const {
+
+  Dumper dumper;
+
+  dumper.id = 0;
+  dumper.stack.push({nullptr, top});
+  dumper.visited[top] = dumper.id++;
+
+  while(!dumper.stack.empty()) {
+
+    auto [p, f] = dumper.stack.top();
+    dumper.stack.pop();
+
+    os << "subgraph cluster_p" << f << " {\nlabel=\"";
+
+    // n-level module
+    if(p) {
+      os << 'm' << dumper.visited[f];
+    }
+    // top-level taskflow graph
+    else {
+      os << "Taskflow: ";
+      if(_name.empty()) os << 'p' << this;
+      else os << _name;
+    }
+
+    os << "\";\n";
+
+    _dump(os, f, dumper);
+    os << "}\n";
+  }
+}
+
+// Procedure: _dump
+inline void Taskflow::_dump(
+  std::ostream& os, const Node* node, Dumper& dumper
+) const {
+
+  os << 'p' << node << "[label=\"";
+  if(node->_name.empty()) os << 'p' << node;
+  else os << node->_name;
+  os << "\" ";
+
+  // shape for node
+  switch(node->_handle.index()) {
+
+    case Node::CONDITION:
+    case Node::MULTI_CONDITION:
+      os << "shape=diamond color=black fillcolor=aquamarine style=filled";
+    break;
+
+    default:
+    break;
+  }
+
+  os << "];\n";
+
+  for(size_t s=0; s<node->_successors.size(); ++s) {
+    if(node->_is_conditioner()) {
+      // case edge is dashed
+      os << 'p' << node << " -> p" << node->_successors[s]
+         << " [style=dashed label=\"" << s << "\"];\n";
+    } else {
+      os << 'p' << node << " -> p" << node->_successors[s] << ";\n";
+    }
+  }
+
+  // subflow join node
+  if(node->_parent && node->_parent->_handle.index() == Node::DYNAMIC &&
+     node->_successors.size() == 0
+    ) {
+    os << 'p' << node << " -> p" << node->_parent << ";\n";
+  }
+
+  // node info
+  switch(node->_handle.index()) {
+
+    case Node::DYNAMIC: {
+      auto& sbg = std::get_if<Node::Dynamic>(&node->_handle)->subgraph;
+      if(!sbg.empty()) {
+        os << "subgraph cluster_p" << node << " {\nlabel=\"Subflow: ";
+        if(node->_name.empty()) os << 'p' << node;
+        else os << node->_name;
+
+        os << "\";\n" << "color=blue\n";
+        _dump(os, &sbg, dumper);
+        os << "}\n";
+      }
+    }
+    break;
+
+    default:
+    break;
+  }
+}
+
+// Procedure: _dump
+inline void Taskflow::_dump(
+  std::ostream& os, const Graph* graph, Dumper& dumper
+) const {
+
+  for(const auto& n : graph->_nodes) {
+
+    // regular task
+    if(n->_handle.index() != Node::MODULE) {
+      _dump(os, n, dumper);
+    }
+    // module task
+    else {
+      //auto module = &(std::get_if<Node::Module>(&n->_handle)->module);
+      auto module = &(std::get_if<Node::Module>(&n->_handle)->graph);
+
+      os << 'p' << n << "[shape=box3d, color=blue, label=\"";
+      if(n->_name.empty()) os << 'p' << n;
+      else os << n->_name;
+
+      if(dumper.visited.find(module) == dumper.visited.end()) {
+        dumper.visited[module] = dumper.id++;
+        dumper.stack.push({n, module});
+      }
+
+      os << " [m" << dumper.visited[module] << "]\"];\n";
+
+      for(const auto s : n->_successors) {
+        os << 'p' << n << "->" << 'p' << s << ";\n";
+      }
+    }
+  }
+}
+
+// ----------------------------------------------------------------------------
+// class definition: Future
+// ----------------------------------------------------------------------------
+
+/**
+@class Future
+
+@brief class to access the result of an execution
+
+tf::Future is a class derived from std::future that will eventually hold the
+execution result of a submitted taskflow (tf::Executor::run).
+In addition to the base methods inherited from std::future,
+you can call tf::Future::cancel to cancel the execution of the running taskflow
+associated with this future object.
+The following example cancels a submission of a taskflow that contains
+1000 tasks each running one second.
+
+@code{.cpp}
+tf::Executor executor;
+tf::Taskflow taskflow;
+
+for(int i=0; i<1000; i++) {
+  taskflow.emplace([](){
+    std::this_thread::sleep_for(std::chrono::seconds(1));
+  });
+}
+
+// submit the taskflow
+tf::Future fu = executor.run(taskflow);
+
+// request to cancel the submitted execution above
+fu.cancel();
+
+// wait until the cancellation finishes
+fu.get();
+@endcode
+*/
+template <typename T>
+class Future : public std::future<T>  {
+
+  friend class Executor;
+  friend class Subflow;
+  friend class Runtime;
+
+  public:
+
+    /**
+    @brief default constructor
+    */
+    Future() = default;
+
+    /**
+    @brief disabled copy constructor
+    */
+    Future(const Future&) = delete;
+
+    /**
+    @brief default move constructor
+    */
+    Future(Future&&) = default;
+
+    /**
+    @brief disabled copy assignment
+    */
+    Future& operator = (const Future&) = delete;
+
+    /**
+    @brief default move assignment
+    */
+    Future& operator = (Future&&) = default;
+
+    /**
+    @brief cancels the execution of the running taskflow associated with
+           this future object
+
+    @return @c true if the execution can be cancelled or
+            @c false if the execution has already completed
+
+    When you request a cancellation, the executor will stop scheduling
+    any further tasks. Tasks that are already running will continue to finish
+    (non-preemptive).
+    You can call tf::Future::wait to wait for the cancellation to complete.
+    */
+    bool cancel();
+
+  private:
+    
+    std::weak_ptr<Topology> _topology;
+
+    Future(std::future<T>&&, std::weak_ptr<Topology> = std::weak_ptr<Topology>());
+};
+
+template <typename T>
+Future<T>::Future(std::future<T>&& f, std::weak_ptr<Topology> p) :
+  std::future<T> {std::move(f)},
+  _topology      {std::move(p)} {
+}
+
+// Function: cancel
+template <typename T>
+bool Future<T>::cancel() {
+  if(auto ptr = _topology.lock(); ptr) {
+    ptr->_state.fetch_or(Topology::CANCELLED, std::memory_order_relaxed);
+    return true;
+  }
+  return false;
+}
+
+
+}  // end of namespace tf. ---------------------------------------------------
diff --git a/myxpcs/include/taskflow_/core/topology.hpp b/myxpcs/include/taskflow_/core/topology.hpp
new file mode 100644
index 0000000..068499d
--- /dev/null
+++ b/myxpcs/include/taskflow_/core/topology.hpp
@@ -0,0 +1,62 @@
+#pragma once
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+
+class TopologyBase {
+
+};
+
+// class: Topology
+class Topology {
+
+  friend class Executor;
+  friend class Runtime;
+  friend class Node;
+
+  template <typename T>
+  friend class Future;
+  
+  constexpr static int CLEAN = 0;
+  constexpr static int CANCELLED = 1;
+  constexpr static int EXCEPTION = 2;
+
+  public:
+
+    template <typename P, typename C>
+    Topology(Taskflow&, P&&, C&&);
+
+  private:
+
+    Taskflow& _taskflow;
+
+    std::promise<void> _promise;
+
+    SmallVector<Node*> _sources;
+
+    std::function<bool()> _pred;
+    std::function<void()> _call;
+
+    std::atomic<size_t> _join_counter {0};
+    std::atomic<int> _state {CLEAN};
+
+    std::exception_ptr _exception {nullptr};
+
+    void _carry_out_promise();
+};
+
+// Constructor
+template <typename P, typename C>
+Topology::Topology(Taskflow& tf, P&& p, C&& c):
+  _taskflow(tf),
+  _pred {std::forward<P>(p)},
+  _call {std::forward<C>(c)} {
+}
+
+// Procedure
+inline void Topology::_carry_out_promise() {
+  _exception ? _promise.set_exception(_exception) : _promise.set_value();
+}
+
+}  // end of namespace tf. ----------------------------------------------------
diff --git a/myxpcs/include/taskflow_/core/tsq.hpp b/myxpcs/include/taskflow_/core/tsq.hpp
new file mode 100644
index 0000000..e4ea76c
--- /dev/null
+++ b/myxpcs/include/taskflow_/core/tsq.hpp
@@ -0,0 +1,441 @@
+#pragma once
+
+#include "../utility/macros.hpp"
+#include "../utility/traits.hpp"
+
+/**
+@file tsq.hpp
+@brief task queue include file
+*/
+
+namespace tf {
+
+
+// ----------------------------------------------------------------------------
+// Task Types
+// ----------------------------------------------------------------------------
+
+/**
+@enum TaskPriority
+
+@brief enumeration of all task priority values
+
+A priority is an enumerated value of type @c unsigned.
+Currently, %Taskflow defines three priority levels, 
+@c HIGH, @c NORMAL, and @c LOW, with values 0, 1, and 2, respectively.
+That is, the lower the value, the higher the priority.
+
+*/
+enum class TaskPriority : unsigned {
+  /** @brief value of the highest priority (i.e., 0)  */
+  HIGH = 0,
+  /** @brief value of the normal priority (i.e., 1)  */
+  NORMAL = 1,
+  /** @brief value of the lowest priority (i.e., 2) */
+  LOW = 2,
+  /** @brief conventional value for iterating priority values */
+  MAX = 3
+};
+
+
+
+// ----------------------------------------------------------------------------
+// Task Queue
+// ----------------------------------------------------------------------------
+
+
+/**
+@class TaskQueue
+
+@tparam T data type (must be a pointer type)
+@tparam TF_MAX_PRIORITY maximum level of the priority 
+
+@brief class to create a lock-free unbounded single-producer multiple-consumer queue
+
+This class implements the work-stealing queue described in the paper,
+<a href="https://www.di.ens.fr/~zappa/readings/ppopp13.pdf">Correct and Efficient Work-Stealing for Weak Memory Models</a>,
+and extends it to include priority.
+
+Only the queue owner can perform pop and push operations,
+while others can steal data from the queue simultaneously.
+Priority values range from zero (highest priority) to the template value
+`TF_MAX_PRIORITY-1` (lowest priority).
+All operations are associated with priority values to indicate
+the corresponding queues to which an operation is applied.
+
+The default template value of `TF_MAX_PRIORITY` is `TaskPriority::MAX`,
+which gives the task queue three priority levels.
+
+@code{.cpp}
+auto [A, B, C, D, E] = taskflow.emplace(
+  [] () { },
+  [&] () { 
+    std::cout << "Task B: " << counter++ << '\n';  // 0
+  },
+  [&] () { 
+    std::cout << "Task C: " << counter++ << '\n';  // 2
+  },
+  [&] () { 
+    std::cout << "Task D: " << counter++ << '\n';  // 1
+  },
+  [] () { }
+);
+
+A.precede(B, C, D); 
+E.succeed(B, C, D);
+  
+B.priority(tf::TaskPriority::HIGH);
+C.priority(tf::TaskPriority::LOW);
+D.priority(tf::TaskPriority::NORMAL);
+  
+executor.run(taskflow).wait();
+@endcode
+
+In the above example, we have a task graph of five tasks,
+@c A, @c B, @c C, @c D, and @c E, in which @c B, @c C, and @c D
+can run simultaneously when @c A finishes.
+Since we use only one worker thread in the executor,
+the tasks run deterministically in the order of their priority values:
+@c B first, then @c D, and then @c C.
+The output is as follows:
+
+@code{.shell-session}
+Task B: 0
+Task D: 1
+Task C: 2
+@endcode
+
+*/
+template <typename T, unsigned TF_MAX_PRIORITY = static_cast<unsigned>(TaskPriority::MAX)>
+class TaskQueue {
+  
+  static_assert(TF_MAX_PRIORITY > 0, "TF_MAX_PRIORITY must be at least one");
+  static_assert(std::is_pointer_v<T>, "T must be a pointer type");
+
+  struct Array {
+
+    int64_t C;
+    int64_t M;
+    std::atomic<T>* S;
+
+    explicit Array(int64_t c) :
+      C {c},
+      M {c-1},
+      S {new std::atomic<T>[static_cast<size_t>(C)]} {
+    }
+
+    ~Array() {
+      delete [] S;
+    }
+
+    int64_t capacity() const noexcept {
+      return C;
+    }
+
+    void push(int64_t i, T o) noexcept {
+      S[i & M].store(o, std::memory_order_relaxed);
+    }
+
+    T pop(int64_t i) noexcept {
+      return S[i & M].load(std::memory_order_relaxed);
+    }
+
+    Array* resize(int64_t b, int64_t t) {
+      Array* ptr = new Array {2*C};
+      for(int64_t i=t; i!=b; ++i) {
+        ptr->push(i, pop(i));
+      }
+      return ptr;
+    }
+
+  };
+
+  // Doubling the cache-line alignment seems to give the best performance.
+  CachelineAligned<std::atomic<int64_t>> _top[TF_MAX_PRIORITY];
+  CachelineAligned<std::atomic<int64_t>> _bottom[TF_MAX_PRIORITY];
+  std::atomic<Array*> _array[TF_MAX_PRIORITY];
+  std::vector<Array*> _garbage[TF_MAX_PRIORITY];
+
+  //std::atomic<T> _cache {nullptr};
+
+  public:
+
+    /**
+    @brief constructs the queue with a given capacity
+
+    @param capacity the capacity of the queue (must be a power of 2)
+    */
+    explicit TaskQueue(int64_t capacity = 512);
+
+    /**
+    @brief destructs the queue
+    */
+    ~TaskQueue();
+
+    /**
+    @brief queries if the queue is empty at the time of this call
+    */
+    bool empty() const noexcept;
+
+    /**
+    @brief queries if the queue is empty at a specific priority value
+    */
+    bool empty(unsigned priority) const noexcept;
+
+    /**
+    @brief queries the number of items at the time of this call
+    */
+    size_t size() const noexcept;
+
+    /**
+    @brief queries the number of items with the given priority
+           at the time of this call
+    */
+    size_t size(unsigned priority) const noexcept;
+
+    /**
+    @brief queries the capacity of the queue
+    */
+    int64_t capacity() const noexcept;
+    
+    /**
+    @brief queries the capacity of the queue at a specific priority value
+    */
+    int64_t capacity(unsigned priority) const noexcept;
+
+    /**
+    @brief inserts an item to the queue
+
+    @param item the item to push to the queue
+    @param priority priority value of the item to push (default = 0)
+    
+    Only the owner thread can insert an item to the queue.
+    The operation can trigger the queue to resize its capacity
+    if more space is required.
+    */
+    TF_FORCE_INLINE void push(T item, unsigned priority);
+
+    /**
+    @brief pops out an item from the queue
+
+    Only the owner thread can pop out an item from the queue.
+    The return can be a @c nullptr if this operation failed (empty queue).
+    */
+    T pop();
+
+    /**
+    @brief pops out an item with a specific priority value from the queue
+
+    @param priority priority of the item to pop
+
+    Only the owner thread can pop out an item from the queue.
+    The return can be a @c nullptr if this operation failed (empty queue).
+    */
+    TF_FORCE_INLINE T pop(unsigned priority);
+
+    /**
+    @brief steals an item from the queue
+
+    Any thread can try to steal an item from the queue.
+    The return can be a @c nullptr if this operation failed (the queue is not necessarily empty).
+    */
+    T steal();
+
+    /**
+    @brief steals an item with a specific priority value from the queue
+
+    @param priority priority of the item to steal
+
+    Any thread can try to steal an item from the queue.
+    The return can be a @c nullptr if this operation failed (the queue is not necessarily empty).
+    */
+    T steal(unsigned priority);
+
+  private:
+    TF_NO_INLINE Array* resize_array(Array* a, unsigned p, std::int64_t b, std::int64_t t);
+};
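+
+/*
+A minimal, single-threaded sketch of the owner/thief API (the element type
+must be a pointer; priorities index the per-level deques declared above):
+
+@code{.cpp}
+tf::TaskQueue<int*> queue;
+int x = 1, y = 2;
+
+// owner thread only
+queue.push(&x, static_cast<unsigned>(tf::TaskPriority::HIGH));
+queue.push(&y, static_cast<unsigned>(tf::TaskPriority::LOW));
+int* popped = queue.pop();     // &x: the highest-priority level is drained first
+
+// any thread
+int* stolen = queue.steal();   // &y, or nullptr if it loses a race
+@endcode
+*/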
+
+// Constructor
+template <typename T, unsigned TF_MAX_PRIORITY>
+TaskQueue<T, TF_MAX_PRIORITY>::TaskQueue(int64_t c) {
+  assert(c && (!(c & (c-1))));
+  unroll<0, TF_MAX_PRIORITY, 1>([&](auto p){
+    _top[p].data.store(0, std::memory_order_relaxed);
+    _bottom[p].data.store(0, std::memory_order_relaxed);
+    _array[p].store(new Array{c}, std::memory_order_relaxed);
+    _garbage[p].reserve(32);
+  });
+}
+
+// Destructor
+template <typename T, unsigned TF_MAX_PRIORITY>
+TaskQueue<T, TF_MAX_PRIORITY>::~TaskQueue() {
+  unroll<0, TF_MAX_PRIORITY, 1>([&](auto p){
+    for(auto a : _garbage[p]) {
+      delete a;
+    }
+    delete _array[p].load();
+  });
+}
+
+// Function: empty
+template <typename T, unsigned TF_MAX_PRIORITY>
+bool TaskQueue<T, TF_MAX_PRIORITY>::empty() const noexcept {
+  for(unsigned i=0; i<TF_MAX_PRIORITY; i++) {
+    if(!empty(i)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Function: empty
+template <typename T, unsigned TF_MAX_PRIORITY>
+bool TaskQueue<T, TF_MAX_PRIORITY>::empty(unsigned p) const noexcept {
+  int64_t b = _bottom[p].data.load(std::memory_order_relaxed);
+  int64_t t = _top[p].data.load(std::memory_order_relaxed);
+  return (b <= t);
+}
+
+// Function: size
+template <typename T, unsigned TF_MAX_PRIORITY>
+size_t TaskQueue<T, TF_MAX_PRIORITY>::size() const noexcept {
+  size_t s;
+  unroll<0, TF_MAX_PRIORITY, 1>([&](auto i) { s = i ? size(i) + s : size(i); });
+  return s;
+}
+
+// Function: size
+template <typename T, unsigned TF_MAX_PRIORITY>
+size_t TaskQueue<T, TF_MAX_PRIORITY>::size(unsigned p) const noexcept {
+  int64_t b = _bottom[p].data.load(std::memory_order_relaxed);
+  int64_t t = _top[p].data.load(std::memory_order_relaxed);
+  return static_cast<size_t>(b >= t ? b - t : 0);
+}
+
+// Function: push
+template <typename T, unsigned TF_MAX_PRIORITY>
+TF_FORCE_INLINE void TaskQueue<T, TF_MAX_PRIORITY>::push(T o, unsigned p) {
+
+  int64_t b = _bottom[p].data.load(std::memory_order_relaxed);
+  int64_t t = _top[p].data.load(std::memory_order_acquire);
+  Array* a = _array[p].load(std::memory_order_relaxed);
+
+  // queue is full
+  if(a->capacity() - 1 < (b - t)) {
+    a = resize_array(a, p, b, t);
+  }
+
+  a->push(b, o);
+  std::atomic_thread_fence(std::memory_order_release);
+  _bottom[p].data.store(b + 1, std::memory_order_relaxed);
+}
+
+// Function: pop
+template <typename T, unsigned TF_MAX_PRIORITY>
+T TaskQueue<T, TF_MAX_PRIORITY>::pop() {
+  for(unsigned i=0; i<TF_MAX_PRIORITY; i++) {
+    if(auto t = pop(i); t) {
+      return t;
+    }
+  }
+  return nullptr;
+}
+
+// Function: pop
+template <typename T, unsigned TF_MAX_PRIORITY>
+TF_FORCE_INLINE T TaskQueue<T, TF_MAX_PRIORITY>::pop(unsigned p) {
+
+  int64_t b = _bottom[p].data.load(std::memory_order_relaxed) - 1;
+  Array* a = _array[p].load(std::memory_order_relaxed);
+  _bottom[p].data.store(b, std::memory_order_relaxed);
+  std::atomic_thread_fence(std::memory_order_seq_cst);
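+  // the full fence orders the speculative bottom update before the load of
+  // top, so this pop() and a concurrent steal() cannot both take the last
+  // item unnoticed (Chase-Lev work-stealing protocol)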
+  int64_t t = _top[p].data.load(std::memory_order_relaxed);
+
+  T item {nullptr};
+
+  if(t <= b) {
+    item = a->pop(b);
+    if(t == b) {
+      // the last item just got stolen
+      if(!_top[p].data.compare_exchange_strong(t, t+1,
+                                               std::memory_order_seq_cst,
+                                               std::memory_order_relaxed)) {
+        item = nullptr;
+      }
+      _bottom[p].data.store(b + 1, std::memory_order_relaxed);
+    }
+  }
+  else {
+    _bottom[p].data.store(b + 1, std::memory_order_relaxed);
+  }
+
+  return item;
+}
+
+// Function: steal
+template <typename T, unsigned TF_MAX_PRIORITY>
+T TaskQueue<T, TF_MAX_PRIORITY>::steal() {
+  for(unsigned i=0; i<TF_MAX_PRIORITY; i++) {
+    if(auto t = steal(i); t) {
+      return t;
+    }
+  }
+  return nullptr;
+}
+
+// Function: steal
+template <typename T, unsigned TF_MAX_PRIORITY>
+T TaskQueue<T, TF_MAX_PRIORITY>::steal(unsigned p) {
+  
+  int64_t t = _top[p].data.load(std::memory_order_acquire);
+  std::atomic_thread_fence(std::memory_order_seq_cst);
+  int64_t b = _bottom[p].data.load(std::memory_order_acquire);
+
+  T item {nullptr};
+
+  if(t < b) {
+    Array* a = _array[p].load(std::memory_order_consume);
+    item = a->pop(t);
+    if(!_top[p].data.compare_exchange_strong(t, t+1,
+                                             std::memory_order_seq_cst,
+                                             std::memory_order_relaxed)) {
+      return nullptr;
+    }
+  }
+
+  return item;
+}
+
+// Function: capacity
+template <typename T, unsigned TF_MAX_PRIORITY>
+int64_t TaskQueue<T, TF_MAX_PRIORITY>::capacity() const noexcept {
+  size_t s;
+  unroll<0, TF_MAX_PRIORITY, 1>([&](auto i) { 
+    s = i ? capacity(i) + s : capacity(i); 
+  });
+  return s;
+}
+
+// Function: capacity
+template <typename T, unsigned TF_MAX_PRIORITY>
+int64_t TaskQueue<T, TF_MAX_PRIORITY>::capacity(unsigned p) const noexcept {
+  return _array[p].load(std::memory_order_relaxed)->capacity();
+}
+
+template <typename T, unsigned TF_MAX_PRIORITY>
+TF_NO_INLINE typename TaskQueue<T, TF_MAX_PRIORITY>::Array*
+  TaskQueue<T, TF_MAX_PRIORITY>::resize_array(Array* a, unsigned p, std::int64_t b, std::int64_t t) {
+
+  Array* tmp = a->resize(b, t);
+  _garbage[p].push_back(a);
+  std::swap(a, tmp);
+  _array[p].store(a, std::memory_order_release);
+  // Note: the relaxed store used in the original paper makes thread sanitizer complain
+  //_array.store(a, std::memory_order_relaxed);
+  return a;
+}
+
+
+}  // end of namespace tf -----------------------------------------------------
diff --git a/myxpcs/include/taskflow_/core/worker.hpp b/myxpcs/include/taskflow_/core/worker.hpp
new file mode 100644
index 0000000..8f86381
--- /dev/null
+++ b/myxpcs/include/taskflow_/core/worker.hpp
@@ -0,0 +1,172 @@
+#pragma once
+
+#include "declarations.hpp"
+#include "tsq.hpp"
+#include "notifier.hpp"
+
+/**
+@file worker.hpp
+@brief worker include file
+*/
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// Class Definition: Worker
+// ----------------------------------------------------------------------------
+
+/**
+@class Worker
+
+@brief class to create a worker in an executor
+
+The class is primarily used by the executor to perform the work-stealing algorithm.
+Users can access a worker object and alter its properties
+(e.g., changing the thread affinity in a POSIX-like system)
+through tf::WorkerInterface.
+*/
+class Worker {
+
+  friend class Executor;
+  friend class WorkerView;
+
+  public:
+
+    /**
+    @brief queries the worker id associated with its parent executor
+
+    A worker id is an unsigned integer in the range <tt>[0, N)</tt>,
+    where @c N is the number of workers spawned at the construction
+    time of the executor.
+    */
+    inline size_t id() const { return _id; }
+
+    /**
+    @brief acquires a pointer access to the underlying thread
+    */
+    inline std::thread* thread() const { return _thread; }
+
+    /**
+    @brief queries the size of the queue (i.e., number of enqueued tasks to
+           run) associated with the worker
+    */
+    inline size_t queue_size() const { return _wsq.size(); }
+    
+    /**
+    @brief queries the current capacity of the queue
+    */
+    inline size_t queue_capacity() const { return static_cast<size_t>(_wsq.capacity()); }
+
+  private:
+
+    size_t _id;
+    size_t _vtm;
+    Executor* _executor;
+    std::thread* _thread;
+    Notifier::Waiter* _waiter;
+    std::default_random_engine _rdgen { std::random_device{}() };
+    TaskQueue<Node*> _wsq;
+    Node* _cache;
+};
+
+// ----------------------------------------------------------------------------
+// Class Definition: PerThreadWorker
+// ----------------------------------------------------------------------------
+
+/**
+@private
+*/
+//struct PerThreadWorker {
+//
+//  Worker* worker;
+//
+//  PerThreadWorker() : worker {nullptr} {}
+//
+//  PerThreadWorker(const PerThreadWorker&) = delete;
+//  PerThreadWorker(PerThreadWorker&&) = delete;
+//
+//  PerThreadWorker& operator = (const PerThreadWorker&) = delete;
+//  PerThreadWorker& operator = (PerThreadWorker&&) = delete;
+//};
+
+/**
+@private
+*/
+//inline PerThreadWorker& this_worker() {
+//  thread_local PerThreadWorker worker;
+//  return worker;
+//}
+
+
+// ----------------------------------------------------------------------------
+// Class Definition: WorkerView
+// ----------------------------------------------------------------------------
+
+/**
+@class WorkerView
+
+@brief class to create an immutable view of a worker in an executor
+
+An executor keeps a set of internal worker threads to run tasks.
+A worker view provides users an immutable interface to observe
+when a worker runs a task, and the view object is only accessible
+from an observer derived from tf::ObserverInterface.
+*/
+class WorkerView {
+
+  friend class Executor;
+
+  public:
+
+    /**
+    @brief queries the worker id associated with its parent executor
+
+    A worker id is an unsigned integer in the range <tt>[0, N)</tt>,
+    where @c N is the number of workers spawned at the construction
+    time of the executor.
+    */
+    size_t id() const;
+
+    /**
+    @brief queries the size of the queue (i.e., number of pending tasks to
+           run) associated with the worker
+    */
+    size_t queue_size() const;
+
+    /**
+    @brief queries the current capacity of the queue
+    */
+    size_t queue_capacity() const;
+
+  private:
+
+    WorkerView(const Worker&);
+    WorkerView(const WorkerView&) = default;
+
+    const Worker& _worker;
+
+};
+
+// Constructor
+inline WorkerView::WorkerView(const Worker& w) : _worker{w} {
+}
+
+// function: id
+inline size_t WorkerView::id() const {
+  return _worker._id;
+}
+
+// Function: queue_size
+inline size_t WorkerView::queue_size() const {
+  return _worker._wsq.size();
+}
+
+// Function: queue_capacity
+inline size_t WorkerView::queue_capacity() const {
+  return static_cast<size_t>(_worker._wsq.capacity());
+}
+
+
+}  // end of namespace tf -----------------------------------------------------
+
+
diff --git a/myxpcs/include/taskflow_/cuda/algorithm/find.hpp b/myxpcs/include/taskflow_/cuda/algorithm/find.hpp
new file mode 100644
index 0000000..f344666
--- /dev/null
+++ b/myxpcs/include/taskflow_/cuda/algorithm/find.hpp
@@ -0,0 +1,294 @@
+#pragma once
+
+#include "for_each.hpp"
+#include "reduce.hpp"
+
+/**
+@file taskflow/cuda/algorithm/find.hpp
+@brief cuda find algorithms include file
+*/
+
+namespace tf::detail {
+
+/** @private */
+template <typename T>
+struct cudaFindPair {
+
+  T key;
+  unsigned index;
+
+  __device__ operator unsigned () const { return index; }
+};
+
+/** @private */
+template <typename P, typename I, typename U>
+void cuda_find_if_loop(P&& p, I input, unsigned count, unsigned* idx, U pred) {
+
+  if(count == 0) {
+    cuda_single_task(p, [=] __device__ () { *idx = 0; });
+    return;
+  }
+
+  using E = std::decay_t<P>;
+
+  auto B = (count + E::nv - 1) / E::nv;
+
+  // set the index to the maximum
+  cuda_single_task(p, [=] __device__ () { *idx = count; });
+
+  // launch the kernel to atomic-find the minimum
+  cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) {
+
+    __shared__ unsigned shm_id;
+
+    if(!tid) {
+      shm_id = count;
+    }
+
+    __syncthreads();
+
+    auto tile = cuda_get_tile(bid, E::nv, count);
+
+    auto x = cuda_mem_to_reg_strided<E::nt, E::vt>(
+      input + tile.begin, tid, tile.count()
+    );
+
+    auto id = count;
+
+    for(unsigned i=0; i<E::vt; i++) {
+      auto j = E::nt*i + tid;
+      if(j < tile.count() && pred(x[i])) {
+        id = j + tile.begin;
+        break;
+      }
+    }
+
+    // Note: the reduce version is not faster though
+    // reduce to a scalar per block.
+    //__shared__ typename cudaBlockReduce<E::nt, unsigned>::Storage shm;
+
+    //id = cudaBlockReduce<E::nt, unsigned>()(
+    //  tid,
+    //  id,
+    //  shm,
+    //  (tile.count() < E::nt ? tile.count() : E::nt),
+    //  cuda_minimum<unsigned>{},
+    //  false
+    //);
+
+    // only need the minimum id
+    atomicMin(&shm_id, id);
+    __syncthreads();
+
+    // reduce all to the global memory
+    if(!tid) {
+      atomicMin(idx, shm_id);
+      //atomicMin(idx, id);
+    }
+  });
+}
+
+/** @private */
+template <typename P, typename I, typename O>
+void cuda_min_element_loop(
+  P&& p, I input, unsigned count, unsigned* idx, O op, void* ptr
+) {
+
+  if(count == 0) {
+    cuda_single_task(p, [=] __device__ () { *idx = 0; });
+    return;
+  }
+
+  using T = cudaFindPair<typename std::iterator_traits<I>::value_type>;
+
+  cuda_uninitialized_reduce_loop(p,
+    cuda_make_load_iterator<T>([=]__device__(auto i){
+      return T{*(input+i), i};
+    }),
+    count,
+    idx,
+    [=] __device__ (const auto& a, const auto& b) {
+      return op(a.key, b.key) ? a : b;
+    },
+    ptr
+  );
+}
+
+/** @private */
+template <typename P, typename I, typename O>
+void cuda_max_element_loop(
+  P&& p, I input, unsigned count, unsigned* idx, O op, void* ptr
+) {
+
+  if(count == 0) {
+    cuda_single_task(p, [=] __device__ () { *idx = 0; });
+    return;
+  }
+
+  using T = cudaFindPair<typename std::iterator_traits<I>::value_type>;
+
+  cuda_uninitialized_reduce_loop(p,
+    cuda_make_load_iterator<T>([=]__device__(auto i){
+      return T{*(input+i), i};
+    }),
+    count,
+    idx,
+    [=] __device__ (const auto& a, const auto& b) {
+      return op(a.key, b.key) ? b : a;
+    },
+    ptr
+  );
+}
+
+}  // end of namespace tf::detail ---------------------------------------------
+
+namespace tf {
+
+
+// ----------------------------------------------------------------------------
+// cuda_find_if
+// ----------------------------------------------------------------------------
+
+/**
+@brief finds the index of the first element that satisfies the given criteria
+
+@tparam P execution policy type
+@tparam I input iterator type
+@tparam U unary operator type
+
+@param p execution policy
+@param first iterator to the beginning of the range
+@param last iterator to the end of the range
+@param idx pointer to the index of the found element
+@param op unary operator which returns @c true for the required element
+
+The function launches kernels asynchronously to find the index @c idx of the
+first element in the range <tt>[first, last)</tt>
+such that <tt>op(*(first+idx))</tt> is true.
+This is equivalent to the parallel execution of the following loop:
+
+@code{.cpp}
+unsigned idx = 0;
+for(; first != last; ++first, ++idx) {
+  if (op(*first)) {
+    return idx;
+  }
+}
+return idx;
+@endcode
+*/
+template <typename P, typename I, typename U>
+void cuda_find_if(
+  P&& p, I first, I last, unsigned* idx, U op
+) {
+  detail::cuda_find_if_loop(p, first, std::distance(first, last), idx, op);
+}
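+
+/*
+A short usage sketch (assuming @c d_vec points to @c N integers in device
+memory, @c idx to one @c unsigned in unified memory, and nvcc's extended
+lambda support is enabled):
+
+@code{.cpp}
+cudaStream_t stream;
+cudaStreamCreate(&stream);
+
+unsigned* idx;
+cudaMallocManaged(&idx, sizeof(unsigned));
+
+tf::cuda_find_if(tf::cudaDefaultExecutionPolicy(stream), d_vec, d_vec + N, idx,
+  [] __device__ (int v) { return v < 0; }
+);
+cudaStreamSynchronize(stream);   // *idx == N if no element satisfies the predicate
+@endcode
+*/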
+
+// ----------------------------------------------------------------------------
+// cuda_min_element
+// ----------------------------------------------------------------------------
+
+// Function: min-element_bufsz
+template <unsigned NT, unsigned VT>  
+template <typename T>
+unsigned cudaExecutionPolicy<NT, VT>::min_element_bufsz(unsigned count) {
+  return reduce_bufsz<detail::cudaFindPair<T>>(count);
+}
+
+/**
+@brief finds the index of the minimum element in a range
+
+@tparam P execution policy type
+@tparam I input iterator type
+@tparam O comparator type
+
+@param p execution policy object
+@param first iterator to the beginning of the range
+@param last iterator to the end of the range
+@param idx solution index of the minimum element
+@param op comparison function object
+@param buf pointer to the buffer
+
+The function launches kernels asynchronously to find
+the smallest element in the range <tt>[first, last)</tt>
+using the given comparator @c op.
+You need to provide a buffer that holds at least
+tf::cudaExecutionPolicy::min_element_bufsz bytes for internal use.
+The function is equivalent to a parallel execution of the following loop:
+
+@code{.cpp}
+if(first == last) {
+  return 0;
+}
+auto smallest = first;
+for (++first; first != last; ++first) {
+  if (op(*first, *smallest)) {
+    smallest = first;
+  }
+}
+return std::distance(first, smallest);
+@endcode
+*/
+template <typename P, typename I, typename O>
+void cuda_min_element(P&& p, I first, I last, unsigned* idx, O op, void* buf) {
+  detail::cuda_min_element_loop(
+    p, first, std::distance(first, last), idx, op, buf
+  );
+}
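+
+/*
+A usage sketch (assuming @c d_vec, @c N, @c idx, and @c stream as in the
+find-if sketch above); the scratch buffer is sized with the min_element_bufsz
+member defined earlier in this file:
+
+@code{.cpp}
+tf::cudaDefaultExecutionPolicy policy(stream);
+
+void* buf;
+cudaMalloc(&buf, policy.min_element_bufsz<int>(N));
+
+tf::cuda_min_element(policy, d_vec, d_vec + N, idx,
+  [] __device__ (int a, int b) { return a < b; }, buf
+);
+cudaStreamSynchronize(stream);
+@endcode
+*/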
+
+// ----------------------------------------------------------------------------
+// cuda_max_element
+// ----------------------------------------------------------------------------
+
+// Function: max_element_bufsz
+template <unsigned NT, unsigned VT>  
+template <typename T>
+unsigned cudaExecutionPolicy<NT, VT>::max_element_bufsz(unsigned count) {
+  return reduce_bufsz<detail::cudaFindPair<T>>(count);
+}
+
+/**
+@brief finds the index of the maximum element in a range
+
+@tparam P execution policy type
+@tparam I input iterator type
+@tparam O comparator type
+
+@param p execution policy object
+@param first iterator to the beginning of the range
+@param last iterator to the end of the range
+@param idx solution index of the maximum element
+@param op comparison function object
+@param buf pointer to the buffer
+
+The function launches kernels asynchronously to find
+the largest element in the range <tt>[first, last)</tt>
+using the given comparator @c op.
+You need to provide a buffer that holds at least
+tf::cudaExecutionPolicy::max_element_bufsz bytes for internal use.
+The function is equivalent to a parallel execution of the following loop:
+
+@code{.cpp}
+if(first == last) {
+  return 0;
+}
+auto largest = first;
+for (++first; first != last; ++first) {
+  if (op(*largest, *first)) {
+    largest = first;
+  }
+}
+return std::distance(first, largest);
+@endcode
+*/
+template <typename P, typename I, typename O>
+void cuda_max_element(P&& p, I first, I last, unsigned* idx, O op, void* buf) {
+  detail::cuda_max_element_loop(
+    p, first, std::distance(first, last), idx, op, buf
+  );
+}
+
+
+}  // end of namespace tf -----------------------------------------------------
+
+
diff --git a/myxpcs/include/taskflow_/cuda/algorithm/for_each.hpp b/myxpcs/include/taskflow_/cuda/algorithm/for_each.hpp
new file mode 100644
index 0000000..38a6f85
--- /dev/null
+++ b/myxpcs/include/taskflow_/cuda/algorithm/for_each.hpp
@@ -0,0 +1,315 @@
+#pragma once
+
+#include "../cudaflow.hpp"
+
+/**
+@file taskflow/cuda/algorithm/for_each.hpp
+@brief cuda parallel-iteration algorithms include file
+*/
+
+namespace tf {
+
+namespace detail {
+
+/**
+@private
+*/
+template <size_t nt, size_t vt, typename I, typename C>
+__global__ void cuda_for_each_kernel(I first, unsigned count, C c) {
+  auto tid = threadIdx.x;
+  auto bid = blockIdx.x;
+  auto tile = cuda_get_tile(bid, nt*vt, count);
+  cuda_strided_iterate<nt, vt>(
+    [=](auto, auto j) {
+      c(*(first + tile.begin + j));
+    }, 
+    tid, tile.count()
+  );
+}
+
+/** @private */
+template <size_t nt, size_t vt, typename I, typename C>
+__global__ void cuda_for_each_index_kernel(I first, I inc, unsigned count, C c) {
+  auto tid = threadIdx.x;
+  auto bid = blockIdx.x;
+  auto tile = cuda_get_tile(bid, nt*vt, count);
+  cuda_strided_iterate<nt, vt>(
+    [=]__device__(auto, auto j) {
+      c(first + inc*(tile.begin+j));
+    }, 
+    tid, tile.count()
+  );
+}
+
+}  // end of namespace detail -------------------------------------------------
+
+// ----------------------------------------------------------------------------
+// cuda standard algorithms: single_task/for_each/for_each_index
+// ----------------------------------------------------------------------------
+
+/**
+@brief runs a callable asynchronously using one kernel thread
+
+@tparam P execution policy type
+@tparam C closure type
+
+@param p execution policy
+@param c closure to run by one kernel thread
+
+The function launches a single kernel thread to run the given callable
+through the stream in the execution policy object.
+*/
+template <typename P, typename C>
+void cuda_single_task(P&& p, C c) {
+  cuda_kernel<<<1, 1, 0, p.stream()>>>(
+    [=]__device__(auto, auto) mutable { c(); }
+  );
+}
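+
+/*
+A quick sketch: run one device-side callable on a user-managed stream.
+
+@code{.cpp}
+cudaStream_t stream;
+cudaStreamCreate(&stream);
+
+tf::cuda_single_task(tf::cudaDefaultExecutionPolicy(stream),
+  [] __device__ () { printf("run by exactly one GPU thread\n"); }
+);
+cudaStreamSynchronize(stream);
+@endcode
+*/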
+
+/**
+@brief performs asynchronous parallel iterations over a range of items
+
+@tparam P execution policy type
+@tparam I input iterator type
+@tparam C unary operator type
+
+@param p execution policy object
+@param first iterator to the beginning of the range
+@param last iterator to the end of the range
+@param c unary operator to apply to each dereferenced iterator
+
+This function is equivalent to a parallel execution of the following loop
+on a GPU:
+
+@code{.cpp}
+for(auto itr = first; itr != last; itr++) {
+  c(*itr);
+}
+@endcode
+*/
+template <typename P, typename I, typename C>
+void cuda_for_each(P&& p, I first, I last, C c) {
+  
+  using E = std::decay_t<P>;
+
+  unsigned count = std::distance(first, last);
+
+  if(count == 0) {
+    return;
+  }
+
+  detail::cuda_for_each_kernel<E::nt, E::vt, I, C><<<E::num_blocks(count), E::nt, 0, p.stream()>>>(
+    first, count, c
+  );
+}
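+
+/*
+A usage sketch (assuming @c d_vec points to @c N floats in device memory and
+@c stream is a valid CUDA stream):
+
+@code{.cpp}
+tf::cuda_for_each(tf::cudaDefaultExecutionPolicy(stream), d_vec, d_vec + N,
+  [] __device__ (float& v) { v *= 2.0f; }
+);
+cudaStreamSynchronize(stream);
+@endcode
+*/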
+
+/**
+@brief performs asynchronous parallel iterations over
+       an index-based range of items
+
+@tparam P execution policy type
+@tparam I input index type
+@tparam C unary operator type
+
+@param p execution policy object
+@param first index to the beginning of the range
+@param last  index to the end of the range
+@param inc step size between successive iterations
+@param c unary operator to apply to each index
+
+This function is equivalent to a parallel execution of
+the following loop on a GPU:
+
+@code{.cpp}
+// inc is positive: [first, last)
+for(auto i=first; i<last; i+=inc) {
+  c(i);
+}
+
+// inc is negative: [first, last)
+for(auto i=first; i>last; i+=inc) {
+  c(i);
+}
+@endcode
+*/
+template <typename P, typename I, typename C>
+void cuda_for_each_index(P&& p, I first, I last, I inc, C c) {
+  
+  using E = std::decay_t<P>;
+
+  unsigned count = distance(first, last, inc);
+
+  if(count == 0) {
+    return;
+  }
+
+  detail::cuda_for_each_index_kernel<E::nt, E::vt, I, C><<<E::num_blocks(count), E::nt, 0, p.stream()>>>(
+    first, inc, count, c
+  );
+}
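+
+/*
+A usage sketch of the index-based variant (same assumptions as above, with
+@c N an @c int): zero every second element of @c d_vec.
+
+@code{.cpp}
+tf::cuda_for_each_index(tf::cudaDefaultExecutionPolicy(stream), 0, N, 2,
+  [=] __device__ (int i) { d_vec[i] = 0.0f; }
+);
+cudaStreamSynchronize(stream);
+@endcode
+*/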
+
+// ----------------------------------------------------------------------------
+// single_task
+// ----------------------------------------------------------------------------
+
+/** @private */
+template <typename C>
+__global__ void cuda_single_task(C callable) {
+  callable();
+}
+
+// Function: single_task
+template <typename C>
+cudaTask cudaFlow::single_task(C c) {
+  return kernel(1, 1, 0, cuda_single_task<C>, c);
+}
+
+// Function: single_task
+template <typename C>
+void cudaFlow::single_task(cudaTask task, C c) {
+  return kernel(task, 1, 1, 0, cuda_single_task<C>, c);
+}
+
+// Function: single_task
+template <typename C>
+cudaTask cudaFlowCapturer::single_task(C callable) {
+  return on([=] (cudaStream_t stream) mutable {
+    cuda_single_task(cudaDefaultExecutionPolicy(stream), callable);
+  });
+}
+
+// Function: single_task
+template <typename C>
+void cudaFlowCapturer::single_task(cudaTask task, C callable) {
+  on(task, [=] (cudaStream_t stream) mutable {
+    cuda_single_task(cudaDefaultExecutionPolicy(stream), callable);
+  });
+}
+
+// ----------------------------------------------------------------------------
+// cudaFlow: for_each, for_each_index
+// ----------------------------------------------------------------------------
+
+// Function: for_each
+template <typename I, typename C>
+cudaTask cudaFlow::for_each(I first, I last, C c) {
+
+  using E = cudaDefaultExecutionPolicy;
+  
+  unsigned count = std::distance(first, last);
+  
+  // TODO:
+  //if(count == 0) {
+  //  return;
+  //}
+
+  return kernel(
+    E::num_blocks(count), E::nt, 0, 
+    detail::cuda_for_each_kernel<E::nt, E::vt, I, C>, first, count, c
+  );
+}
+
+// Function: for_each
+template <typename I, typename C>
+void cudaFlow::for_each(cudaTask task, I first, I last, C c) {
+
+  using E = cudaDefaultExecutionPolicy;
+  
+  unsigned count = std::distance(first, last);
+
+  // TODO:
+  //if(count == 0) {
+  //  return;
+  //}
+  
+  kernel(task, 
+    E::num_blocks(count), E::nt, 0, 
+    detail::cuda_for_each_kernel<E::nt, E::vt, I, C>, first, count, c
+  );
+}
+
+// Function: for_each_index
+template <typename I, typename C>
+cudaTask cudaFlow::for_each_index(I first, I last, I inc, C c) {
+
+  using E = cudaDefaultExecutionPolicy;
+
+  unsigned count = distance(first, last, inc);
+
+  // TODO:
+  //if(count == 0) {
+  //  return;
+  //}
+
+  return kernel(
+    E::num_blocks(count), E::nt, 0, 
+    detail::cuda_for_each_index_kernel<E::nt, E::vt, I, C>, first, inc, count, c
+  );
+}
+
+// Function: for_each_index
+template <typename I, typename C>
+void cudaFlow::for_each_index(cudaTask task, I first, I last, I inc, C c) {
+  
+  using E = cudaDefaultExecutionPolicy;
+
+  unsigned count = distance(first, last, inc);
+  
+  // TODO:
+  //if(count == 0) {
+  //  return;
+  //}
+
+  return kernel(task,
+    E::num_blocks(count), E::nt, 0, 
+    detail::cuda_for_each_index_kernel<E::nt, E::vt, I, C>, first, inc, count, c
+  );
+}
+
+// ----------------------------------------------------------------------------
+// cudaFlowCapturer: for_each, for_each_index
+// ----------------------------------------------------------------------------
+
+// Function: for_each
+template <typename I, typename C>
+cudaTask cudaFlowCapturer::for_each(I first, I last, C c) {
+  return on([=](cudaStream_t stream) mutable {
+    cuda_for_each(cudaDefaultExecutionPolicy(stream), first, last, c);
+  });
+}
+
+// Function: for_each_index
+template <typename I, typename C>
+cudaTask cudaFlowCapturer::for_each_index(I beg, I end, I inc, C c) {
+  return on([=] (cudaStream_t stream) mutable {
+    cuda_for_each_index(cudaDefaultExecutionPolicy(stream), beg, end, inc, c);
+  });
+}
+
+// Function: for_each
+template <typename I, typename C>
+void cudaFlowCapturer::for_each(cudaTask task, I first, I last, C c) {
+  on(task, [=](cudaStream_t stream) mutable {
+    cuda_for_each(cudaDefaultExecutionPolicy(stream), first, last, c);
+  });
+}
+
+// Function: for_each_index
+template <typename I, typename C>
+void cudaFlowCapturer::for_each_index(
+  cudaTask task, I beg, I end, I inc, C c
+) {
+  on(task, [=] (cudaStream_t stream) mutable {
+    cuda_for_each_index(cudaDefaultExecutionPolicy(stream), beg, end, inc, c);
+  });
+}
+
+
+
+}  // end of namespace tf -----------------------------------------------------
+
+
+
+
+
+
diff --git a/myxpcs/include/taskflow_/cuda/algorithm/matmul.hpp b/myxpcs/include/taskflow_/cuda/algorithm/matmul.hpp
new file mode 100644
index 0000000..d0f6620
--- /dev/null
+++ b/myxpcs/include/taskflow_/cuda/algorithm/matmul.hpp
@@ -0,0 +1,57 @@
+#pragma once
+
+#include "../cudaflow.hpp"
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// row-major matrix multiplication
+// ----------------------------------------------------------------------------
+
+template <typename T>
+__global__ void cuda_matmul(
+  const T* A,
+  const T* B,
+  T* C,
+  size_t M,
+  size_t K,
+  size_t N
+) {
+  __shared__ T A_tile[32][32];
+  __shared__ T B_tile[32][32];
+
+  size_t x = blockIdx.x * blockDim.x + threadIdx.x;
+  size_t y = blockIdx.y * blockDim.y + threadIdx.y;
+
+  T res = 0;
+
+  for(size_t k = 0; k < K; k += 32) {
+    if((threadIdx.x + k) < K && y < M) {
+      A_tile[threadIdx.y][threadIdx.x] = A[y * K + threadIdx.x + k];
+    }
+    else{
+      A_tile[threadIdx.y][threadIdx.x] = 0;
+    }
+
+    if((threadIdx.y + k) < K && x < N) {
+      B_tile[threadIdx.y][threadIdx.x] = B[(threadIdx.y + k) * N + x];
+    }
+    else{
+      B_tile[threadIdx.y][threadIdx.x] = 0;
+    }
+
+    __syncthreads();
+
+    for(size_t i = 0; i < 32; ++i) {
+      res += A_tile[threadIdx.y][i] * B_tile[i][threadIdx.x];
+    }
+    __syncthreads();
+  }
+
+  if(x < N && y < M) {
+    C[y * N + x] = res;
+  }
+
+}
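+
+// A launch sketch (not part of the original header). The kernel expects a
+// 32x32 thread block to match its shared-memory tiles; `dA`, `dB`, and `dC`
+// are placeholder device pointers to row-major M*K, K*N, and M*N arrays.
+//
+//   dim3 block(32, 32);
+//   dim3 grid((N + 31) / 32, (M + 31) / 32);  // x spans columns of C, y spans rows
+//   tf::cuda_matmul<<<grid, block>>>(dA, dB, dC, M, K, N);
+//   cudaDeviceSynchronize();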
+
+} // end of namespace tf ---------------------------------------------------------
diff --git a/myxpcs/include/taskflow_/cuda/algorithm/merge.hpp b/myxpcs/include/taskflow_/cuda/algorithm/merge.hpp
new file mode 100644
index 0000000..d325491
--- /dev/null
+++ b/myxpcs/include/taskflow_/cuda/algorithm/merge.hpp
@@ -0,0 +1,585 @@
+#pragma once
+
+#include "../cudaflow.hpp"
+
+/**
+@file taskflow/cuda/algorithm/merge.hpp
+@brief CUDA merge algorithm include file
+*/
+
+namespace tf::detail {
+
+/**
+@private
+@brief merge bound type
+*/
+enum class cudaMergeBoundType {
+  LOWER,
+  UPPER
+};
+
+/** @private */
+template<typename T, unsigned N>
+struct cudaMergePair {
+  cudaArray<T, N> keys;
+  cudaArray<unsigned, N> indices;
+};
+
+/** @private */
+struct cudaMergeRange {
+  unsigned a_begin, a_end, b_begin, b_end;
+
+  __device__ unsigned a_count() const { return a_end - a_begin; }
+  __device__ unsigned b_count() const { return b_end - b_begin; }
+  __device__ unsigned total() const { return a_count() + b_count(); }
+
+  __device__ cudaRange a_range() const {
+    return cudaRange { a_begin, a_end };
+  }
+  __device__ cudaRange b_range() const {
+    return cudaRange { b_begin, b_end };
+  }
+
+  __device__ cudaMergeRange to_local() const {
+    return cudaMergeRange { 0, a_count(), a_count(), total() };
+  }
+
+  // Partition from mp to the end.
+  __device__ cudaMergeRange partition(unsigned mp0, unsigned diag) const {
+    return cudaMergeRange { a_begin + mp0, a_end, b_begin + diag - mp0, b_end };
+  }
+
+  // Partition from mp0 to mp1.
+  __device__ cudaMergeRange partition(unsigned mp0, unsigned diag0,
+    unsigned mp1, unsigned diag1) const {
+    return cudaMergeRange {
+      a_begin + mp0,
+      a_begin + mp1,
+      b_begin + diag0 - mp0,
+      b_begin + diag1 - mp1
+    };
+  }
+
+  __device__ bool a_valid() const {
+    return a_begin < a_end;
+  }
+
+  __device__ bool b_valid() const {
+    return b_begin < b_end;
+  }
+};
+
+/** @private */
+template<
+  cudaMergeBoundType bounds = cudaMergeBoundType::LOWER,
+  typename a_keys_it, typename b_keys_it, typename comp_t
+>
+__device__ auto cuda_merge_path(
+  a_keys_it a_keys, unsigned a_count,
+  b_keys_it b_keys, unsigned b_count,
+  unsigned diag, comp_t comp
+) {
+
+  unsigned beg = (diag > b_count) ? diag - b_count : 0;
+  unsigned end = diag < a_count ? diag : a_count;
+
+  while(beg < end) {
+    auto mid = (beg + end) / 2;
+    auto a_key = a_keys[mid];
+    auto b_key = b_keys[diag - 1 - mid];
+    bool pred = (cudaMergeBoundType::UPPER == bounds) ?
+      comp(a_key, b_key) :
+      !comp(b_key, a_key);
+
+    if(pred) beg = mid + 1;
+    else end = mid;
+  }
+  return beg;
+}
+
+/** @private */
+template<cudaMergeBoundType bounds, typename keys_it, typename comp_t>
+__device__ auto cuda_merge_path(
+  keys_it keys, cudaMergeRange range, unsigned diag, comp_t comp
+) {
+
+  return cuda_merge_path<bounds>(
+    keys + range.a_begin, range.a_count(),
+    keys + range.b_begin, range.b_count(),
+    diag, comp);
+}
+
+/** @private */
+template<cudaMergeBoundType bounds, bool range_check, typename T, typename comp_t>
+__device__ bool cuda_merge_predicate(
+  T a_key, T b_key, cudaMergeRange range, comp_t comp
+) {
+
+  bool p;
+  if(range_check && !range.a_valid()) {
+    p = false;
+  }
+  else if(range_check && !range.b_valid()) {
+    p = true;
+  }
+  else {
+    p = (cudaMergeBoundType::UPPER == bounds) ? comp(a_key, b_key) :
+                                               !comp(b_key, a_key);
+  }
+  return p;
+}
+
+/** @private */
+inline __device__ auto cuda_compute_merge_range(
+  unsigned a_count, unsigned b_count,
+  unsigned partition, unsigned spacing,
+  unsigned mp0, unsigned mp1
+) {
+
+  auto diag0 = spacing * partition;
+  auto diag1 = min(a_count + b_count, diag0 + spacing);
+
+  return cudaMergeRange { mp0, mp1, diag0 - mp0, diag1 - mp1 };
+}
+
+/**
+@private
+
+Specialization that emits just one LD instruction. It can only be reliably used
+with raw pointer types. Fixed not to use pointer arithmetic so that
+we don't get undefined behavior with unaligned types.
+*/
+template<unsigned nt, unsigned vt, typename T>
+__device__ auto cuda_load_two_streams_reg(
+  const T* a, unsigned a_count, const T* b, unsigned b_count, unsigned tid
+) {
+
+  b -= a_count;
+  cudaArray<T, vt> x;
+  cuda_strided_iterate<nt, vt>([&](auto i, auto index) {
+    const T* p = (index >= a_count) ? b : a;
+    x[i] = p[index];
+  }, tid, a_count + b_count);
+
+  return x;
+}
+
+/** @private */
+template<unsigned nt, unsigned vt, typename T, typename a_it, typename b_it>
+__device__
+std::enable_if_t<
+  !(std::is_pointer<a_it>::value && std::is_pointer<b_it>::value),
+  cudaArray<T, vt>
+> load_two_streams_reg(a_it a, unsigned a_count, b_it b, unsigned b_count, unsigned tid) {
+  b -= a_count;
+  cudaArray<T, vt> x;
+  cuda_strided_iterate<nt, vt>([&](auto i, auto index) {
+    x[i] = (index < a_count) ? a[index] : b[index];
+  }, tid, a_count + b_count);
+  return x;
+}
+
+/** @private */
+template<unsigned nt, unsigned vt, typename A, typename B, typename T, unsigned S>
+__device__ void cuda_load_two_streams_shared(A a, unsigned a_count,
+  B b, unsigned b_count, unsigned tid, T (&shared)[S], bool sync = true
+) {
+  // Load into register then make an unconditional strided store into memory.
+  auto x = cuda_load_two_streams_reg<nt, vt, T>(a, a_count, b, b_count, tid);
+  cuda_reg_to_shared_strided<nt>(x, tid, shared, sync);
+}
+
+/** @private */
+template<unsigned nt, unsigned vt, typename T>
+__device__ auto cuda_gather_two_streams_strided(const T* a,
+  unsigned a_count, const T* b, unsigned b_count, cudaArray<unsigned, vt> indices,
+  unsigned tid) {
+
+  ptrdiff_t b_offset = b - a - a_count;
+  auto count = a_count + b_count;
+
+  cudaArray<T, vt> x;
+  cuda_strided_iterate<nt, vt>([&](auto i, auto j) {
+    ptrdiff_t gather = indices[i];
+    if(gather >= a_count) gather += b_offset;
+    x[i] = a[gather];
+  }, tid, count);
+
+  return x;
+}
+
+/** @private */
+template<unsigned nt, unsigned vt, typename T, typename a_it, typename b_it>
+__device__
+std::enable_if_t<
+  !(std::is_pointer<a_it>::value && std::is_pointer<b_it>::value),
+  cudaArray<T, vt>
+> cuda_gather_two_streams_strided(a_it a,
+  unsigned a_count, b_it b, unsigned b_count, cudaArray<unsigned, vt> indices, unsigned tid) {
+
+  b -= a_count;
+  cudaArray<T, vt> x;
+  cuda_strided_iterate<nt, vt>([&](auto i, auto j) {
+    x[i] = (indices[i] < a_count) ? a[indices[i]] : b[indices[i]];
+  }, tid, a_count + b_count);
+
+  return x;
+}
+
+/** @private */
+template<unsigned nt, unsigned vt, typename a_it, typename b_it, typename c_it>
+__device__ void cuda_transfer_two_streams_strided(
+  a_it a, unsigned a_count, b_it b, unsigned b_count,
+  cudaArray<unsigned, vt> indices, unsigned tid, c_it c
+) {
+
+  using T = typename std::iterator_traits<a_it>::value_type;
+  auto x = cuda_gather_two_streams_strided<nt, vt, T>(
+    a, a_count, b, b_count, indices, tid
+  );
+
+  cuda_reg_to_mem_strided<nt>(x, tid, a_count + b_count, c);
+}
+
+
+/**
+@private
+
+This function must be able to dereference keys[a_begin] and keys[b_begin],
+no matter the indices for each. The caller should allocate at least
+nt * vt + 1 elements for keys_shared.
+*/
+template<cudaMergeBoundType bounds, unsigned vt, typename T, typename comp_t>
+__device__ auto cuda_serial_merge(
+  const T* keys_shared, cudaMergeRange range, comp_t comp, bool sync = true
+) {
+
+  auto a_key = keys_shared[range.a_begin];
+  auto b_key = keys_shared[range.b_begin];
+
+  cudaMergePair<T, vt> merge_pair;
+  cuda_iterate<vt>([&](auto i) {
+    bool p = cuda_merge_predicate<bounds, true>(a_key, b_key, range, comp);
+    auto index = p ? range.a_begin : range.b_begin;
+
+    merge_pair.keys[i] = p ? a_key : b_key;
+    merge_pair.indices[i] = index;
+
+    T c_key = keys_shared[++index];
+    if(p) a_key = c_key, range.a_begin = index;
+    else b_key = c_key, range.b_begin = index;
+  });
+
+  if(sync) __syncthreads();
+  return merge_pair;
+}
+
+/**
+@private
+
+Load arrays a and b from global memory and merge into register.
+*/
+template<cudaMergeBoundType bounds,
+  unsigned nt, unsigned vt,
+  typename a_it, typename b_it, typename T, typename comp_t, unsigned S
+>
+__device__ auto block_merge_from_mem(
+  a_it a, b_it b, cudaMergeRange range_mem, unsigned tid, comp_t comp, T (&keys_shared)[S]
+) {
+
+  static_assert(S >= nt * vt + 1,
+    "block_merge_from_mem requires temporary storage of at "
+    "least nt * vt + 1 items");
+
+  // Load the data into shared memory.
+  cuda_load_two_streams_shared<nt, vt>(
+    a + range_mem.a_begin, range_mem.a_count(),
+    b + range_mem.b_begin, range_mem.b_count(),
+    tid, keys_shared, true
+  );
+
+  // Run a merge path to find the start of the serial merge for each thread.
+  auto range_local = range_mem.to_local();
+  auto diag = vt * tid;
+  auto mp = cuda_merge_path<bounds>(keys_shared, range_local, diag, comp);
+
+  // Compute the ranges of the sources in shared memory. The end iterators
+  // of the range are inaccurate, but still facilitate exact merging, because
+  // only vt elements will be merged.
+  auto merged = cuda_serial_merge<bounds, vt>(
+    keys_shared, range_local.partition(mp, diag), comp
+  );
+
+  return merged;
+}
+
+/** @private */
+template<cudaMergeBoundType bounds,
+  typename P, typename a_keys_it, typename b_keys_it, typename comp_t
+>
+void cuda_merge_path_partitions(
+  P&& p,
+  a_keys_it a, unsigned a_count,
+  b_keys_it b, unsigned b_count,
+  unsigned spacing,
+  comp_t comp,
+  unsigned* buf
+) {
+
+  //int num_partitions = (int)div_up(a_count + b_count, spacing) + 1;
+
+  unsigned num_partitions = (a_count + b_count + spacing - 1) / spacing + 1;
+
+  const unsigned nt = 128;
+  const unsigned vt = 1;
+  const unsigned nv = nt * vt;
+
+  unsigned B = (num_partitions + nv - 1) / nv;  // nt = 128, vt = 1
+
+  cuda_kernel<<<B, nt, 0, p.stream()>>>([=]__device__(auto tid, auto bid) {
+    auto range = cuda_get_tile(bid, nt * vt, num_partitions);
+    cuda_strided_iterate<nt, vt>([=](auto, auto j) {
+      auto index = range.begin + j;
+      auto diag = min(spacing * index, a_count + b_count);
+      buf[index] = cuda_merge_path<bounds>(a, a_count, b, b_count, diag, comp);
+    }, tid, range.count());
+  });
+}
+
+//template<typename segments_it>
+//auto load_balance_partitions(int64_t dest_count, segments_it segments,
+//  int num_segments, int spacing, context_t& context) ->
+//  mem_t<typename std::iterator_traits<segments_it>::value_type> {
+//
+//  typedef typename std::iterator_traits<segments_it>::value_type int_t;
+//  return merge_path_partitions<bounds_upper>(counting_iterator_t<int_t>(0),
+//    dest_count, segments, num_segments, spacing, less_t<int_t>(), context);
+//}
+
+//template<bounds_t bounds, typename keys_it>
+//mem_t<int> binary_search_partitions(keys_it keys, int count, int num_items,
+//  int spacing, context_t& context) {
+//
+//  int num_partitions = div_up(count, spacing) + 1;
+//  mem_t<int> mem(num_partitions, context);
+//  int* p = mem.data();
+//  transform([=]MGPU_DEVICE(int index) {
+//    int key = min(spacing * index, count);
+//    p[index] = binary_search<bounds>(keys, num_items, key, less_t<int>());
+//  }, num_partitions, context);
+//  return mem;
+//}
+
+/** @private */
+template<
+  typename P,
+  typename a_keys_it, typename a_vals_it,
+  typename b_keys_it, typename b_vals_it,
+  typename c_keys_it, typename c_vals_it,
+  typename comp_t
+>
+void cuda_merge_loop(
+  P&& p,
+  a_keys_it a_keys, a_vals_it a_vals, unsigned a_count,
+  b_keys_it b_keys, b_vals_it b_vals, unsigned b_count,
+  c_keys_it c_keys, c_vals_it c_vals,
+  comp_t comp,
+  void* ptr
+) {
+
+  using E = std::decay_t<P>;
+  using T = typename std::iterator_traits<a_keys_it>::value_type;
+  using V = typename std::iterator_traits<a_vals_it>::value_type;
+
+  auto buf = static_cast<unsigned*>(ptr);
+
+  auto has_values = !std::is_same<V, cudaEmpty>::value;
+
+  cuda_merge_path_partitions<cudaMergeBoundType::LOWER>(
+    p, a_keys, a_count, b_keys, b_count, E::nv, comp, buf
+  );
+
+  unsigned B = p.num_blocks(a_count + b_count);
+
+  // we use small kernel
+  cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) {
+
+    __shared__ union {
+      T keys[E::nv + 1];
+      unsigned indices[E::nv];
+    } shared;
+
+    // Load the range for this CTA and merge the values into register.
+    auto mp0 = buf[bid + 0];
+    auto mp1 = buf[bid + 1];
+    auto range = cuda_compute_merge_range(a_count, b_count, bid, E::nv, mp0, mp1);
+
+    auto merge = block_merge_from_mem<cudaMergeBoundType::LOWER, E::nt, E::vt>(
+      a_keys, b_keys, range, tid, comp, shared.keys
+    );
+
+    auto dest_offset = E::nv * bid;
+    cuda_reg_to_mem_thread<E::nt>(
+      merge.keys, tid, range.total(), c_keys + dest_offset, shared.keys
+    );
+
+    if(has_values) {
+      // Transpose the indices from thread order to strided order.
+      auto indices = cuda_reg_thread_to_strided<E::nt>(
+        merge.indices, tid, shared.indices
+      );
+
+      // Gather the input values and merge into the output values.
+      cuda_transfer_two_streams_strided<E::nt>(
+        a_vals + range.a_begin, range.a_count(),
+        b_vals + range.b_begin, range.b_count(), indices, tid,
+        c_vals + dest_offset
+      );
+    }
+  });
+}
+
+}  // end of namespace tf::detail ---------------------------------------------
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// standalone merge algorithms
+// ----------------------------------------------------------------------------
+
+// Function: merge_bufsz
+template <unsigned NT, unsigned VT>  
+unsigned cudaExecutionPolicy<NT, VT>::merge_bufsz(unsigned a_count, unsigned b_count) {
+  return sizeof(unsigned) * (num_blocks(a_count + b_count + nv) + 1);
+}
+
+
+// ----------------------------------------------------------------------------
+// key-value merge
+// ----------------------------------------------------------------------------
+
+/**
+@brief performs asynchronous key-value merge over a range of keys and values
+
+@tparam P execution policy type
+@tparam a_keys_it first key iterator type
+@tparam a_vals_it first value iterator type
+@tparam b_keys_it second key iterator type
+@tparam b_vals_it second value iterator type
+@tparam c_keys_it output key iterator type
+@tparam c_vals_it output value iterator type
+@tparam C comparator type
+
+@param p execution policy
+@param a_keys_first iterator to the beginning of the first key range
+@param a_keys_last iterator to the end of the first key range
+@param a_vals_first iterator to the beginning of the first value range
+@param b_keys_first iterator to the beginning of the second key range
+@param b_keys_last iterator to the end of the second key range
+@param b_vals_first iterator to the beginning of the second value range
+@param c_keys_first iterator to the beginning of the output key range
+@param c_vals_first iterator to the beginning of the output value range
+@param comp comparator
+@param buf pointer to the temporary buffer
+
+Performs a key-value merge that copies elements from
+<tt>[a_keys_first, a_keys_last)</tt> and <tt>[b_keys_first, b_keys_last)</tt>
+into a single range, <tt>[c_keys_first, c_keys_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first))</tt>
+such that the resulting range is in ascending key order.
+
+At the same time, the merge copies elements from the two associated ranges
+<tt>[a_vals_first, a_vals_first + (a_keys_last - a_keys_first))</tt> and
+<tt>[b_vals_first, b_vals_first + (b_keys_last - b_keys_first))</tt> into a single range,
+<tt>[c_vals_first, c_vals_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first))</tt>
+such that the resulting range is in ascending order
+implied by each input element's associated key.
+
+For example, assume:
+  + @c a_keys = {1, 8};
+  + @c a_vals = {2, 1};
+  + @c b_keys = {3, 7};
+  + @c b_vals = {3, 4};
+
+After the merge, we have:
+  + @c c_keys = {1, 3, 7, 8}
+  + @c c_vals = {2, 3, 4, 1}
+
+*/
+template<
+  typename P,
+  typename a_keys_it, typename a_vals_it,
+  typename b_keys_it, typename b_vals_it,
+  typename c_keys_it, typename c_vals_it,
+  typename C
+>
+void cuda_merge_by_key(
+  P&& p,
+  a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first,
+  b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first,
+  c_keys_it c_keys_first, c_vals_it c_vals_first, 
+  C comp,
+  void* buf
+) {
+
+  unsigned a_count = std::distance(a_keys_first, a_keys_last);
+  unsigned b_count = std::distance(b_keys_first, b_keys_last);
+
+  if(a_count + b_count == 0) {
+    return;
+  }
+
+  detail::cuda_merge_loop(p,
+    a_keys_first, a_vals_first, a_count,
+    b_keys_first, b_vals_first, b_count,
+    c_keys_first, c_vals_first, comp,
+    buf
+  );
+}
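+
+// A usage sketch (not part of the original header) mirroring the documented
+// example above. `a_keys`, `a_vals`, `b_keys`, `b_vals`, `c_keys`, `c_vals`
+// are placeholder device arrays of int and `stream` is a valid CUDA stream.
+//
+//   tf::cudaDefaultExecutionPolicy policy(stream);
+//
+//   // query and allocate the temporary merge buffer
+//   void* buf;
+//   cudaMalloc(&buf, policy.merge_bufsz(2, 2));
+//
+//   tf::cuda_merge_by_key(
+//     policy,
+//     a_keys, a_keys + 2, a_vals,   // keys {1, 8}, values {2, 1}
+//     b_keys, b_keys + 2, b_vals,   // keys {3, 7}, values {3, 4}
+//     c_keys, c_vals,               // result {1, 3, 7, 8} / {2, 3, 4, 1}
+//     [] __device__ (int x, int y) { return x < y; },
+//     buf
+//   );
+//
+//   cudaStreamSynchronize(stream);
+//   cudaFree(buf);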
+
+// ----------------------------------------------------------------------------
+// key-only merge
+// ----------------------------------------------------------------------------
+
+/**
+@brief performs asynchronous key-only merge over a range of keys
+
+@tparam P execution policy type
+@tparam a_keys_it first key iterator type
+@tparam b_keys_it second key iterator type
+@tparam c_keys_it output key iterator type
+@tparam C comparator type
+
+@param p execution policy
+@param a_keys_first iterator to the beginning of the first key range
+@param a_keys_last iterator to the end of the first key range
+@param b_keys_first iterator to the beginning of the second key range
+@param b_keys_last iterator to the end of the second key range
+@param c_keys_first iterator to the beginning of the output key range
+@param comp comparator
+@param buf pointer to the temporary buffer
+
+This function is equivalent to tf::cuda_merge_by_key without values.
+
+*/
+template<typename P,
+  typename a_keys_it, typename b_keys_it, typename c_keys_it, typename C
+>
+void cuda_merge(
+  P&& p,
+  a_keys_it a_keys_first, a_keys_it a_keys_last,
+  b_keys_it b_keys_first, b_keys_it b_keys_last,
+  c_keys_it c_keys_first,
+  C comp,
+  void* buf
+) {
+  cuda_merge_by_key(
+    p,
+    a_keys_first, a_keys_last, (const cudaEmpty*)nullptr,
+    b_keys_first, b_keys_last, (const cudaEmpty*)nullptr,
+    c_keys_first, (cudaEmpty*)nullptr, comp,
+    buf
+  );
+}
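+
+// The key-only variant follows the same calling pattern (a sketch with
+// placeholder device arrays `a_keys`, `b_keys`, `c_keys` of sizes num_a,
+// num_b, and num_a + num_b):
+//
+//   tf::cuda_merge(
+//     policy,
+//     a_keys, a_keys + num_a,
+//     b_keys, b_keys + num_b,
+//     c_keys,
+//     [] __device__ (int x, int y) { return x < y; },
+//     buf  // at least policy.merge_bufsz(num_a, num_b) bytes
+//   );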
+
+
+}  // end of namespace tf -----------------------------------------------------
diff --git a/myxpcs/include/taskflow_/cuda/algorithm/reduce.hpp b/myxpcs/include/taskflow_/cuda/algorithm/reduce.hpp
new file mode 100644
index 0000000..d6ba332
--- /dev/null
+++ b/myxpcs/include/taskflow_/cuda/algorithm/reduce.hpp
@@ -0,0 +1,460 @@
+#pragma once
+
+#include "../cudaflow.hpp"
+
+/**
+@file taskflow/cuda/algorithm/reduce.hpp
+@brief cuda reduce algorithms include file
+*/
+
+namespace tf::detail {
+
+// ----------------------------------------------------------------------------
+// reduction helper functions
+// ----------------------------------------------------------------------------
+
+/** @private */
+template<unsigned nt, typename T>
+struct cudaBlockReduce {
+
+  static const unsigned group_size = std::min(nt, CUDA_WARP_SIZE);
+  static const unsigned num_passes = log2(group_size);
+  static const unsigned num_items = nt / group_size;
+
+  static_assert(
+    nt && (0 == nt % CUDA_WARP_SIZE),
+    "cudaBlockReduce requires num threads to be a multiple of warp_size (32)"
+  );
+
+  /** @private */
+  struct Storage {
+    T data[std::max(nt, 2 * group_size)];
+  };
+
+  template<typename op_t>
+  __device__ T operator()(unsigned, T, Storage&, unsigned, op_t, bool = true) const;
+};
+
+// function: reduce to be called from a block
+template<unsigned nt, typename T>
+template<typename op_t>
+__device__ T cudaBlockReduce<nt, T>::operator ()(
+  unsigned tid, T x, Storage& storage, unsigned count, op_t op, bool ret
+) const {
+
+  // Store your data into shared memory.
+  storage.data[tid] = x;
+  __syncthreads();
+
+  if(tid < group_size) {
+    // Each thread scans within its lane.
+    cuda_strided_iterate<group_size, num_items>([&](auto i, auto j) {
+      if(i > 0) {
+        x = op(x, storage.data[j]);
+      }
+    }, tid, count);
+    storage.data[tid] = x;
+  }
+  __syncthreads();
+
+  auto count2 = count < group_size ? count : group_size;
+  auto first = (1 & num_passes) ? group_size : 0;
+  if(tid < group_size) {
+    storage.data[first + tid] = x;
+  }
+  __syncthreads();
+
+  cuda_iterate<num_passes>([&](auto pass) {
+    if(tid < group_size) {
+      if(auto offset = 1 << pass; tid + offset < count2) {
+        x = op(x, storage.data[first + offset + tid]);
+      }
+      first = group_size - first;
+      storage.data[first + tid] = x;
+    }
+    __syncthreads();
+  });
+
+  if(ret) {
+    x = storage.data[0];
+    __syncthreads();
+  }
+  return x;
+}
+
+// ----------------------------------------------------------------------------
+// cuda_reduce
+// ----------------------------------------------------------------------------
+
+/**
+@private 
+*/
+template <size_t nt, size_t vt, typename I, typename T, typename O>
+__global__ void cuda_reduce_kernel(
+  I input, unsigned count, T* res, O op, void* ptr
+) {
+  
+  using U = typename std::iterator_traits<I>::value_type;
+
+  __shared__ typename cudaBlockReduce<nt, U>::Storage shm;
+  
+  auto tid = threadIdx.x;
+  auto bid = blockIdx.x;
+  auto tile = cuda_get_tile(bid, nt*vt, count);
+  auto x = cuda_mem_to_reg_strided<nt, vt>(
+    input + tile.begin, tid, tile.count()
+  );
+
+  // reduce multiple values per thread into a scalar.
+  U s;
+  cuda_strided_iterate<nt, vt>(
+    [&] (auto i, auto) { s = i ? op(s, x[i]) : x[0]; }, tid, tile.count()
+  );
+  // reduce to a scalar per block.
+  s = cudaBlockReduce<nt, U>()(
+    tid, s, shm, (tile.count() < nt ? tile.count() : nt), op, false
+  );
+
+  if(!tid) {
+    auto buf = static_cast<U*>(ptr);
+    (count <= nt*vt) ? *res = op(*res, s) : buf[bid] = s;
+  }
+}
+
+/** @private */
+template <typename P, typename I, typename T, typename O>
+void cuda_reduce_loop(
+  P&& p, I input, unsigned count, T* res, O op, void* ptr
+) {
+
+  using U = typename std::iterator_traits<I>::value_type;
+  using E = std::decay_t<P>;
+
+  auto buf = static_cast<U*>(ptr);
+  auto B = E::num_blocks(count);
+
+  cuda_reduce_kernel<E::nt, E::vt><<<B, E::nt, 0, p.stream()>>>(
+    input, count, res, op, ptr
+  );
+
+  if(B > 1) {
+    cuda_reduce_loop(p, buf, B, res, op, buf+B);
+  }
+}
+
+// ----------------------------------------------------------------------------
+// cuda_uninitialized_reduce
+// ----------------------------------------------------------------------------
+
+/**
+@private
+*/
+template <size_t nt, size_t vt, typename I, typename T, typename O>
+__global__ void cuda_uninitialized_reduce_kernel(
+  I input, unsigned count, T* res, O op, void* ptr
+) {
+
+  using U = typename std::iterator_traits<I>::value_type;
+
+  __shared__ typename cudaBlockReduce<nt, U>::Storage shm;
+
+  auto tid = threadIdx.x;
+  auto bid = blockIdx.x;
+  auto tile = cuda_get_tile(bid, nt*vt, count);
+  auto x = cuda_mem_to_reg_strided<nt, vt>(
+    input + tile.begin, tid, tile.count()
+  );
+
+  // reduce multiple values per thread into a scalar.
+  U s;
+  cuda_strided_iterate<nt, vt>(
+    [&] (auto i, auto) { s = i ? op(s, x[i]) : x[0]; }, tid, tile.count()
+  );
+
+  // reduce to a scalar per block.
+  s = cudaBlockReduce<nt, U>()(
+    tid, s, shm, (tile.count() < nt ? tile.count() : nt), op, false
+  );
+
+  if(!tid) {
+    auto buf = static_cast<U*>(ptr);
+    (count <= nt*vt) ? *res = s : buf[bid] = s;
+  }
+}
+
+/** 
+@private 
+*/
+template <typename P, typename I, typename T, typename O>
+void cuda_uninitialized_reduce_loop(
+  P&& p, I input, unsigned count, T* res, O op, void* ptr
+) {
+
+  using U = typename std::iterator_traits<I>::value_type;
+  using E = std::decay_t<P>;
+
+  auto buf = static_cast<U*>(ptr);
+  auto B = (count + E::nv - 1) / E::nv;
+
+  cuda_uninitialized_reduce_kernel<E::nt, E::vt><<<B, E::nt, 0, p.stream()>>>(
+    input, count, res, op, buf
+  );
+
+  if(B > 1) {
+    cuda_uninitialized_reduce_loop(p, buf, B, res, op, buf+B);
+  }
+}
+
+}  // namespace tf::detail ----------------------------------------------------
+
+namespace tf {
+
+// Function: reduce_bufsz
+template <unsigned NT, unsigned VT>  
+template <typename T>
+unsigned cudaExecutionPolicy<NT, VT>::reduce_bufsz(unsigned count) {
+  unsigned B = num_blocks(count);
+  unsigned n = 0;
+  while(B > 1) {
+    n += B;
+    B = num_blocks(B);
+  }
+  return n*sizeof(T);
+}
+
+// ----------------------------------------------------------------------------
+// cuda_reduce
+// ----------------------------------------------------------------------------
+
+/**
+@brief performs asynchronous parallel reduction over a range of items
+
+@tparam P execution policy type
+@tparam I input iterator type
+@tparam T value type
+@tparam O binary operator type
+
+@param p execution policy
+@param first iterator to the beginning of the range
+@param last iterator to the end of the range
+@param res pointer to the result
+@param op binary operator to apply to reduce elements
+@param buf pointer to the temporary buffer
+
+This method is equivalent to the parallel execution of the following loop on a GPU:
+
+@code{.cpp}
+while (first != last) {
+  *result = op(*result, *first++);
+}
+@endcode
+ */
+template <typename P, typename I, typename T, typename O>
+void cuda_reduce(
+  P&& p, I first, I last, T* res, O op, void* buf
+) {
+  unsigned count = std::distance(first, last);
+  if(count == 0) {
+    return;
+  }
+  detail::cuda_reduce_loop(p, first, count, res, op, buf);
+}
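+
+// A usage sketch (not part of the original header). `gpu_data` is a placeholder
+// device array of N floats and `gpu_res` a device pointer whose current value
+// participates in the reduction as the initial value.
+//
+//   tf::cudaDefaultExecutionPolicy policy(stream);
+//
+//   void* buf;
+//   cudaMalloc(&buf, policy.reduce_bufsz<float>(N));
+//
+//   tf::cuda_reduce(
+//     policy, gpu_data, gpu_data + N, gpu_res,
+//     [] __device__ (float a, float b) { return a + b; },
+//     buf
+//   );
+//
+//   cudaStreamSynchronize(stream);
+//   cudaFree(buf);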
+
+// ----------------------------------------------------------------------------
+// cuda_uninitialized_reduce
+// ----------------------------------------------------------------------------
+
+/**
+@brief performs asynchronous parallel reduction over a range of items without
+       an initial value
+
+@tparam P execution policy type
+@tparam I input iterator type
+@tparam T value type
+@tparam O binary operator type
+
+@param p execution policy
+@param first iterator to the beginning of the range
+@param last iterator to the end of the range
+@param res pointer to the result
+@param op binary operator to apply to reduce elements
+@param buf pointer to the temporary buffer
+
+This method is equivalent to the parallel execution of the following loop
+on a GPU:
+
+@code{.cpp}
+*result = *first++;  // no initial value participates in the loop
+while (first != last) {
+  *result = op(*result, *first++);
+}
+@endcode
+*/
+template <typename P, typename I, typename T, typename O>
+void cuda_uninitialized_reduce(
+  P&& p, I first, I last, T* res, O op, void* buf
+) {
+  unsigned count = std::distance(first, last);
+  if(count == 0) {
+    return;
+  }
+  detail::cuda_uninitialized_reduce_loop(p, first, count, res, op, buf);
+}
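+
+// The uninitialized variant overwrites *res rather than folding its prior
+// value in, so no identity element is required; a sketch computing the
+// maximum (placeholder names as in the cuda_reduce example above):
+//
+//   tf::cuda_uninitialized_reduce(
+//     policy, gpu_data, gpu_data + N, gpu_res,
+//     [] __device__ (float a, float b) { return a < b ? b : a; },
+//     buf  // at least policy.reduce_bufsz<float>(N) bytes
+//   );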
+
+// ----------------------------------------------------------------------------
+// transform_reduce
+// ----------------------------------------------------------------------------
+
+/**
+@brief performs asynchronous parallel reduction over a range of transformed items
+       with an initial value
+
+@tparam P execution policy type
+@tparam I input iterator type
+@tparam T value type
+@tparam O binary operator type
+@tparam U unary operator type
+
+@param p execution policy
+@param first iterator to the beginning of the range
+@param last iterator to the end of the range
+@param res pointer to the result
+@param bop binary operator to apply to reduce elements
+@param uop unary operator to apply to transform elements
+@param buf pointer to the temporary buffer
+
+This method is equivalent to the parallel execution of the following loop on a GPU:
+
+@code{.cpp}
+while (first != last) {
+  *result = bop(*result, uop(*first++));
+}
+@endcode
+*/
+template<typename P, typename I, typename T, typename O, typename U>
+void cuda_transform_reduce(
+  P&& p, I first, I last, T* res, O bop, U uop, void* buf
+) {
+
+  unsigned count = std::distance(first, last);
+
+  if(count == 0) {
+    return;
+  }
+
+  // reduction loop
+  detail::cuda_reduce_loop(p,
+    cuda_make_load_iterator<T>([=]__device__(auto i){
+      return uop(*(first+i));
+    }),
+    count, res, bop, buf
+  );
+}
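+
+// A sketch summing the squares of N floats (placeholder names; the temporary
+// buffer is sized with the same reduce_bufsz<T> query used above):
+//
+//   tf::cuda_transform_reduce(
+//     policy, gpu_data, gpu_data + N, gpu_res,
+//     [] __device__ (float a, float b) { return a + b; },  // bop: reduce
+//     [] __device__ (float x) { return x * x; },           // uop: transform
+//     buf
+//   );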
+
+// ----------------------------------------------------------------------------
+// transform_uninitialized_reduce
+// ----------------------------------------------------------------------------
+
+/**
+@brief performs asynchronous parallel reduction over a range of transformed items
+       without an initial value
+
+@tparam P execution policy type
+@tparam I input iterator type
+@tparam T value type
+@tparam O binary operator type
+@tparam U unary operator type
+
+@param p execution policy
+@param first iterator to the beginning of the range
+@param last iterator to the end of the range
+@param res pointer to the result
+@param bop binary operator to apply to reduce elements
+@param uop unary operator to apply to transform elements
+@param buf pointer to the temporary buffer
+
+This method is equivalent to the parallel execution of the following loop
+on a GPU:
+
+@code{.cpp}
+*result = uop(*first++);  // no initial value participates in the loop
+while (first != last) {
+  *result = bop(*result, uop(*first++));
+}
+@endcode
+*/
+template<typename P, typename I, typename T, typename O, typename U>
+void cuda_uninitialized_transform_reduce(
+  P&& p, I first, I last, T* res, O bop, U uop, void* buf
+) {
+
+  unsigned count = std::distance(first, last);
+
+  if(count == 0) {
+    return;
+  }
+
+  detail::cuda_uninitialized_reduce_loop(p,
+    cuda_make_load_iterator<T>([=]__device__(auto i){ return uop(*(first+i)); }),
+    count, res, bop, buf
+  );
+}
+
+// ----------------------------------------------------------------------------
+
+//template <typename T, typename C>
+//__device__ void cuda_warp_reduce(
+//  volatile T* shm, size_t N, size_t tid, C op
+//) {
+//  if(tid + 32 < N) shm[tid] = op(shm[tid], shm[tid+32]);
+//  if(tid + 16 < N) shm[tid] = op(shm[tid], shm[tid+16]);
+//  if(tid +  8 < N) shm[tid] = op(shm[tid], shm[tid+8]);
+//  if(tid +  4 < N) shm[tid] = op(shm[tid], shm[tid+4]);
+//  if(tid +  2 < N) shm[tid] = op(shm[tid], shm[tid+2]);
+//  if(tid +  1 < N) shm[tid] = op(shm[tid], shm[tid+1]);
+//}
+//
+//template <typename I, typename T, typename C, bool uninitialized>
+//__global__ void cuda_reduce(I first, size_t N, T* res, C op) {
+//
+//  size_t tid = threadIdx.x;
+//
+//  if(tid >= N) {
+//    return;
+//  }
+//
+//  cudaSharedMemory<T> shared_memory;
+//  T* shm = shared_memory.get();
+//
+//  shm[tid] = *(first+tid);
+//
+//  for(size_t i=tid+blockDim.x; i<N; i+=blockDim.x) {
+//    shm[tid] = op(shm[tid], *(first+i));
+//  }
+//
+//  __syncthreads();
+//
+//  for(size_t s = blockDim.x / 2; s > 32; s >>= 1) {
+//    if(tid < s && tid + s < N) {
+//      shm[tid] = op(shm[tid], shm[tid+s]);
+//    }
+//    __syncthreads();
+//  }
+//
+//  if(tid < 32) {
+//    cuda_warp_reduce(shm, N, tid, op);
+//  }
+//
+//  if(tid == 0) {
+//    if constexpr (uninitialized) {
+//      *res = shm[0];
+//    }
+//    else {
+//      *res = op(*res, shm[0]);
+//    }
+//  }
+//}
+
+
+}  // end of namespace tf -----------------------------------------------------
+
diff --git a/myxpcs/include/taskflow_/cuda/algorithm/scan.hpp b/myxpcs/include/taskflow_/cuda/algorithm/scan.hpp
new file mode 100644
index 0000000..bce0d63
--- /dev/null
+++ b/myxpcs/include/taskflow_/cuda/algorithm/scan.hpp
@@ -0,0 +1,488 @@
+#pragma once
+
+#include "reduce.hpp"
+
+/**
+@file taskflow/cuda/algorithm/scan.hpp
+@brief CUDA scan algorithm include file
+*/
+
+namespace tf::detail {
+
+// ----------------------------------------------------------------------------
+// scan
+// ----------------------------------------------------------------------------
+
+/** @private */
+inline constexpr unsigned cudaScanRecursionThreshold = 8;
+
+/** @private */
+enum class cudaScanType : int {
+  EXCLUSIVE = 1,
+  INCLUSIVE
+};
+
+/** @private */
+template<typename T, unsigned vt = 0, bool is_array = (vt > 0)>
+struct cudaScanResult {
+  T scan;
+  T reduction;
+};
+
+/** @private */
+template<typename T, unsigned vt>
+struct cudaScanResult<T, vt, true> {
+  cudaArray<T, vt> scan;
+  T reduction;
+};
+
+//-----------------------------------------------------------------------------
+
+/** @private */
+template<unsigned nt, typename T>
+struct cudaBlockScan {
+
+  const static unsigned num_warps  = nt / CUDA_WARP_SIZE;
+  const static unsigned num_passes = log2(nt);
+  const static unsigned capacity   = nt + num_warps;
+
+  /** @private */
+  union storage_t {
+    T data[2 * nt];
+    struct { T threads[nt], warps[num_warps]; };
+  };
+
+  // standard scan
+  template<typename op_t>
+  __device__ cudaScanResult<T> operator ()(
+    unsigned tid,
+    T x,
+    storage_t& storage,
+    unsigned count = nt,
+    op_t op = op_t(),
+    T init = T(),
+    cudaScanType type = cudaScanType::EXCLUSIVE
+  ) const;
+
+  // vectorized scan. accepts multiple values per thread and adds in
+  // optional global carry-in
+  template<unsigned vt, typename op_t>
+  __device__ cudaScanResult<T, vt> operator()(
+    unsigned tid,
+    cudaArray<T, vt> x,
+    storage_t& storage,
+    T carry_in = T(),
+    bool use_carry_in = false,
+    unsigned count = nt,
+    op_t op = op_t(),
+    T init = T(),
+    cudaScanType type = cudaScanType::EXCLUSIVE
+  ) const;
+};
+
+// standard scan
+template <unsigned nt, typename T>
+template<typename op_t>
+__device__ cudaScanResult<T> cudaBlockScan<nt, T>::operator () (
+  unsigned tid, T x, storage_t& storage, unsigned count, op_t op,
+  T init, cudaScanType type
+) const {
+
+  unsigned first = 0;
+  storage.data[first + tid] = x;
+  __syncthreads();
+
+  cuda_iterate<num_passes>([&](auto pass) {
+    if(auto offset = 1<<pass; tid >= offset) {
+      x = op(storage.data[first + tid - offset], x);
+    }
+    first = nt - first;
+    storage.data[first + tid] = x;
+    __syncthreads();
+  });
+
+  cudaScanResult<T> result;
+  result.reduction = storage.data[first + count - 1];
+  result.scan = (tid < count) ?
+    (cudaScanType::INCLUSIVE == type ? x :
+      (tid ? storage.data[first + tid - 1] : init)) :
+    result.reduction;
+  __syncthreads();
+
+  return result;
+}
+
+// vectorized scan block
+template <unsigned nt, typename T>
+template<unsigned vt, typename op_t>
+__device__ cudaScanResult<T, vt> cudaBlockScan<nt, T>::operator()(
+  unsigned tid,
+  cudaArray<T, vt> x,
+  storage_t& storage,
+  T carry_in,
+  bool use_carry_in,
+  unsigned count, op_t op,
+  T init,
+  cudaScanType type
+) const {
+
+  // Start with an inclusive scan of the in-range elements.
+  if(count >= nt * vt) {
+    cuda_iterate<vt>([&](auto i) {
+      x[i] = i ? op(x[i], x[i - 1]) : x[i];
+    });
+  } else {
+    cuda_iterate<vt>([&](auto i) {
+      auto index = vt * tid + i;
+      x[i] = i ?
+        ((index < count) ? op(x[i], x[i - 1]) : x[i - 1]) :
+        (x[i] = (index < count) ? x[i] : init);
+    });
+  }
+
+  // Scan the thread-local reductions for a carry-in for each thread.
+  auto result = operator()(
+    tid, x[vt - 1], storage,
+    (count + vt - 1) / vt, op, init, cudaScanType::EXCLUSIVE
+  );
+
+  // Perform the scan downsweep and add both the global carry-in and the
+  // thread carry-in to the values.
+  if(use_carry_in) {
+    result.reduction = op(carry_in, result.reduction);
+    result.scan = tid ? op(carry_in, result.scan) : carry_in;
+  } else {
+    use_carry_in = tid > 0;
+  }
+
+  cudaArray<T, vt> y;
+  cuda_iterate<vt>([&](auto i) {
+    if(cudaScanType::EXCLUSIVE == type) {
+      y[i] = i ? x[i - 1] : result.scan;
+      if(use_carry_in && i > 0) y[i] = op(result.scan, y[i]);
+    } else {
+      y[i] = use_carry_in ? op(x[i], result.scan) : x[i];
+    }
+  });
+
+  return cudaScanResult<T, vt> { y, result.reduction };
+}
+
+/**
+@private
+@brief single-pass scan for small input
+ */
+template <typename P, typename I, typename O, typename C>
+void cuda_single_pass_scan(
+  P&& p,
+  cudaScanType scan_type,
+  I input,
+  unsigned count,
+  O output,
+  C op
+  //reduction_it reduction,
+) {
+
+  using T = typename std::iterator_traits<O>::value_type;
+  using E = std::decay_t<P>;
+
+  // Small input specialization. This is the non-recursive branch.
+  cuda_kernel<<<1, E::nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) {
+
+    using scan_t = cudaBlockScan<E::nt, T>;
+
+    __shared__ union {
+      typename scan_t::storage_t scan;
+      T values[E::nv];
+    } shared;
+
+    auto carry_in = T();
+    for(unsigned cur = 0; cur < count; cur += E::nv) {
+      // Cooperatively load values into register.
+      auto count2 = min(count - cur, E::nv);
+
+      auto x = cuda_mem_to_reg_thread<E::nt, E::vt>(input + cur,
+        tid, count2, shared.values);
+
+      auto result = scan_t()(tid, x, shared.scan,
+        carry_in, cur > 0, count2, op, T(), scan_type);
+
+      // Store the scanned values back to global memory.
+      cuda_reg_to_mem_thread<E::nt, E::vt>(result.scan, tid, count2,
+        output + cur, shared.values);
+
+      // Roll the reduction into carry_in.
+      carry_in = result.reduction;
+    }
+
+    // Store the carry-out to the reduction pointer. This may be a
+    // discard_iterator_t if no reduction is wanted.
+    //if(!tid) *reduction = carry_in;
+  });
+}
+
+/**
+@private
+
+@brief main scan loop
+*/
+template<typename P, typename I, typename O, typename C>
+void cuda_scan_loop(
+  P&& p,
+  cudaScanType scan_type,
+  I input,
+  unsigned count,
+  O output,
+  C op,
+  //reduction_it reduction,
+  void* ptr
+) {
+
+  using E = std::decay_t<P>;
+  using T = typename std::iterator_traits<O>::value_type;
+
+  T* buffer = static_cast<T*>(ptr);
+
+  //launch_t::cta_dim(context).B(count);
+  unsigned B = (count + E::nv - 1) / E::nv;
+
+  if(B > cudaScanRecursionThreshold) {
+
+    //cudaDeviceVector<T> partials(B);
+    //auto buffer = partials.data();
+
+    // upsweep phase
+    cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) {
+
+      __shared__ typename cudaBlockReduce<E::nt, T>::Storage shm;
+
+      // Load the tile's data into register.
+      auto tile = cuda_get_tile(bid, E::nv, count);
+      auto x = cuda_mem_to_reg_strided<E::nt, E::vt>(
+        input + tile.begin, tid, tile.count()
+      );
+
+      // Reduce the thread's values into a scalar.
+      T scalar;
+      cuda_strided_iterate<E::nt, E::vt>(
+        [&] (auto i, auto j) { scalar = i ? op(scalar, x[i]) : x[0]; },
+        tid, tile.count()
+      );
+
+      // Reduce across all threads.
+      auto all_reduce = cudaBlockReduce<E::nt, T>()(
+        tid, scalar, shm, tile.count(), op
+      );
+
+      // Store the final reduction to the partials.
+      if(!tid) {
+        buffer[bid] = all_reduce;
+      }
+    });
+
+    // recursively call scan
+    //cuda_scan_loop(p, cudaScanType::EXCLUSIVE, buffer, B, buffer, op, S);
+    cuda_scan_loop(
+      p, cudaScanType::EXCLUSIVE, buffer, B, buffer, op, buffer+B
+    );
+
+    // downsweep: perform an intra-tile scan and add the scan of the partials
+    // as carry-in
+    cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) {
+
+      using scan_t = cudaBlockScan<E::nt, T>;
+
+      __shared__ union {
+        typename scan_t::storage_t scan;
+        T values[E::nv];
+      } shared;
+
+      // Load a tile to register in thread order.
+      auto tile = cuda_get_tile(bid, E::nv, count);
+      auto x = cuda_mem_to_reg_thread<E::nt, E::vt>(
+        input + tile.begin, tid, tile.count(), shared.values
+      );
+
+      // Scan the array with carry-in from the partials.
+      auto y = scan_t()(tid, x, shared.scan,
+        buffer[bid], bid > 0, tile.count(), op, T(),
+        scan_type).scan;
+
+      // Store the scanned values to the output.
+      cuda_reg_to_mem_thread<E::nt, E::vt>(
+        y, tid, tile.count(), output + tile.begin, shared.values
+      );
+    });
+  }
+  // Small input specialization. This is the non-recursive branch.
+  else {
+    cuda_single_pass_scan(p, scan_type, input, count, output, op);
+  }
+}
+
+}  // namespace tf::detail ----------------------------------------------------
+
+namespace tf {
+
+// Function: scan_bufsz
+template <unsigned NT, unsigned VT>  
+template <typename T>
+unsigned cudaExecutionPolicy<NT, VT>::scan_bufsz(unsigned count) {
+  unsigned B = num_blocks(count);
+  unsigned n = 0;
+  for(auto b=B; b>detail::cudaScanRecursionThreshold; b=num_blocks(b)) {
+    n += b;
+  }
+  return n*sizeof(T);
+}
+
+
+/**
+@brief performs asynchronous inclusive scan over a range of items
+
+@tparam P execution policy type
+@tparam I input iterator
+@tparam O output iterator
+@tparam C binary operator type
+
+@param p execution policy
+@param first iterator to the beginning of the input range
+@param last iterator to the end of the input range
+@param output iterator to the beginning of the output range
+@param op binary operator to apply to scan
+@param buf pointer to the temporary buffer
+
+*/
+template<typename P, typename I, typename O, typename C>
+void cuda_inclusive_scan(
+  P&& p, I first, I last, O output, C op, void* buf
+) {
+
+  unsigned count = std::distance(first, last);
+
+  if(count == 0) {
+    return;
+  }
+
+  // launch the scan loop
+  detail::cuda_scan_loop(
+    p, detail::cudaScanType::INCLUSIVE, first, count, output, op, buf
+  );
+}
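+
+// A usage sketch (not part of the original header). `gpu_in` and `gpu_out` are
+// placeholder device arrays of N ints; cuda_exclusive_scan below is called
+// with the same argument list.
+//
+//   tf::cudaDefaultExecutionPolicy policy(stream);
+//
+//   void* buf;
+//   cudaMalloc(&buf, policy.scan_bufsz<int>(N));
+//
+//   tf::cuda_inclusive_scan(
+//     policy, gpu_in, gpu_in + N, gpu_out,
+//     [] __device__ (int a, int b) { return a + b; },
+//     buf
+//   );
+//
+//   cudaStreamSynchronize(stream);
+//   cudaFree(buf);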
+
+/**
+@brief performs asynchronous inclusive scan over a range of transformed items
+
+@tparam P execution policy type
+@tparam I input iterator
+@tparam O output iterator
+@tparam C binary operator type
+@tparam U unary operator type
+
+@param p execution policy
+@param first iterator to the beginning of the input range
+@param last iterator to the end of the input range
+@param output iterator to the beginning of the output range
+@param bop binary operator to apply to scan
+@param uop unary operator to apply to transform each item before scan
+@param buf pointer to the temporary buffer
+
+*/
+template<typename P, typename I, typename O, typename C, typename U>
+void cuda_transform_inclusive_scan(
+  P&& p, I first, I last, O output, C bop, U uop, void* buf
+) {
+
+  using T = typename std::iterator_traits<O>::value_type;
+
+  unsigned count = std::distance(first, last);
+
+  if(count == 0) {
+    return;
+  }
+
+  // launch the scan loop
+  detail::cuda_scan_loop(
+    p, detail::cudaScanType::INCLUSIVE,
+    cuda_make_load_iterator<T>([=]__device__(auto i){ return uop(*(first+i)); }),
+    count, output, bop, buf
+  );
+}
+
+/**
+@brief performs asynchronous exclusive scan over a range of items
+
+@tparam P execution policy type
+@tparam I input iterator
+@tparam O output iterator
+@tparam C binary operator type
+
+@param p execution policy
+@param first iterator to the beginning of the input range
+@param last iterator to the end of the input range
+@param output iterator to the beginning of the output range
+@param op binary operator to apply to scan
+@param buf pointer to the temporary buffer
+
+*/
+template<typename P, typename I, typename O, typename C>
+void cuda_exclusive_scan(
+  P&& p, I first, I last, O output, C op, void* buf
+) {
+
+  unsigned count = std::distance(first, last);
+
+  if(count == 0) {
+    return;
+  }
+
+  // launch the scan loop
+  detail::cuda_scan_loop(
+    p, detail::cudaScanType::EXCLUSIVE, first, count, output, op, buf
+  );
+}
+
+/**
+@brief performs asynchronous exclusive scan over a range of items
+
+@tparam P execution policy type
+@tparam I input iterator
+@tparam O output iterator
+@tparam C binary operator type
+@tparam U unary operator type
+
+@param p execution policy
+@param first iterator to the beginning of the input range
+@param last iterator to the end of the input range
+@param output iterator to the beginning of the output range
+@param bop binary operator to apply to scan
+@param uop unary operator to apply to transform each item before scan
+@param buf pointer to the temporary buffer
+
+*/
+template<typename P, typename I, typename O, typename C, typename U>
+void cuda_transform_exclusive_scan(
+  P&& p, I first, I last, O output, C bop, U uop, void* buf
+) {
+
+  using T = typename std::iterator_traits<O>::value_type;
+
+  unsigned count = std::distance(first, last);
+
+  if(count == 0) {
+    return;
+  }
+
+  // launch the scan loop
+  detail::cuda_scan_loop(
+    p, detail::cudaScanType::EXCLUSIVE,
+    cuda_make_load_iterator<T>([=]__device__(auto i){ return uop(*(first+i)); }),
+    count, output, bop, buf
+  );
+}
+
+
+}  // end of namespace tf -----------------------------------------------------
+
+
+
diff --git a/myxpcs/include/taskflow_/cuda/algorithm/sort.hpp b/myxpcs/include/taskflow_/cuda/algorithm/sort.hpp
new file mode 100644
index 0000000..3cc01d5
--- /dev/null
+++ b/myxpcs/include/taskflow_/cuda/algorithm/sort.hpp
@@ -0,0 +1,506 @@
+#pragma once
+
+#include "merge.hpp"
+
+/**
+@file taskflow/cuda/algorithm/sort.hpp
+@brief CUDA sort algorithm include file
+*/
+
+namespace tf::detail {
+
+// ----------------------------------------------------------------------------
+// odd-even sort in register
+// ----------------------------------------------------------------------------
+
+/**
+@private
+@brief counts the number of leading zeros starting from the most significant bit
+*/
+constexpr int cuda_clz(int x) {
+  for(int i = 31; i >= 0; --i) {
+    if((1<< i) & x) {
+      return 31 - i;
+    }
+  }
+  return 32;
+}
+
+/**
+@private
+@brief finds log2(x) and optionally round up to the next integer logarithm.
+*/
+constexpr int cuda_find_log2(int x, bool round_up = false) {
+  int a = 31 - cuda_clz(x);
+  if(round_up) {
+    a += !is_pow2(x);
+  }
+  return a;
+}
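+
+// Worked compile-time values for the two helpers above (a sketch):
+//
+//   static_assert(cuda_clz(1) == 31);             // only bit 0 is set
+//   static_assert(cuda_find_log2(8) == 3);        // exact power of two
+//   static_assert(cuda_find_log2(9, true) == 4);  // rounded up: 9 is not a power of two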
+
+/** @private */
+template<typename T, unsigned vt, typename C>
+__device__ auto cuda_odd_even_sort(
+  cudaArray<T, vt> x, C comp, int flags = 0
+) {
+  cuda_iterate<vt>([&](auto I) {
+    #pragma unroll
+    for(auto i = 1 & I; i < vt - 1; i += 2) {
+      if((0 == ((2<< i) & flags)) && comp(x[i + 1], x[i]))
+        cuda_swap(x[i], x[i + 1]);
+    }
+  });
+  return x;
+}
+
+/** @private */
+template<typename K, typename V, unsigned vt, typename C>
+__device__ auto cuda_odd_even_sort(
+  cudaKVArray<K, V, vt> x, C comp, int flags = 0
+) {
+  cuda_iterate<vt>([&](auto I) {
+    #pragma unroll
+    for(auto i = 1 & I; i < vt - 1; i += 2) {
+      if((0 == ((2<< i) & flags)) && comp(x.keys[i + 1], x.keys[i])) {
+        cuda_swap(x.keys[i], x.keys[i + 1]);
+        cuda_swap(x.vals[i], x.vals[i + 1]);
+      }
+    }
+  });
+  return x;
+}
+
+// ----------------------------------------------------------------------------
+// range check
+// ----------------------------------------------------------------------------
+
+/** @private */
+__device__ inline int cuda_out_of_range_flags(int first, int vt, int count) {
+  int out_of_range = min(vt, first + vt - count);
+  int head_flags = 0;
+  if(out_of_range > 0) {
+    const int mask = (1<< vt) - 1;
+    head_flags = mask & (~mask>> out_of_range);
+  }
+  return head_flags;
+}
+
+/** @private */
+__device__ inline auto cuda_compute_merge_sort_frame(
+  unsigned partition, unsigned coop, unsigned spacing
+) {
+
+  unsigned size = spacing * (coop / 2);
+  unsigned start = ~(coop - 1) & partition;
+  unsigned a_begin = spacing * start;
+  unsigned b_begin = spacing * start + size;
+
+  return cudaMergeRange {
+    a_begin,
+    a_begin + size,
+    b_begin,
+    b_begin + size
+  };
+}
+
+/** @private */
+__device__ inline auto cuda_compute_merge_sort_range(
+  unsigned count, unsigned partition, unsigned coop, unsigned spacing
+) {
+
+  auto frame = cuda_compute_merge_sort_frame(partition, coop, spacing);
+
+  return cudaMergeRange {
+    frame.a_begin,
+    min(count, frame.a_end),
+    min(count, frame.b_begin),
+    min(count, frame.b_end)
+  };
+}
+
+/** @private */
+__device__ inline auto cuda_compute_merge_sort_range(
+  unsigned count, unsigned partition, unsigned coop, unsigned spacing,
+  unsigned mp0, unsigned mp1
+) {
+
+  auto range = cuda_compute_merge_sort_range(count, partition, coop, spacing);
+
+  // Locate the diagonal from the start of the A sublist.
+  unsigned diag = spacing * partition - range.a_begin;
+
+  // The end partition of the last cta for each merge operation is computed
+  // and stored as the begin partition for the subsequent merge. That is, it
+  // is the same partition but in the wrong coordinate system, so it is 0 when
+  // it should be listSize. Correct that by checking if this is the last cta
+  // in this merge operation.
+  if(coop - 1 != ((coop - 1) & partition)) {
+    range.a_end = range.a_begin + mp1;
+    range.b_end = min(count, range.b_begin + diag + spacing - mp1);
+  }
+
+  range.a_begin = range.a_begin + mp0;
+  range.b_begin = min(count, range.b_begin + diag - mp0);
+
+  return range;
+}
+
+/** @private */
+template<unsigned nt, unsigned vt, typename K, typename V>
+struct cudaBlockSort {
+
+  static constexpr bool has_values = !std::is_same<V, cudaEmpty>::value;
+  static constexpr unsigned num_passes = log2(nt);
+
+  /** @private */
+  union Storage {
+    K keys[nt * vt + 1];
+    V vals[nt * vt];
+  };
+
+  static_assert(is_pow2(nt), "cudaBlockSort requires pow2 number of threads");
+
+  template<typename C>
+  __device__ auto merge_pass(
+    cudaKVArray<K, V, vt> x,
+    unsigned tid, unsigned count, unsigned pass,
+    C comp, Storage& storage
+  ) const {
+
+    // Divide the CTA's keys into lists.
+    unsigned coop = 2 << pass;
+    auto range = cuda_compute_merge_sort_range(count, tid, coop, vt);
+    unsigned diag = vt * tid - range.a_begin;
+
+    // Store the keys into shared memory for searching.
+    cuda_reg_to_shared_thread<nt, vt>(x.keys, tid, storage.keys);
+
+    // Search for the merge path for this thread within its list.
+    auto mp = cuda_merge_path<cudaMergeBoundType::LOWER>(
+      storage.keys, range, diag, comp
+    );
+
+    // Run a serial merge and return.
+    auto merge = cuda_serial_merge<cudaMergeBoundType::LOWER, vt>(
+      storage.keys, range.partition(mp, diag), comp
+    );
+    x.keys = merge.keys;
+
+    if(has_values) {
+      // Reorder values through shared memory.
+      cuda_reg_to_shared_thread<nt, vt>(x.vals, tid, storage.vals);
+      x.vals = cuda_shared_gather<nt, vt>(storage.vals, merge.indices);
+    }
+
+    return x;
+  }
+
+  template<typename C>
+  __device__ auto block_sort(cudaKVArray<K, V, vt> x,
+    unsigned tid, unsigned count, C comp, Storage& storage
+  ) const {
+
+    // Sort the inputs within each thread. If any threads have fewer than
+    // vt items, use the segmented sort network to prevent out-of-range
+    // elements from contaminating the sort.
+    if(count < nt * vt) {
+      auto head_flags = cuda_out_of_range_flags(vt * tid, vt, count);
+      x = cuda_odd_even_sort(x, comp, head_flags);
+    } else {
+      x = cuda_odd_even_sort(x, comp);
+    }
+
+    // Merge threads starting with a pair until all values are merged.
+    for(unsigned pass = 0; pass < num_passes; ++pass) {
+      x = merge_pass(x, tid, count, pass, comp, storage);
+    }
+
+    return x;
+  }
+};
+
+/** @private */
+template<typename P, typename K, typename C>
+void cuda_merge_sort_partitions(
+  P&& p, K keys, unsigned count,
+  unsigned coop, unsigned spacing, C comp, unsigned* buf
+) {
+
+  // the buffer must hold num_partitions entries (number of tiles + 1)
+  unsigned num_partitions = (count + spacing - 1) / spacing + 1;
+
+  const unsigned nt = 128;
+  const unsigned vt = 1;
+  const unsigned nv = nt * vt;
+
+  unsigned B = (num_partitions + nv - 1) / nv;  // nt = 128, vt = 1
+
+  cuda_kernel<<<B, nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) {
+    auto range = cuda_get_tile(bid, nt * vt, num_partitions);
+    cuda_strided_iterate<nt, vt>([=](auto, auto j) {
+      auto index = j + range.begin;
+      auto range = cuda_compute_merge_sort_range(count, index, coop, spacing);
+      auto diag = min(spacing * index, count) - range.a_begin;
+      buf[index] = cuda_merge_path<cudaMergeBoundType::LOWER>(
+        keys + range.a_begin, range.a_count(),
+        keys + range.b_begin, range.b_count(),
+        diag, comp
+      );
+    }, tid, range.count());
+  });
+}
+
+/** @private */
+template<typename P, typename K_it, typename V_it, typename C>
+void merge_sort_loop(
+  P&& p, K_it keys_input, V_it vals_input, unsigned count, C comp, void* buf
+) {
+
+  using K = typename std::iterator_traits<K_it>::value_type;
+  using V = typename std::iterator_traits<V_it>::value_type;
+  using E = std::decay_t<P>;
+
+  const bool has_values = !std::is_same<V, cudaEmpty>::value;
+
+  unsigned B = (count + E::nv - 1) / E::nv;
+  unsigned R = cuda_find_log2(B, true);
+
+  K* keys_output    {nullptr};
+  V* vals_output    {nullptr};
+  unsigned *mp_data {nullptr};
+
+  if(R) {
+    keys_output = (K*)(buf);
+    if(has_values) {
+      vals_output = (V*)(keys_output + count);
+      mp_data = (unsigned*)(vals_output + count);
+    }
+    else {
+      mp_data = (unsigned*)(keys_output + count);
+    }
+  }
+
+  //cudaDeviceVector<K> keys_temp(R ? count : 0);
+  //auto keys_output = keys_temp.data();
+  ////std::cout << "keys_output = " << keys_temp.size()*sizeof(K) << std::endl;
+
+  //cudaDeviceVector<V> vals_temp((has_values && R) ? count : 0);
+  //auto vals_output = vals_temp.data();
+  //std::cout << "vals_output = " << vals_temp.size()*sizeof(V) << std::endl;
+
+  auto keys_blocksort = (1 & R) ? keys_output : keys_input;
+  auto vals_blocksort = (1 & R) ? vals_output : vals_input;
+
+  //printf("B=%u, R=%u\n", B, R);
+
+  cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) {
+
+    using sort_t = cudaBlockSort<E::nt, E::vt, K, V>;
+
+    __shared__ union {
+      typename sort_t::Storage sort;
+      K keys[E::nv];
+      V vals[E::nv];
+    } shared;
+
+    auto tile = cuda_get_tile(bid, E::nv, count);
+
+    // Load the keys and values.
+    cudaKVArray<K, V, E::vt> unsorted;
+    unsorted.keys = cuda_mem_to_reg_thread<E::nt, E::vt>(
+      keys_input + tile.begin, tid, tile.count(), shared.keys
+    );
+
+    if(has_values) {
+      unsorted.vals = cuda_mem_to_reg_thread<E::nt, E::vt>(
+        vals_input + tile.begin, tid, tile.count(), shared.vals
+      );
+    }
+
+    // Blocksort.
+    auto sorted = sort_t().block_sort(unsorted, tid, tile.count(), comp, shared.sort);
+
+    // Store the keys and values.
+    cuda_reg_to_mem_thread<E::nt, E::vt>(
+      sorted.keys, tid, tile.count(), keys_blocksort + tile.begin, shared.keys
+    );
+
+    if(has_values) {
+      cuda_reg_to_mem_thread<E::nt, E::vt>(
+        sorted.vals, tid, tile.count(), vals_blocksort + tile.begin, shared.vals
+      );
+    }
+  });
+
+  // merge passes
+
+  if(1 & R) {
+    std::swap(keys_input, keys_output);
+    std::swap(vals_input, vals_output);
+  }
+
+  for(unsigned pass = 0; pass < R; ++pass) {
+
+    unsigned coop = 2 << pass;
+
+    cuda_merge_sort_partitions(
+      p, keys_input, count, coop, E::nv, comp, mp_data
+    );
+
+    cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=]__device__(auto tid, auto bid) {
+
+      __shared__ union {
+        K keys[E::nv + 1];
+        unsigned indices[E::nv];
+      } shared;
+
+      auto tile = cuda_get_tile(bid, E::nv, count);
+
+      // Load the range for this CTA and merge the values into register.
+      auto range = cuda_compute_merge_sort_range(
+        count, bid, coop, E::nv, mp_data[bid + 0], mp_data[bid + 1]
+      );
+
+      auto merge = block_merge_from_mem<cudaMergeBoundType::LOWER, E::nt, E::vt>(
+        keys_input, keys_input, range, tid, comp, shared.keys
+      );
+
+      // Store merged values back out.
+      cuda_reg_to_mem_thread<E::nt>(
+        merge.keys, tid, tile.count(), keys_output + tile.begin, shared.keys
+      );
+
+      if(has_values) {
+        // Transpose the indices from thread order to strided order.
+        auto indices = cuda_reg_thread_to_strided<E::nt>(
+          merge.indices, tid, shared.indices
+        );
+
+        // Gather the input values and merge into the output values.
+        cuda_transfer_two_streams_strided<E::nt>(
+          vals_input + range.a_begin, range.a_count(),
+          vals_input + range.b_begin, range.b_count(),
+          indices, tid, vals_output + tile.begin
+        );
+      }
+    });
+
+    std::swap(keys_input, keys_output);
+    std::swap(vals_input, vals_output);
+  }
+}
+
+}  // end of namespace tf::detail ---------------------------------------------
+
+namespace tf {
+
+/**
+@brief queries the buffer size in bytes needed to call sort kernels
+       for the given number of elements
+
+@tparam P execution policy type
+@tparam K key type
+@tparam V value type (default tf::cudaEmpty)
+
+@param count number of keys/values to sort
+
+The function is used to allocate a buffer for calling tf::cuda_sort.
+
+*/
+template <typename P, typename K, typename V = cudaEmpty>
+unsigned cuda_sort_buffer_size(unsigned count) {
+
+  using E = std::decay_t<P>;
+
+  const bool has_values = !std::is_same<V, cudaEmpty>::value;
+
+  unsigned B = (count + E::nv - 1) / E::nv;
+  unsigned R = detail::cuda_find_log2(B, true);
+
+  return R ? (count * sizeof(K) + (has_values ? count*sizeof(V) : 0) +
+             (B+1)*sizeof(unsigned)) : 0;
+}
+
+// ----------------------------------------------------------------------------
+// key-value sort
+// ----------------------------------------------------------------------------
+
+/**
+@brief performs asynchronous key-value sort on a range of items
+
+@tparam P execution policy type
+@tparam K_it key iterator type
+@tparam V_it value iterator type
+@tparam C comparator type
+
+@param p execution policy
+@param k_first iterator to the beginning of the key range
+@param k_last iterator to the end of the key range
+@param v_first iterator to the beginning of the value range
+@param comp binary comparator
+@param buf pointer to the temporary buffer
+
+Sorts key-value elements in <tt>[k_first, k_last)</tt> and
+<tt>[v_first, v_first + (k_last - k_first))</tt> into ascending key order
+using the given comparator @c comp.
+If @c i and @c j are any two valid iterators in <tt>[k_first, k_last)</tt>
+such that @c i precedes @c j, and @c p and @c q are iterators in
+<tt>[v_first, v_first + (k_last - k_first))</tt> corresponding to
+@c i and @c j respectively, then <tt>comp(*j, *i)</tt> evaluates to @c false.
+
+For example, assume:
+  + @c keys are <tt>{1, 4, 2, 8, 5, 7}</tt>
+  + @c values are <tt>{'a', 'b', 'c', 'd', 'e', 'f'}</tt>
+
+After sort:
+  + @c keys are <tt>{1, 2, 4, 5, 7, 8}</tt>
+  + @c values are <tt>{'a', 'c', 'b', 'e', 'f', 'd'}</tt>
+
+*/
+template<typename P, typename K_it, typename V_it, typename C>
+void cuda_sort_by_key(
+  P&& p, K_it k_first, K_it k_last, V_it v_first, C comp, void* buf
+) {
+
+  unsigned N = std::distance(k_first, k_last);
+
+  if(N <= 1) {
+    return;
+  }
+
+  detail::merge_sort_loop(p, k_first, v_first, N, comp, buf);
+}
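+
+/*
+Usage sketch (editorial illustration, not part of the original header;
+`d_keys`, `d_vals`, and `N` are hypothetical device arrays/sizes, and
+std::less requires <functional>):
+
+  cudaStream_t stream;
+  cudaStreamCreate(&stream);
+  tf::cudaDefaultExecutionPolicy policy(stream);
+
+  // temporary buffer required by the merge passes
+  void* buf = nullptr;
+  cudaMalloc(&buf,
+    tf::cuda_sort_buffer_size<tf::cudaDefaultExecutionPolicy, int, float>(N));
+
+  // sort keys in [d_keys, d_keys + N) and permute d_vals accordingly
+  tf::cuda_sort_by_key(policy, d_keys, d_keys + N, d_vals, std::less<int>{}, buf);
+
+  cudaStreamSynchronize(stream);  // the sort is asynchronous
+  cudaFree(buf);
+*/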
+
+// ----------------------------------------------------------------------------
+// key sort
+// ----------------------------------------------------------------------------
+
+/**
+@brief performs asynchronous key-only sort on a range of items
+
+@tparam P execution policy type
+@tparam K_it key iterator type
+@tparam C comparator type
+
+@param p execution policy
+@param k_first iterator to the beginning of the key range
+@param k_last iterator to the end of the key range
+@param comp binary comparator
+@param buf pointer to the temporary buffer
+
+This method is equivalent to tf::cuda_sort_by_key without values.
+
+*/
+template<typename P, typename K_it, typename C>
+void cuda_sort(P&& p, K_it k_first, K_it k_last, C comp, void* buf) {
+  cuda_sort_by_key(p, k_first, k_last, (cudaEmpty*)nullptr, comp, buf);
+}
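+
+/*
+Key-only sketch (editorial illustration; reuses the hypothetical `policy`,
+`d_keys`, and `N` from the example above, with no value type in the buffer
+query):
+
+  void* buf = nullptr;
+  cudaMalloc(&buf, tf::cuda_sort_buffer_size<tf::cudaDefaultExecutionPolicy, int>(N));
+  tf::cuda_sort(policy, d_keys, d_keys + N, std::less<int>{}, buf);
+  cudaStreamSynchronize(policy.stream());
+*/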
+
+}  // end of namespace tf -----------------------------------------------------
+
diff --git a/myxpcs/include/taskflow_/cuda/algorithm/transform.hpp b/myxpcs/include/taskflow_/cuda/algorithm/transform.hpp
new file mode 100644
index 0000000..b1146bd
--- /dev/null
+++ b/myxpcs/include/taskflow_/cuda/algorithm/transform.hpp
@@ -0,0 +1,282 @@
+#pragma once
+
+#include "../cudaflow.hpp"
+
+/**
+@file taskflow/cuda/algorithm/transform.hpp
+@brief cuda parallel-transform algorithms include file
+*/
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// transform
+// ----------------------------------------------------------------------------
+
+namespace detail {
+
+/**
+@private
+*/
+template <size_t nt, size_t vt, typename I, typename O, typename C>
+__global__ void cuda_transform_kernel(I first, unsigned count, O output, C op) {
+  auto tid = threadIdx.x;
+  auto bid = blockIdx.x;
+  auto tile = cuda_get_tile(bid, nt*vt, count);
+  cuda_strided_iterate<nt, vt>(
+    [=]__device__(auto, auto j) {
+      auto offset = j + tile.begin;
+      *(output + offset) = op(*(first+offset));
+    }, 
+    tid, 
+    tile.count()
+  );
+}
+
+/**
+@private
+*/
+template <size_t nt, size_t vt, typename I1, typename I2, typename O, typename C>
+__global__ void cuda_transform_kernel(
+  I1 first1, I2 first2, unsigned count, O output, C op
+) {
+  auto tid = threadIdx.x;
+  auto bid = blockIdx.x;
+  auto tile = cuda_get_tile(bid, nt*vt, count);
+  cuda_strided_iterate<nt, vt>(
+    [=]__device__(auto, auto j) {
+      auto offset = j + tile.begin;
+      *(output + offset) = op(*(first1+offset), *(first2+offset));
+    }, 
+    tid, 
+    tile.count()
+  );
+}
+
+}  // end of namespace detail -------------------------------------------------
+
+// ----------------------------------------------------------------------------
+// CUDA standard algorithms: transform
+// ----------------------------------------------------------------------------
+
+/**
+@brief performs asynchronous parallel transforms over a range of items
+
+@tparam P execution policy type
+@tparam I input iterator type
+@tparam O output iterator type
+@tparam C unary operator type
+
+@param p execution policy
+@param first iterator to the beginning of the range
+@param last iterator to the end of the range
+@param output iterator to the beginning of the output range
+@param op unary operator to apply to transform each item
+
+This method is equivalent to the parallel execution of the following loop on a GPU:
+
+@code{.cpp}
+while (first != last) {
+  *output++ = op(*first++);
+}
+@endcode
+
+*/
+template <typename P, typename I, typename O, typename C>
+void cuda_transform(P&& p, I first, I last, O output, C op) {
+  
+  using E = std::decay_t<P>;
+
+  unsigned count = std::distance(first, last);
+
+  if(count == 0) {
+    return;
+  }
+
+  detail::cuda_transform_kernel<E::nt, E::vt, I, O, C>
+    <<<E::num_blocks(count), E::nt, 0, p.stream()>>> (
+    first, count, output, op
+  );
+}
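+
+/*
+A minimal sketch (editorial illustration; `stream`, `d_in`, `d_out`, and `N`
+are hypothetical, and the device lambda assumes nvcc with --extended-lambda):
+
+  tf::cudaDefaultExecutionPolicy policy(stream);
+  tf::cuda_transform(
+    policy, d_in, d_in + N, d_out,
+    [] __device__ (float x) { return 2.0f * x; }
+  );
+  cudaStreamSynchronize(stream);  // the transform is asynchronous
+*/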
+
+/**
+@brief performs asynchronous parallel transforms over two ranges of items
+
+@tparam P execution policy type
+@tparam I1 first input iterator type
+@tparam I2 second input iterator type
+@tparam O output iterator type
+@tparam C binary operator type
+
+@param p execution policy
+@param first1 iterator to the beginning of the first range
+@param last1 iterator to the end of the first range
+@param first2 iterator to the beginning of the second range
+@param output iterator to the beginning of the output range
+@param op binary operator to apply to transform each pair of items
+
+This method is equivalent to the parallel execution of the following loop on a GPU:
+
+@code{.cpp}
+while (first1 != last1) {
+  *output++ = op(*first1++, *first2++);
+}
+@endcode
+*/
+template <typename P, typename I1, typename I2, typename O, typename C>
+void cuda_transform(
+  P&& p, I1 first1, I1 last1, I2 first2, O output, C op
+) {
+  
+  using E = std::decay_t<P>;
+
+  unsigned count = std::distance(first1, last1);
+
+  if(count == 0) {
+    return;
+  }
+
+  detail::cuda_transform_kernel<E::nt, E::vt, I1, I2, O, C>
+    <<<E::num_blocks(count), E::nt, 0, p.stream()>>> (
+    first1, first2, count, output, op
+  );
+}
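+
+/*
+Two-range sketch (editorial illustration; `policy`, `d_a`, `d_b`, `d_c`, and
+`N` are hypothetical):
+
+  tf::cuda_transform(
+    policy, d_a, d_a + N, d_b, d_c,
+    [] __device__ (float x, float y) { return x + y; }
+  );
+*/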
+
+// ----------------------------------------------------------------------------
+// cudaFlow
+// ----------------------------------------------------------------------------
+
+// Function: transform
+template <typename I, typename O, typename C>
+cudaTask cudaFlow::transform(I first, I last, O output, C c) {
+  
+  using E = cudaDefaultExecutionPolicy;
+
+  unsigned count = std::distance(first, last);
+  
+  // TODO:
+  //if(count == 0) {
+  //  return;
+  //}
+
+  return kernel(
+    E::num_blocks(count), E::nt, 0,
+    detail::cuda_transform_kernel<E::nt, E::vt, I, O, C>,
+    first, count, output, c
+  );
+}
+
+// Function: transform
+template <typename I1, typename I2, typename O, typename C>
+cudaTask cudaFlow::transform(I1 first1, I1 last1, I2 first2, O output, C c) {
+  
+  using E = cudaDefaultExecutionPolicy;
+
+  unsigned count = std::distance(first1, last1);
+  
+  // TODO:
+  //if(count == 0) {
+  //  return;
+  //}
+
+  return kernel(
+    E::num_blocks(count), E::nt, 0,
+    detail::cuda_transform_kernel<E::nt, E::vt, I1, I2, O, C>,
+    first1, first2, count, output, c
+  );
+}
+
+// Function: update transform
+template <typename I, typename O, typename C>
+void cudaFlow::transform(cudaTask task, I first, I last, O output, C c) {
+  
+  using E = cudaDefaultExecutionPolicy;
+
+  unsigned count = std::distance(first, last);
+  
+  // TODO:
+  //if(count == 0) {
+  //  return;
+  //}
+
+  kernel(task,
+    E::num_blocks(count), E::nt, 0,
+    detail::cuda_transform_kernel<E::nt, E::vt, I, O, C>,
+    first, count, output, c
+  );
+}
+
+// Function: update transform
+template <typename I1, typename I2, typename O, typename C>
+void cudaFlow::transform(
+  cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c
+) {
+  using E = cudaDefaultExecutionPolicy;
+
+  unsigned count = std::distance(first1, last1);
+  
+  // TODO:
+  //if(count == 0) {
+  //  return;
+  //}
+
+  kernel(task,
+    E::num_blocks(count), E::nt, 0,
+    detail::cuda_transform_kernel<E::nt, E::vt, I1, I2, O, C>,
+    first1, first2, count, output, c
+  );
+}
+
+// ----------------------------------------------------------------------------
+// cudaFlowCapturer
+// ----------------------------------------------------------------------------
+
+// Function: transform
+template <typename I, typename O, typename C>
+cudaTask cudaFlowCapturer::transform(I first, I last, O output, C op) {
+  return on([=](cudaStream_t stream) mutable {
+    cudaDefaultExecutionPolicy p(stream);
+    cuda_transform(p, first, last, output, op);
+  });
+}
+
+// Function: transform
+template <typename I1, typename I2, typename O, typename C>
+cudaTask cudaFlowCapturer::transform(
+  I1 first1, I1 last1, I2 first2, O output, C op
+) {
+  return on([=](cudaStream_t stream) mutable {
+    cudaDefaultExecutionPolicy p(stream);
+    cuda_transform(p, first1, last1, first2, output, op);
+  });
+}
+
+// Function: transform
+template <typename I, typename O, typename C>
+void cudaFlowCapturer::transform(
+  cudaTask task, I first, I last, O output, C op
+) {
+  on(task, [=] (cudaStream_t stream) mutable {
+    cudaDefaultExecutionPolicy p(stream);
+    cuda_transform(p, first, last, output, op);
+  });
+}
+
+// Function: transform
+template <typename I1, typename I2, typename O, typename C>
+void cudaFlowCapturer::transform(
+  cudaTask task, I1 first1, I1 last1, I2 first2, O output, C op
+) {
+  on(task, [=] (cudaStream_t stream) mutable {
+    cudaDefaultExecutionPolicy p(stream);
+    cuda_transform(p, first1, last1, first2, output, op);
+  });
+}
+
+}  // end of namespace tf -----------------------------------------------------
+
+
+
+
+
+
diff --git a/myxpcs/include/taskflow_/cuda/algorithm/transpose.hpp b/myxpcs/include/taskflow_/cuda/algorithm/transpose.hpp
new file mode 100644
index 0000000..3b02a7f
--- /dev/null
+++ b/myxpcs/include/taskflow_/cuda/algorithm/transpose.hpp
@@ -0,0 +1,41 @@
+#pragma once
+
+#include "../cuda_error.hpp"
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// row-wise matrix transpose
+// ----------------------------------------------------------------------------
+//
+template <typename T>
+__global__ void cuda_transpose(
+  const T* d_in,
+  T* d_out,
+  size_t rows,
+  size_t cols
+) {
+  __shared__ T tile[32][32];
+  size_t x = blockIdx.x * 32 + threadIdx.x;
+  size_t y = blockIdx.y * 32 + threadIdx.y;
+
+  for(size_t i = 0; i < 32; i += 8) {
+    if(x < cols && (y + i) < rows) {
+      tile[threadIdx.y + i][threadIdx.x] = d_in[(y + i) * cols + x];
+    }
+  }
+
+  __syncthreads();
+
+  x = blockIdx.y * 32 + threadIdx.x;
+  y = blockIdx.x * 32 + threadIdx.y;
+
+  for(size_t i = 0; i < 32; i += 8) {
+    if(x < rows && (y + i) < cols) {
+      d_out[(y + i) * rows + x] = tile[threadIdx.x][threadIdx.y + i];
+    }
+  }
+}
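+
+/*
+Launch sketch (editorial illustration; `d_in`, `d_out`, `rows`, `cols`, and
+`stream` are hypothetical). The kernel walks 32x32 tiles with 32x8 thread
+blocks, as the `i += 8` loops above imply:
+
+  dim3 block(32, 8);
+  dim3 grid((cols + 31) / 32, (rows + 31) / 32);
+  tf::cuda_transpose<<<grid, block, 0, stream>>>(d_in, d_out, rows, cols);
+*/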
+
+}  // end of namespace --------------------------------------------------------
+
diff --git a/myxpcs/include/taskflow_/cuda/cuda_capturer.hpp b/myxpcs/include/taskflow_/cuda/cuda_capturer.hpp
new file mode 100644
index 0000000..3b5daee
--- /dev/null
+++ b/myxpcs/include/taskflow_/cuda/cuda_capturer.hpp
@@ -0,0 +1,724 @@
+#pragma once
+
+#include "cuda_task.hpp"
+#include "cuda_optimizer.hpp"
+
+/**
+@file cuda_capturer.hpp
+@brief %cudaFlow capturer include file
+*/
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// class definition: cudaFlowCapturer
+// ----------------------------------------------------------------------------
+
+/**
+@class cudaFlowCapturer
+
+@brief class to create a %cudaFlow graph using stream capture
+
+The usage of tf::cudaFlowCapturer is similar to tf::cudaFlow, except users can
+call the method tf::cudaFlowCapturer::on to capture a sequence of asynchronous
+CUDA operations through the given stream.
+The following example creates a CUDA graph that captures two kernel tasks,
+@c task_1 and @c task_2, where @c task_1 runs before @c task_2.
+
+@code{.cpp}
+taskflow.emplace([](tf::cudaFlowCapturer& capturer){
+
+  // capture my_kernel_1 through the given stream managed by the capturer
+  auto task_1 = capturer.on([&](cudaStream_t stream){
+    my_kernel_1<<<grid_1, block_1, shm_size_1, stream>>>(my_parameters_1);
+  });
+
+  // capture my_kernel_2 through the given stream managed by the capturer
+  auto task_2 = capturer.on([&](cudaStream_t stream){
+    my_kernel_2<<<grid_2, block_2, shm_size_2, stream>>>(my_parameters_2);
+  });
+
+  task_1.precede(task_2);
+});
+@endcode
+
+Similar to tf::cudaFlow, a %cudaFlowCapturer is a task (tf::Task)
+created from tf::Taskflow
+and will be run by @em one worker thread in the executor.
+That is, the callable that describes a %cudaFlowCapturer
+will be executed sequentially.
+Inside a %cudaFlow capturer task, different GPU tasks (tf::cudaTask) may run
+in parallel depending on the selected optimization algorithm.
+By default, we use tf::cudaFlowRoundRobinOptimizer to transform a user-level
+graph into a native CUDA graph.
+
+Please refer to @ref GPUTaskingcudaFlowCapturer for details.
+*/
+class cudaFlowCapturer {
+
+  friend class cudaFlow;
+  friend class Executor;
+
+  // created by user
+  struct External {
+    cudaFlowGraph graph;
+  };
+  
+  // created from cudaFlow
+  struct Internal {
+  };
+
+  using handle_t = std::variant<External, Internal>;
+
+  using Optimizer = std::variant<
+    cudaFlowRoundRobinOptimizer,
+    cudaFlowSequentialOptimizer,
+    cudaFlowLinearOptimizer
+  >;
+
+  public:
+
+    /**
+    @brief constructs a standalone cudaFlowCapturer
+
+    A standalone %cudaFlow capturer does not go through any taskflow and
+    can be run by the caller thread using tf::cudaFlowCapturer::run.
+    */
+    cudaFlowCapturer() = default;
+
+    /**
+    @brief destructs the cudaFlowCapturer
+    */
+    ~cudaFlowCapturer() = default;
+    
+    /**
+    @brief default move constructor
+    */
+    cudaFlowCapturer(cudaFlowCapturer&&) = default;
+    
+    /**
+    @brief default move assignment operator
+    */
+    cudaFlowCapturer& operator = (cudaFlowCapturer&&) = default;
+
+    /**
+    @brief queries the emptiness of the graph
+    */
+    bool empty() const;
+
+    /**
+    @brief queries the number of tasks
+    */
+    size_t num_tasks() const;
+
+    /**
+    @brief clear this %cudaFlow capturer
+    */
+    void clear();
+
+    /**
+    @brief dumps the %cudaFlow graph into a DOT format through an
+           output stream
+    */
+    void dump(std::ostream& os) const;
+
+    /**
+    @brief dumps the native captured graph into a DOT format through 
+           an output stream
+    */
+    void dump_native_graph(std::ostream& os) const;
+
+    // ------------------------------------------------------------------------
+    // basic methods
+    // ------------------------------------------------------------------------
+
+    /**
+    @brief captures a sequence of CUDA operations from the given callable
+
+    @tparam C callable type constructible with @c std::function<void(cudaStream_t)>
+    @param callable a callable to capture CUDA operations with the stream
+
+    This method applies a stream created by the flow to capture
+    a sequence of CUDA operations defined in the callable.
+    */
+    template <typename C, std::enable_if_t<
+      std::is_invocable_r_v<void, C, cudaStream_t>, void>* = nullptr
+    >
+    cudaTask on(C&& callable);
+
+    /**
+    @brief updates a capture task to another sequence of CUDA operations
+
+    The method is similar to cudaFlowCapturer::on but operates
+    on an existing task.
+    */
+    template <typename C, std::enable_if_t<
+      std::is_invocable_r_v<void, C, cudaStream_t>, void>* = nullptr
+    >
+    void on(cudaTask task, C&& callable);
+
+    /**
+    @brief captures a no-operation task
+
+    @return a tf::cudaTask handle
+
+    An empty node performs no operation during execution,
+    but can be used for transitive ordering.
+    For example, a phased execution graph with 2 groups of @c n nodes
+    with a barrier between them can be represented using an empty node
+    and @c 2*n dependency edges,
+    rather than @c n^2 dependency edges with no empty node.
+    */
+    cudaTask noop();
+
+    /**
+    @brief updates a task to a no-operation task
+
+    The method is similar to tf::cudaFlowCapturer::noop but
+    operates on an existing task.
+    */
+    void noop(cudaTask task);
+
+    /**
+    @brief copies data between host and device asynchronously through a stream
+
+    @param dst destination memory address
+    @param src source memory address
+    @param count size in bytes to copy
+
+    The method captures a @c cudaMemcpyAsync operation through an
+    internal stream.
+    */
+    cudaTask memcpy(void* dst, const void* src, size_t count);
+
+    /**
+    @brief updates a capture task to a memcpy operation
+
+    The method is similar to cudaFlowCapturer::memcpy but operates on an
+    existing task.
+    */
+    void memcpy(cudaTask task, void* dst, const void* src, size_t count);
+
+    /**
+    @brief captures a copy task of typed data
+
+    @tparam T element type (non-void)
+
+    @param tgt pointer to the target memory block
+    @param src pointer to the source memory block
+    @param num number of elements to copy
+
+    @return cudaTask handle
+
+    A copy task transfers <tt>num*sizeof(T)</tt> bytes of data from a source location
+    to a target location. The source and target may reside in CPU or GPU memory.
+    */
+    template <typename T,
+      std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
+    >
+    cudaTask copy(T* tgt, const T* src, size_t num);
+
+    /**
+    @brief updates a capture task to a copy operation
+
+    The method is similar to cudaFlowCapturer::copy but operates on
+    an existing task.
+    */
+    template <typename T,
+      std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
+    >
+    void copy(cudaTask task, T* tgt, const T* src, size_t num);
+
+    /**
+    @brief initializes or sets GPU memory to the given value byte by byte
+
+    @param ptr pointer to GPU memory
+    @param v value to set for each byte of the specified memory
+    @param n size in bytes to set
+
+    The method captures a @c cudaMemsetAsync operation through an
+    internal stream to fill the first @c n bytes of the memory area
+    pointed to by @c ptr with the constant byte value @c v.
+    */
+    cudaTask memset(void* ptr, int v, size_t n);
+
+    /**
+    @brief updates a capture task to a memset operation
+
+    The method is similar to cudaFlowCapturer::memset but operates on
+    an existing task.
+    */
+    void memset(cudaTask task, void* ptr, int value, size_t n);
+
+    /**
+    @brief captures a kernel
+
+    @tparam F kernel function type
+    @tparam ArgsT kernel function parameters type
+
+    @param g configured grid
+    @param b configured block
+    @param s configured shared memory size in bytes
+    @param f kernel function
+    @param args arguments to forward to the kernel function by copy
+
+    @return cudaTask handle
+    */
+    template <typename F, typename... ArgsT>
+    cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT&&... args);
+
+    /**
+    @brief updates a capture task to a kernel operation
+
+    The method is similar to cudaFlowCapturer::kernel but operates on
+    an existing task.
+    */
+    template <typename F, typename... ArgsT>
+    void kernel(
+      cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT&&... args
+    );
+
+    // ------------------------------------------------------------------------
+    // generic algorithms
+    // ------------------------------------------------------------------------
+
+    /**
+    @brief captures a kernel that runs the given callable with only one thread
+
+    @tparam C callable type
+
+    @param c callable to run by a single kernel thread
+    */
+    template <typename C>
+    cudaTask single_task(C c);
+
+    /**
+    @brief updates a capture task to a single-threaded kernel
+
+    This method is similar to cudaFlowCapturer::single_task but operates
+    on an existing task.
+    */
+    template <typename C>
+    void single_task(cudaTask task, C c);
+
+    /**
+    @brief captures a kernel that applies a callable to each dereferenced element
+           of the data array
+
+    @tparam I iterator type
+    @tparam C callable type
+
+    @param first iterator to the beginning
+    @param last iterator to the end
+    @param callable a callable object to apply to the dereferenced iterator
+
+    @return cudaTask handle
+
+    This method is equivalent to the parallel execution of the following loop on a GPU:
+
+    @code{.cpp}
+    for(auto itr = first; itr != last; ++itr) {
+      callable(*itr);
+    }
+    @endcode
+    */
+    template <typename I, typename C>
+    cudaTask for_each(I first, I last, C callable);
+
+    /**
+    @brief updates a capture task to a for-each kernel task
+
+    This method is similar to cudaFlowCapturer::for_each but operates
+    on an existing task.
+    */
+    template <typename I, typename C>
+    void for_each(cudaTask task, I first, I last, C callable);
+
+    /**
+    @brief captures a kernel that applies a callable to each index in the range
+           with the step size
+
+    @tparam I index type
+    @tparam C callable type
+
+    @param first beginning index
+    @param last ending index (exclusive)
+    @param step step size
+    @param callable the callable to apply to each index in the range
+
+    @return cudaTask handle
+
+    This method is equivalent to the parallel execution of the following loop on a GPU:
+
+    @code{.cpp}
+    // step is positive [first, last)
+    for(auto i=first; i<last; i+=step) {
+      callable(i);
+    }
+
+    // step is negative [first, last)
+    for(auto i=first; i>last; i+=step) {
+      callable(i);
+    }
+    @endcode
+    */
+    template <typename I, typename C>
+    cudaTask for_each_index(I first, I last, I step, C callable);
+
+    /**
+    @brief updates a capture task to a for-each-index kernel task
+
+    This method is similar to cudaFlowCapturer::for_each_index but operates
+    on an existing task.
+    */
+    template <typename I, typename C>
+    void for_each_index(
+      cudaTask task, I first, I last, I step, C callable
+    );
+
+    /**
+    @brief captures a kernel that transforms an input range to an output range
+
+    @tparam I input iterator type
+    @tparam O output iterator type
+    @tparam C unary operator type
+
+    @param first iterator to the beginning of the input range
+    @param last iterator to the end of the input range
+    @param output iterator to the beginning of the output range
+    @param op unary operator to apply to transform each item in the range
+
+    @return cudaTask handle
+
+    This method is equivalent to the parallel execution of the following loop on a GPU:
+
+    @code{.cpp}
+    while (first != last) {
+      *output++ = op(*first++);
+    }
+    @endcode
+    */
+    template <typename I, typename O, typename C>
+    cudaTask transform(I first, I last, O output, C op);
+
+    /**
+    @brief updates a capture task to a transform kernel task
+
+    This method is similar to cudaFlowCapturer::transform but operates
+    on an existing task.
+    */
+    template <typename I, typename O, typename C>
+    void transform(cudaTask task, I first, I last, O output, C op);
+
+    /**
+    @brief captures a kernel that transforms two input ranges to an output range
+
+    @tparam I1 first input iterator type
+    @tparam I2 second input iterator type
+    @tparam O output iterator type
+    @tparam C binary operator type
+
+    @param first1 iterator to the beginning of the input range
+    @param last1 iterator to the end of the input range
+    @param first2 iterator to the beginning of the second input range
+    @param output iterator to the beginning of the output range
+    @param op binary operator to apply to transform each pair of items in the
+              two input ranges
+
+    @return cudaTask handle
+
+    This method is equivalent to the parallel execution of the following loop on a GPU:
+
+    @code{.cpp}
+    while (first1 != last1) {
+      *output++ = op(*first1++, *first2++);
+    }
+    @endcode
+    */
+    template <typename I1, typename I2, typename O, typename C>
+    cudaTask transform(I1 first1, I1 last1, I2 first2, O output, C op);
+
+    /**
+    @brief updates a capture task to a transform kernel task
+
+    This method is similar to cudaFlowCapturer::transform but operates
+    on an existing task.
+    */
+    template <typename I1, typename I2, typename O, typename C>
+    void transform(
+      cudaTask task, I1 first1, I1 last1, I2 first2, O output, C op
+    );
+
+    // ------------------------------------------------------------------------
+    // Capturing methods
+    // ------------------------------------------------------------------------
+    
+    /**
+    @brief selects a different optimization algorithm
+
+    @tparam OPT optimizer type
+    @tparam ArgsT arguments types
+
+    @param args arguments to forward to construct the optimizer
+
+    @return a reference to the optimizer
+
+    We currently support the following optimization algorithms to capture
+    a user-described %cudaFlow:
+      + tf::cudaFlowSequentialOptimizer
+      + tf::cudaFlowRoundRobinOptimizer
+      + tf::cudaFlowLinearOptimizer
+
+    By default, tf::cudaFlowCapturer uses the round-robin optimization
+    algorithm with four streams to transform a user-level graph into
+    a native CUDA graph.
+    */
+    template <typename OPT, typename... ArgsT>
+    OPT& make_optimizer(ArgsT&&... args);
+    
+    /**
+    @brief captures the cudaFlow and turns it into a CUDA Graph
+    */
+    cudaGraph_t capture();
+
+    // ------------------------------------------------------------------------
+    // offload methods
+    // ------------------------------------------------------------------------
+
+    /**
+    @brief offloads the %cudaFlowCapturer onto a GPU asynchronously via a stream
+
+    @param stream stream for performing this operation
+
+    Offloads the present %cudaFlowCapturer onto a GPU asynchronously via
+    the given stream.
+
+    An offloaded %cudaFlowCapturer forces the underlying graph to be instantiated.
+    After the instantiation, you should not modify the graph topology,
+    but you may still update node parameters.
+    */
+    void run(cudaStream_t stream);
+    
+    /**
+    @brief acquires a reference to the underlying CUDA graph
+    */
+    cudaGraph_t native_graph();
+
+    /**
+    @brief acquires a reference to the underlying CUDA graph executable
+    */
+    cudaGraphExec_t native_executable();
+
+  private:
+
+    cudaFlowGraph _cfg;
+
+    Optimizer _optimizer;
+
+    cudaGraphExec _exe {nullptr};
+};
+
+// Function: empty
+inline bool cudaFlowCapturer::empty() const {
+  return _cfg.empty();
+}
+
+// Function: num_tasks
+inline size_t cudaFlowCapturer::num_tasks() const {
+  return _cfg._nodes.size();
+}
+
+// Procedure: clear
+inline void cudaFlowCapturer::clear() {
+  _exe.clear();
+  _cfg.clear();
+}
+
+// Procedure: dump
+inline void cudaFlowCapturer::dump(std::ostream& os) const {
+  _cfg.dump(os, nullptr, "");
+}
+
+// Procedure: dump_native_graph
+inline void cudaFlowCapturer::dump_native_graph(std::ostream& os) const {
+  cuda_dump_graph(os, _cfg._native_handle);
+}
+
+// Function: on
+template <typename C, std::enable_if_t<
+  std::is_invocable_r_v<void, C, cudaStream_t>, void>*
+>
+cudaTask cudaFlowCapturer::on(C&& callable) {
+  auto node = _cfg.emplace_back(_cfg,
+    std::in_place_type_t<cudaFlowNode::Capture>{}, std::forward<C>(callable)
+  );
+  return cudaTask(node);
+}
+
+// Function: noop
+inline cudaTask cudaFlowCapturer::noop() {
+  return on([](cudaStream_t){});
+}
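+
+/*
+Barrier sketch (editorial illustration; `capturer`, `producers`, and
+`consumers` are hypothetical): an empty node joins two task groups with
+2*n edges instead of n^2 direct edges.
+
+  auto barrier = capturer.noop();
+  for(auto& t : producers) { t.precede(barrier); }
+  for(auto& t : consumers) { barrier.precede(t); }
+*/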
+
+// Function: noop
+inline void cudaFlowCapturer::noop(cudaTask task) {
+  on(task, [](cudaStream_t){});
+}
+
+// Function: memcpy
+inline cudaTask cudaFlowCapturer::memcpy(
+  void* dst, const void* src, size_t count
+) {
+  return on([dst, src, count] (cudaStream_t stream) mutable {
+    TF_CHECK_CUDA(
+      cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream),
+      "failed to capture memcpy"
+    );
+  });
+}
+
+// Function: copy
+template <typename T, std::enable_if_t<!std::is_same_v<T, void>, void>*>
+cudaTask cudaFlowCapturer::copy(T* tgt, const T* src, size_t num) {
+  return on([tgt, src, num] (cudaStream_t stream) mutable {
+    TF_CHECK_CUDA(
+      cudaMemcpyAsync(tgt, src, sizeof(T)*num, cudaMemcpyDefault, stream),
+      "failed to capture copy"
+    );
+  });
+}
+
+// Function: memset
+inline cudaTask cudaFlowCapturer::memset(void* ptr, int v, size_t n) {
+  return on([ptr, v, n] (cudaStream_t stream) mutable {
+    TF_CHECK_CUDA(
+      cudaMemsetAsync(ptr, v, n, stream), "failed to capture memset"
+    );
+  });
+}
+
+// Function: kernel
+template <typename F, typename... ArgsT>
+cudaTask cudaFlowCapturer::kernel(
+  dim3 g, dim3 b, size_t s, F f, ArgsT&&... args
+) {
+  return on([g, b, s, f, args...] (cudaStream_t stream) mutable {
+    f<<<g, b, s, stream>>>(args...);
+  });
+}
+
+// Function: capture
+inline cudaGraph_t cudaFlowCapturer::capture() {
+  return std::visit(
+    [this](auto&& opt){ return opt._optimize(_cfg); }, _optimizer
+  );
+}
+
+// Procedure: run
+inline void cudaFlowCapturer::run(cudaStream_t stream) {
+
+  // If the topology got changed, we need to destroy the executable
+  // and create a new one
+  if(_cfg._state & cudaFlowGraph::CHANGED) {
+    _cfg._native_handle.reset(capture());
+    _exe.instantiate(_cfg._native_handle);
+  }
+  // if the graph is just updated (i.e., topology does not change),
+  // we can skip part of the optimization and just update the executable
+  // with the new captured graph
+  else if(_cfg._state & cudaFlowGraph::UPDATED) {
+    // TODO: skip part of the optimization (e.g., levelization)
+    _cfg._native_handle.reset(capture());
+    if(_exe.update(_cfg._native_handle) != cudaGraphExecUpdateSuccess) {
+      _exe.instantiate(_cfg._native_handle);
+    }
+  }
+
+  // run the executable (should exist)
+  _exe.launch(stream);
+
+  _cfg._state = cudaFlowGraph::OFFLOADED;
+}
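+
+/*
+Standalone sketch (editorial illustration; `my_kernel`, `grid`, `block`, and
+`args` are hypothetical): build a capturer directly, offload it through a
+stream, and wait for completion.
+
+  tf::cudaFlowCapturer capturer;
+  capturer.on([&](cudaStream_t s){
+    my_kernel<<<grid, block, 0, s>>>(args);
+  });
+
+  cudaStream_t stream;
+  cudaStreamCreate(&stream);
+  capturer.run(stream);           // instantiates and launches the CUDA graph
+  cudaStreamSynchronize(stream);  // run is asynchronous
+*/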
+
+// Function: native_graph
+inline cudaGraph_t cudaFlowCapturer::native_graph() {
+  return _cfg._native_handle;
+}
+
+// Function: native_executable
+inline cudaGraphExec_t cudaFlowCapturer::native_executable() {
+  return _exe;
+}
+
+// Function: on
+template <typename C, std::enable_if_t<
+  std::is_invocable_r_v<void, C, cudaStream_t>, void>*
+>
+void cudaFlowCapturer::on(cudaTask task, C&& callable) {
+
+  if(task.type() != cudaTaskType::CAPTURE) {
+    TF_THROW("invalid cudaTask type (must be CAPTURE)");
+  }
+
+  _cfg._state |= cudaFlowGraph::UPDATED;
+
+  std::get_if<cudaFlowNode::Capture>(&task._node->_handle)->work =
+    std::forward<C>(callable);
+}
+
+// Function: memcpy
+inline void cudaFlowCapturer::memcpy(
+  cudaTask task, void* dst, const void* src, size_t count
+) {
+  on(task, [dst, src, count](cudaStream_t stream) mutable {
+    TF_CHECK_CUDA(
+      cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream),
+      "failed to capture memcpy"
+    );
+  });
+}
+
+// Function: copy
+template <typename T,
+  std::enable_if_t<!std::is_same_v<T, void>, void>*
+>
+void cudaFlowCapturer::copy(
+  cudaTask task, T* tgt, const T* src, size_t num
+) {
+  on(task, [tgt, src, num] (cudaStream_t stream) mutable {
+    TF_CHECK_CUDA(
+      cudaMemcpyAsync(tgt, src, sizeof(T)*num, cudaMemcpyDefault, stream),
+      "failed to capture copy"
+    );
+  });
+}
+
+// Function: memset
+inline void cudaFlowCapturer::memset(
+  cudaTask task, void* ptr, int v, size_t n
+) {
+  on(task, [ptr, v, n] (cudaStream_t stream) mutable {
+    TF_CHECK_CUDA(
+      cudaMemsetAsync(ptr, v, n, stream), "failed to capture memset"
+    );
+  });
+}
+
+// Function: kernel
+template <typename F, typename... ArgsT>
+void cudaFlowCapturer::kernel(
+  cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT&&... args
+) {
+  on(task, [g, b, s, f, args...] (cudaStream_t stream) mutable {
+    f<<<g, b, s, stream>>>(args...);
+  });
+}
+
+// Function: make_optimizer
+template <typename OPT, typename ...ArgsT>
+OPT& cudaFlowCapturer::make_optimizer(ArgsT&&... args) {
+  return _optimizer.emplace<OPT>(std::forward<ArgsT>(args)...);
+}
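+
+/*
+Optimizer selection sketch (editorial illustration): switch a capturer from
+the default round-robin optimizer to the linear one before capturing.
+
+  tf::cudaFlowCapturer capturer;
+  capturer.make_optimizer<tf::cudaFlowLinearOptimizer>();
+*/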
+
+}  // end of namespace tf -----------------------------------------------------
+
diff --git a/myxpcs/include/taskflow_/cuda/cuda_device.hpp b/myxpcs/include/taskflow_/cuda/cuda_device.hpp
new file mode 100644
index 0000000..016b2a6
--- /dev/null
+++ b/myxpcs/include/taskflow_/cuda/cuda_device.hpp
@@ -0,0 +1,342 @@
+#pragma once
+
+#include "cuda_error.hpp"
+
+/**
+@file cuda_device.hpp
+@brief CUDA device utilities include file
+*/
+
+namespace tf {
+
+/**
+@brief queries the number of available devices
+*/
+inline size_t cuda_get_num_devices() {
+  int N = 0;
+  TF_CHECK_CUDA(cudaGetDeviceCount(&N), "failed to get device count");
+  return static_cast<size_t>(N);
+}
+
+/**
+@brief gets the current device associated with the caller thread
+*/
+inline int cuda_get_device() {
+  int id;
+  TF_CHECK_CUDA(cudaGetDevice(&id), "failed to get current device id");
+  return id;
+}
+
+/**
+@brief switches to a given device context
+*/
+inline void cuda_set_device(int id) {
+  TF_CHECK_CUDA(cudaSetDevice(id), "failed to switch to device ", id);
+}
+
+/**
+@brief obtains the device property
+*/
+inline void cuda_get_device_property(int i, cudaDeviceProp& p) {
+  TF_CHECK_CUDA(
+    cudaGetDeviceProperties(&p, i), "failed to get property of device ", i
+  );
+}
+
+/**
+@brief obtains the device property
+*/
+inline cudaDeviceProp cuda_get_device_property(int i) {
+  cudaDeviceProp p;
+  TF_CHECK_CUDA(
+    cudaGetDeviceProperties(&p, i), "failed to get property of device ", i
+  );
+  return p;
+}
+
+/**
+@brief dumps the device property
+*/
+inline void cuda_dump_device_property(std::ostream& os, const cudaDeviceProp& p) {
+
+  os << "Major revision number:         " << p.major << '\n'
+     << "Minor revision number:         " << p.minor << '\n'
+     << "Name:                          " << p.name  << '\n'
+     << "Total global memory:           " << p.totalGlobalMem << '\n'
+     << "Total shared memory per block: " << p.sharedMemPerBlock << '\n'
+     << "Total registers per block:     " << p.regsPerBlock << '\n'
+     << "Warp size:                     " << p.warpSize << '\n'
+     << "Maximum memory pitch:          " << p.memPitch << '\n'
+     << "Maximum threads per block:     " << p.maxThreadsPerBlock << '\n';
+
+  os << "Maximum dimension of block:    ";
+  for (int i = 0; i < 3; ++i) {
+    if(i) os << 'x';
+    os << p.maxThreadsDim[i];
+  }
+  os << '\n';
+
+  os << "Maximum dimenstion of grid:    ";
+  for (int i = 0; i < 3; ++i) {
+    if(i) os << 'x';
+    os << p.maxGridSize[i];;
+  }
+  os << '\n';
+
+  os << "Clock rate:                    " << p.clockRate << '\n'
+     << "Total constant memory:         " << p.totalConstMem << '\n'
+     << "Texture alignment:             " << p.textureAlignment << '\n'
+     << "Concurrent copy and execution: " << p.deviceOverlap << '\n'
+     << "Number of multiprocessors:     " << p.multiProcessorCount << '\n'
+     << "Kernel execution timeout:      " << p.kernelExecTimeoutEnabled << '\n'
+     << "GPU sharing Host Memory:       " << p.integrated << '\n'
+     << "Host page-locked mem mapping:  " << p.canMapHostMemory << '\n'
+     << "Alignment for Surfaces:        " << p.surfaceAlignment << '\n'
+     << "Device has ECC support:        " << p.ECCEnabled << '\n'
+     << "Unified Addressing (UVA):      " << p.unifiedAddressing << '\n';
+}
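+
+/*
+Query sketch (editorial illustration): dump the properties of every visible
+device to std::cout.
+
+  for(int d = 0; d < static_cast<int>(tf::cuda_get_num_devices()); ++d) {
+    tf::cuda_dump_device_property(std::cout, tf::cuda_get_device_property(d));
+  }
+*/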
+
+/**
+@brief queries the maximum threads per block on a device
+*/
+inline size_t cuda_get_device_max_threads_per_block(int d) {
+  int threads = 0;
+  TF_CHECK_CUDA(
+    cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, d),
+    "failed to query the maximum threads per block on device ", d
+  )
+  return threads;
+}
+
+/**
+@brief queries the maximum x-dimension per block on a device
+*/
+inline size_t cuda_get_device_max_x_dim_per_block(int d) {
+  int dim = 0;
+  TF_CHECK_CUDA(
+    cudaDeviceGetAttribute(&dim, cudaDevAttrMaxBlockDimX, d),
+    "failed to query the maximum x-dimension per block on device ", d
+  )
+  return dim;
+}
+
+/**
+@brief queries the maximum y-dimension per block on a device
+*/
+inline size_t cuda_get_device_max_y_dim_per_block(int d) {
+  int dim = 0;
+  TF_CHECK_CUDA(
+    cudaDeviceGetAttribute(&dim, cudaDevAttrMaxBlockDimY, d),
+    "failed to query the maximum y-dimension per block on device ", d
+  )
+  return dim;
+}
+
+/**
+@brief queries the maximum z-dimension per block on a device
+*/
+inline size_t cuda_get_device_max_z_dim_per_block(int d) {
+  int dim = 0;
+  TF_CHECK_CUDA(
+    cudaDeviceGetAttribute(&dim, cudaDevAttrMaxBlockDimZ, d),
+    "failed to query the maximum z-dimension per block on device ", d
+  )
+  return dim;
+}
+
+/**
+@brief queries the maximum x-dimension per grid on a device
+*/
+inline size_t cuda_get_device_max_x_dim_per_grid(int d) {
+  int dim = 0;
+  TF_CHECK_CUDA(
+    cudaDeviceGetAttribute(&dim, cudaDevAttrMaxGridDimX, d),
+    "failed to query the maximum x-dimension per grid on device ", d
+  )
+  return dim;
+}
+
+/**
+@brief queries the maximum y-dimension per grid on a device
+*/
+inline size_t cuda_get_device_max_y_dim_per_grid(int d) {
+  int dim = 0;
+  TF_CHECK_CUDA(
+    cudaDeviceGetAttribute(&dim, cudaDevAttrMaxGridDimY, d),
+    "failed to query the maximum y-dimension per grid on device ", d
+  )
+  return dim;
+}
+
+/**
+@brief queries the maximum z-dimension per grid on a device
+*/
+inline size_t cuda_get_device_max_z_dim_per_grid(int d) {
+  int dim = 0;
+  TF_CHECK_CUDA(
+    cudaDeviceGetAttribute(&dim, cudaDevAttrMaxGridDimZ, d),
+    "failed to query the maximum z-dimension per grid on device ", d
+  )
+  return dim;
+}
+
+/**
+@brief queries the maximum shared memory size in bytes per block on a device
+*/
+inline size_t cuda_get_device_max_shm_per_block(int d) {
+  int num = 0;
+  TF_CHECK_CUDA(
+    cudaDeviceGetAttribute(&num, cudaDevAttrMaxSharedMemoryPerBlock, d),
+    "failed to query the maximum shared memory per block on device ", d
+  )
+  return num;
+}
+
+/**
+@brief queries the warp size on a device
+*/
+inline size_t cuda_get_device_warp_size(int d) {
+  int num = 0;
+  TF_CHECK_CUDA(
+    cudaDeviceGetAttribute(&num, cudaDevAttrWarpSize, d),
+    "failed to query the warp size per block on device ", d
+  )
+  return num;
+}
+
+/**
+@brief queries the major number of compute capability of a device
+*/
+inline int cuda_get_device_compute_capability_major(int d) {
+  int num = 0;
+  TF_CHECK_CUDA(
+    cudaDeviceGetAttribute(&num, cudaDevAttrComputeCapabilityMajor, d),
+    "failed to query the major number of compute capability of device ", d
+  )
+  return num;
+}
+
+/**
+@brief queries the minor number of compute capability of a device
+*/
+inline int cuda_get_device_compute_capability_minor(int d) {
+  int num = 0;
+  TF_CHECK_CUDA(
+    cudaDeviceGetAttribute(&num, cudaDevAttrComputeCapabilityMinor, d),
+    "failed to query the minor number of compute capability of device ", d
+  )
+  return num;
+}
+
+/**
+@brief queries if the device supports unified addressing
+*/
+inline bool cuda_get_device_unified_addressing(int d) {
+  int num = 0;
+  TF_CHECK_CUDA(
+    cudaDeviceGetAttribute(&num, cudaDevAttrUnifiedAddressing, d),
+    "failed to query unified addressing status on device ", d
+  )
+  return num;
+}
+
+// ----------------------------------------------------------------------------
+// CUDA Version
+// ----------------------------------------------------------------------------
+
+/**
+@brief queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver
+*/
+inline int cuda_get_driver_version() {
+  int num = 0;
+  TF_CHECK_CUDA(
+    cudaDriverGetVersion(&num),
+    "failed to query the latest cuda version supported by the driver"
+  );
+  return num;
+}
+
+/**
+@brief queries the CUDA Runtime version (1000 * major + 10 * minor)
+*/
+inline int cuda_get_runtime_version() {
+  int num = 0;
+  TF_CHECK_CUDA(
+    cudaRuntimeGetVersion(&num), "failed to query cuda runtime version"
+  );
+  return num;
+}
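+
+/*
+Decoding sketch (editorial illustration): both helpers encode the version as
+1000*major + 10*minor, e.g. 11040 decodes to CUDA 11.4.
+
+  int v     = tf::cuda_get_runtime_version();
+  int major = v / 1000;            // 11
+  int minor = (v % 1000) / 10;     // 4
+*/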
+
+// ----------------------------------------------------------------------------
+// cudaScopedDevice
+// ----------------------------------------------------------------------------
+
+/** @class cudaScopedDevice
+
+@brief class to create an RAII-styled context switch
+
+Sample usage:
+
+@code{.cpp}
+{
+  tf::cudaScopedDevice device(1);  // switch to the device context 1
+
+  // create a stream under device context 1
+  cudaStream_t stream;
+  cudaStreamCreate(&stream);
+
+}  // leaving the scope and goes back to the previous device context
+@endcode
+
+%cudaScopedDevice is neither movable nor copyable.
+*/
+class cudaScopedDevice {
+
+  public:
+
+    /**
+    @brief constructs a RAII-styled device switcher
+
+    @param device device context to scope in the guard
+    */
+    explicit cudaScopedDevice(int device);
+
+    /**
+    @brief destructs the guard and switches back to the previous device context
+    */
+    ~cudaScopedDevice();
+
+  private:
+
+    cudaScopedDevice() = delete;
+    cudaScopedDevice(const cudaScopedDevice&) = delete;
+    cudaScopedDevice(cudaScopedDevice&&) = delete;
+
+    int _p;
+};
+
+// Constructor
+inline cudaScopedDevice::cudaScopedDevice(int dev) {
+  TF_CHECK_CUDA(cudaGetDevice(&_p), "failed to get current device scope");
+  if(_p == dev) {
+    _p = -1;
+  }
+  else {
+    TF_CHECK_CUDA(cudaSetDevice(dev), "failed to scope on device ", dev);
+  }
+}
+
+// Destructor
+inline cudaScopedDevice::~cudaScopedDevice() {
+  if(_p != -1) {
+    cudaSetDevice(_p);
+    //TF_CHECK_CUDA(cudaSetDevice(_p), "failed to scope back to device ", _p);
+  }
+}
+
+}  // end of namespace tf -----------------------------------------------------
+
+
+
+
+
diff --git a/myxpcs/include/taskflow_/cuda/cuda_error.hpp b/myxpcs/include/taskflow_/cuda/cuda_error.hpp
new file mode 100644
index 0000000..c38e132
--- /dev/null
+++ b/myxpcs/include/taskflow_/cuda/cuda_error.hpp
@@ -0,0 +1,26 @@
+#pragma once
+
+#include <cuda.h>
+#include <iostream>
+#include <sstream>
+#include <exception>
+
+#include "../utility/stream.hpp"
+
+#define TF_CUDA_EXPAND( x ) x
+#define TF_CUDA_REMOVE_FIRST_HELPER(N, ...) __VA_ARGS__
+#define TF_CUDA_REMOVE_FIRST(...) TF_CUDA_EXPAND(TF_CUDA_REMOVE_FIRST_HELPER(__VA_ARGS__))
+#define TF_CUDA_GET_FIRST_HELPER(N, ...) N
+#define TF_CUDA_GET_FIRST(...) TF_CUDA_EXPAND(TF_CUDA_GET_FIRST_HELPER(__VA_ARGS__))
+
+#define TF_CHECK_CUDA(...)                                       \
+if(TF_CUDA_GET_FIRST(__VA_ARGS__) != cudaSuccess) {              \
+  std::ostringstream oss;                                        \
+  auto __ev__ = TF_CUDA_GET_FIRST(__VA_ARGS__);                  \
+  oss << "[" << __FILE__ << ":" << __LINE__ << "] "              \
+      << (cudaGetErrorString(__ev__)) << " ("                    \
+      << (cudaGetErrorName(__ev__)) << ") - ";                   \
+  tf::ostreamize(oss, TF_CUDA_REMOVE_FIRST(__VA_ARGS__));        \
+  throw std::runtime_error(oss.str());                           \
+}
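+
+/*
+Usage sketch (editorial illustration; `bytes` is hypothetical): the first
+argument is the CUDA call to check, the remaining arguments are streamed into
+the exception message on failure.
+
+  void* ptr = nullptr;
+  TF_CHECK_CUDA(cudaMalloc(&ptr, bytes), "failed to allocate ", bytes, " bytes");
+*/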
+
diff --git a/myxpcs/include/taskflow_/cuda/cuda_execution_policy.hpp b/myxpcs/include/taskflow_/cuda/cuda_execution_policy.hpp
new file mode 100644
index 0000000..ae90d98
--- /dev/null
+++ b/myxpcs/include/taskflow_/cuda/cuda_execution_policy.hpp
@@ -0,0 +1,155 @@
+#pragma once
+
+#include "cuda_error.hpp"
+
+/**
+@file cuda_execution_policy.hpp
+@brief CUDA execution policy include file
+*/
+
+namespace tf {
+
+/**
+@class cudaExecutionPolicy
+
+@brief class to define execution policy for CUDA standard algorithms
+
+@tparam NT number of threads per block
+@tparam VT number of work units per thread
+
+An execution policy configures the kernel execution parameters of the CUDA
+standard algorithms.
+The first template argument, @c NT, is the number of threads per block and
+must be a power of two.
+The second template argument, @c VT, is the number of work units per thread
+and is recommended to be an odd number to avoid bank conflicts.
+
+Details can be referred to @ref CUDASTDExecutionPolicy.
+*/
+template<unsigned NT, unsigned VT>
+class cudaExecutionPolicy {
+
+  static_assert(is_pow2(NT), "max # threads per block must be a power of two");
+
+  public:
+
+  /** @brief static constant for getting the number of threads per block */
+  const static unsigned nt = NT;
+
+  /** @brief static constant for getting the number of work units per thread */
+  const static unsigned vt = VT;
+
+  /** @brief static constant for getting the number of elements to process per block */
+  const static unsigned nv = NT*VT;
+
+  /**
+  @brief constructs an execution policy object with default stream
+   */
+  cudaExecutionPolicy() = default;
+
+  /**
+  @brief constructs an execution policy object with the given stream
+   */
+  explicit cudaExecutionPolicy(cudaStream_t s) : _stream{s} {}
+  
+  /**
+  @brief queries the associated stream
+   */
+  cudaStream_t stream() noexcept { return _stream; }
+
+  /**
+  @brief assigns a stream
+   */
+  void stream(cudaStream_t stream) noexcept { _stream = stream; }
+  
+  /**
+  @brief queries the number of blocks to accommodate N elements
+  */
+  static unsigned num_blocks(unsigned N) { return (N + nv - 1) / nv; } 
+  
+  // --------------------------------------------------------------------------
+  // Buffer Sizes for Standard Algorithms
+  // --------------------------------------------------------------------------
+  
+  /**
+  @brief queries the buffer size in bytes needed to call reduce kernels
+  
+  @tparam T value type
+  
+  @param count number of elements to reduce
+  
+  The function is used to allocate a buffer for calling tf::cuda_reduce,
+  tf::cuda_uninitialized_reduce, tf::cuda_transform_reduce, and
+  tf::cuda_uninitialized_transform_reduce.
+  */
+  template <typename T>
+  static unsigned reduce_bufsz(unsigned count);
+
+  /**
+  @brief queries the buffer size in bytes needed to call tf::cuda_min_element
+  
+  @tparam T value type
+  
+  @param count number of elements to search
+  
+  The function is used to decide the buffer size in bytes for calling
+  tf::cuda_min_element.
+  */
+  template <typename T>
+  static unsigned min_element_bufsz(unsigned count);
+
+  /**
+  @brief queries the buffer size in bytes needed to call tf::cuda_max_element
+  
+  @tparam T value type
+  
+  @param count number of elements to search
+  
+  The function is used to decide the buffer size in bytes for calling
+  tf::cuda_max_element.
+  */
+  template <typename T>
+  static unsigned max_element_bufsz(unsigned count);
+
+  /**
+  @brief queries the buffer size in bytes needed to call scan kernels
+  
+  @tparam T value type
+  
+  @param count number of elements to scan
+  
+  The function is used to allocate a buffer for calling
+  tf::cuda_inclusive_scan, tf::cuda_exclusive_scan,
+  tf::cuda_transform_inclusive_scan, and tf::cuda_transform_exclusive_scan.
+  */
+  template <typename T>
+  static unsigned scan_bufsz(unsigned count);
+  
+  /**
+  @brief queries the buffer size in bytes needed for CUDA merge algorithms
+
+  @param a_count number of elements in the first vector to merge
+  @param b_count number of elements in the second vector to merge
+
+  The buffer size of the merge algorithm does not depend on the data type.
+  The buffer is used only for storing temporary indices
+  (of type @c unsigned) required during the merge process.
+
+  The function is used to allocate a buffer for calling
+  tf::cuda_merge and tf::cuda_merge_by_key.
+  */
+  inline static unsigned merge_bufsz(unsigned a_count, unsigned b_count);
+
+  private:
+
+  cudaStream_t _stream {0};
+};
+
+/**
+@brief default execution policy
+ */
+using cudaDefaultExecutionPolicy = cudaExecutionPolicy<512, 7>;
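+
+/*
+Sizing sketch (editorial illustration; `stream` and `N` are hypothetical):
+the default policy uses 512 threads per block and 7 work units per thread,
+so nv = 3584 elements per block.
+
+  tf::cudaDefaultExecutionPolicy policy(stream);
+  unsigned blocks = tf::cudaDefaultExecutionPolicy::num_blocks(N);  // ceil(N/3584)
+*/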
+
+}  // end of namespace tf -----------------------------------------------------
+
+
+
diff --git a/myxpcs/include/taskflow_/cuda/cuda_graph.hpp b/myxpcs/include/taskflow_/cuda/cuda_graph.hpp
new file mode 100644
index 0000000..a326aed
--- /dev/null
+++ b/myxpcs/include/taskflow_/cuda/cuda_graph.hpp
@@ -0,0 +1,805 @@
+#pragma once
+
+#include "cuda_memory.hpp"
+#include "cuda_stream.hpp"
+#include "cuda_meta.hpp"
+
+#include "../utility/traits.hpp"
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// cudaGraph_t routines
+// ----------------------------------------------------------------------------
+
+/**
+@brief gets the memcpy node parameter of a copy task
+*/
+template <typename T,
+  std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
+>
+cudaMemcpy3DParms cuda_get_copy_parms(T* tgt, const T* src, size_t num) {
+
+  using U = std::decay_t<T>;
+
+  cudaMemcpy3DParms p;
+
+  p.srcArray = nullptr;
+  p.srcPos = ::make_cudaPos(0, 0, 0);
+  p.srcPtr = ::make_cudaPitchedPtr(const_cast<T*>(src), num*sizeof(U), num, 1);
+  p.dstArray = nullptr;
+  p.dstPos = ::make_cudaPos(0, 0, 0);
+  p.dstPtr = ::make_cudaPitchedPtr(tgt, num*sizeof(U), num, 1);
+  p.extent = ::make_cudaExtent(num*sizeof(U), 1, 1);
+  p.kind = cudaMemcpyDefault;
+
+  return p;
+}
+
+/**
+@brief gets the memcpy node parameter of a memcpy task (untyped)
+*/
+inline cudaMemcpy3DParms cuda_get_memcpy_parms(
+  void* tgt, const void* src, size_t bytes
+)  {
+
+  // Parameters in cudaPitchedPtr
+  // d   - Pointer to allocated memory
+  // p   - Pitch of allocated memory in bytes
+  // xsz - Logical width of allocation in elements
+  // ysz - Logical height of allocation in elements
+  cudaMemcpy3DParms p;
+  p.srcArray = nullptr;
+  p.srcPos = ::make_cudaPos(0, 0, 0);
+  p.srcPtr = ::make_cudaPitchedPtr(const_cast<void*>(src), bytes, bytes, 1);
+  p.dstArray = nullptr;
+  p.dstPos = ::make_cudaPos(0, 0, 0);
+  p.dstPtr = ::make_cudaPitchedPtr(tgt, bytes, bytes, 1);
+  p.extent = ::make_cudaExtent(bytes, 1, 1);
+  p.kind = cudaMemcpyDefault;
+
+  return p;
+}
+
+/**
+@brief gets the memset node parameter of a memcpy task (untyped)
+*/
+inline cudaMemsetParams cuda_get_memset_parms(void* dst, int ch, size_t count) {
+
+  cudaMemsetParams p;
+  p.dst = dst;
+  p.value = ch;
+  p.pitch = 0;
+  //p.elementSize = (count & 1) == 0 ? ((count & 3) == 0 ? 4 : 2) : 1;
+  //p.width = (count & 1) == 0 ? ((count & 3) == 0 ? count >> 2 : count >> 1) : count;
+  p.elementSize = 1;  // either 1, 2, or 4
+  p.width = count;
+  p.height = 1;
+
+  return p;
+}
+
+/**
+@brief gets the memset node parameter of a fill task (typed)
+*/
+template <typename T, std::enable_if_t<
+  is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
+>
+cudaMemsetParams cuda_get_fill_parms(T* dst, T value, size_t count) {
+
+  cudaMemsetParams p;
+  p.dst = dst;
+
+  // perform bit-wise copy
+  p.value = 0;  // crucial
+  static_assert(sizeof(T) <= sizeof(p.value), "internal error");
+  std::memcpy(&p.value, &value, sizeof(T));
+
+  p.pitch = 0;
+  p.elementSize = sizeof(T);  // either 1, 2, or 4
+  p.width = count;
+  p.height = 1;
+
+  return p;
+}
+
+/**
+@brief gets the memset node parameter of a zero task (typed)
+*/
+template <typename T, std::enable_if_t<
+  is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
+>
+cudaMemsetParams cuda_get_zero_parms(T* dst, size_t count) {
+
+  cudaMemsetParams p;
+  p.dst = dst;
+  p.value = 0;
+  p.pitch = 0;
+  p.elementSize = sizeof(T);  // either 1, 2, or 4
+  p.width = count;
+  p.height = 1;
+
+  return p;
+}
+
+/**
+@brief queries the number of root nodes in a native CUDA graph
+*/
+inline size_t cuda_graph_get_num_root_nodes(cudaGraph_t graph) {
+  size_t num_nodes;
+  TF_CHECK_CUDA(
+    cudaGraphGetRootNodes(graph, nullptr, &num_nodes),
+    "failed to get native graph root nodes"
+  );
+  return num_nodes;
+}
+
+/**
+@brief queries the number of nodes in a native CUDA graph
+*/
+inline size_t cuda_graph_get_num_nodes(cudaGraph_t graph) {
+  size_t num_nodes;
+  TF_CHECK_CUDA(
+    cudaGraphGetNodes(graph, nullptr, &num_nodes),
+    "failed to get native graph nodes"
+  );
+  return num_nodes;
+}
+
+/**
+@brief queries the number of edges in a native CUDA graph
+*/
+inline size_t cuda_graph_get_num_edges(cudaGraph_t graph) {
+  size_t num_edges;
+  TF_CHECK_CUDA(
+    cudaGraphGetEdges(graph, nullptr, nullptr, &num_edges),
+    "failed to get native graph edges"
+  );
+  return num_edges;
+}
+
+/**
+@brief acquires the nodes in a native CUDA graph
+*/
+inline std::vector<cudaGraphNode_t> cuda_graph_get_nodes(cudaGraph_t graph) {
+  size_t num_nodes = cuda_graph_get_num_nodes(graph);
+  std::vector<cudaGraphNode_t> nodes(num_nodes);
+  TF_CHECK_CUDA(
+    cudaGraphGetNodes(graph, nodes.data(), &num_nodes),
+    "failed to get native graph nodes"
+  );
+  return nodes;
+}
+
+/**
+@brief acquires the root nodes in a native CUDA graph
+*/
+inline std::vector<cudaGraphNode_t> cuda_graph_get_root_nodes(cudaGraph_t graph) {
+  size_t num_nodes = cuda_graph_get_num_root_nodes(graph);
+  std::vector<cudaGraphNode_t> nodes(num_nodes);
+  TF_CHECK_CUDA(
+    cudaGraphGetRootNodes(graph, nodes.data(), &num_nodes),
+    "failed to get native graph nodes"
+  );
+  return nodes;
+}
+
+/**
+@brief acquires the edges in a native CUDA graph
+*/
+inline std::vector<std::pair<cudaGraphNode_t, cudaGraphNode_t>>
+cuda_graph_get_edges(cudaGraph_t graph) {
+  size_t num_edges = cuda_graph_get_num_edges(graph);
+  std::vector<cudaGraphNode_t> froms(num_edges), tos(num_edges);
+  TF_CHECK_CUDA(
+    cudaGraphGetEdges(graph, froms.data(), tos.data(), &num_edges),
+    "failed to get native graph edges"
+  );
+  std::vector<std::pair<cudaGraphNode_t, cudaGraphNode_t>> edges(num_edges);
+  for(size_t i=0; i<num_edges; i++) {
+    edges[i] = std::make_pair(froms[i], tos[i]);
+  }
+  return edges;
+}
+
+/**
+@brief queries the type of a native CUDA graph node
+
+valid type values are:
+  + cudaGraphNodeTypeKernel      = 0x00
+  + cudaGraphNodeTypeMemcpy      = 0x01
+  + cudaGraphNodeTypeMemset      = 0x02
+  + cudaGraphNodeTypeHost        = 0x03
+  + cudaGraphNodeTypeGraph       = 0x04
+  + cudaGraphNodeTypeEmpty       = 0x05
+  + cudaGraphNodeTypeWaitEvent   = 0x06
+  + cudaGraphNodeTypeEventRecord = 0x07
+*/
+inline cudaGraphNodeType cuda_get_graph_node_type(cudaGraphNode_t node) {
+  cudaGraphNodeType type;
+  TF_CHECK_CUDA(
+    cudaGraphNodeGetType(node, &type), "failed to get native graph node type"
+  );
+  return type;
+}
+
+/**
+@brief converts the type of a native CUDA graph node to a readable string
+*/
+inline const char* cuda_graph_node_type_to_string(cudaGraphNodeType type) {
+  switch(type) {
+    case cudaGraphNodeTypeKernel      : return "kernel";
+    case cudaGraphNodeTypeMemcpy      : return "memcpy";
+    case cudaGraphNodeTypeMemset      : return "memset";
+    case cudaGraphNodeTypeHost        : return "host";
+    case cudaGraphNodeTypeGraph       : return "graph";
+    case cudaGraphNodeTypeEmpty       : return "empty";
+    case cudaGraphNodeTypeWaitEvent   : return "event_wait";
+    case cudaGraphNodeTypeEventRecord : return "event_record";
+    default                           : return "undefined";
+  }
+}
+
+/**
+@brief dumps a native CUDA graph and all associated child graphs to a DOT format
+
+@tparam T output stream type
+@param os target output stream
+@param g native CUDA graph to dump
+*/
+template <typename T>
+void cuda_dump_graph(T& os, cudaGraph_t g) {
+
+  os << "digraph cudaGraph {\n";
+
+  std::stack<std::tuple<cudaGraph_t, cudaGraphNode_t, int>> stack;
+  stack.push(std::make_tuple(g, nullptr, 1));
+
+  int pl = 0;
+
+  while(stack.empty() == false) {
+
+    auto [graph, parent, l] = stack.top();
+    stack.pop();
+
+    for(int i=0; i<pl-l+1; i++) {
+      os << "}\n";
+    }
+
+    os << "subgraph cluster_p" << graph << " {\n"
+       << "label=\"cudaGraph-L" << l << "\";\n"
+       << "color=\"purple\";\n";
+
+    auto nodes = cuda_graph_get_nodes(graph);
+    auto edges = cuda_graph_get_edges(graph);
+
+    for(auto& [from, to] : edges) {
+      os << 'p' << from << " -> " << 'p' << to << ";\n";
+    }
+
+    for(auto& node : nodes) {
+      auto type = cuda_get_graph_node_type(node);
+      if(type == cudaGraphNodeTypeGraph) {
+
+        cudaGraph_t child_graph;
+        TF_CHECK_CUDA(cudaGraphChildGraphNodeGetGraph(node, &child_graph), "");
+        stack.push(std::make_tuple(child_graph, node, l+1));
+
+        os << 'p' << node << "["
+           << "shape=folder, style=filled, fontcolor=white, fillcolor=purple, "
+           << "label=\"cudaGraph-L" << l+1
+           << "\"];\n";
+      }
+      else {
+        os << 'p' << node << "[label=\""
+           << cuda_graph_node_type_to_string(type)
+           << "\"];\n";
+      }
+    }
+
+    // connect nodes without successors to the parent
+    if(parent != nullptr) {
+      std::unordered_set<cudaGraphNode_t> successors;
+      for(const auto& p : edges) {
+        successors.insert(p.first);
+      }
+      for(auto node : nodes) {
+        if(successors.find(node) == successors.end()) {
+          os << 'p' << node << " -> " << 'p' << parent << ";\n";
+        }
+      }
+    }
+
+    // set the previous level
+    pl = l;
+  }
+
+  for(int i=0; i<=pl; i++) {
+    os << "}\n";
+  }
+}
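+
+// A minimal usage sketch (illustrative only; assumes the CUDA runtime is
+// initialized and omits error handling):
+//
+//   cudaGraph_t g;
+//   cudaGraphCreate(&g, 0);
+//   // ... add nodes to g ...
+//   tf::cuda_dump_graph(std::cout, g);   // writes the graph in DOT format
+//   cudaGraphDestroy(g);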
+
+// ----------------------------------------------------------------------------
+// cudaGraph
+// ----------------------------------------------------------------------------
+  
+/**
+@private
+*/
+struct cudaGraphCreator {
+  cudaGraph_t operator () () const { 
+    cudaGraph_t g;
+    TF_CHECK_CUDA(cudaGraphCreate(&g, 0), "failed to create a CUDA native graph");
+    return g; 
+  }
+};
+
+/**
+@private
+*/
+struct cudaGraphDeleter {
+  void operator () (cudaGraph_t g) const {
+    if(g) {
+      cudaGraphDestroy(g);
+    }
+  }
+};
+
+/**
+@class cudaGraph
+
+@brief class to create an RAII-styled wrapper over a native CUDA graph
+
+A cudaGraph object is an RAII-styled wrapper over 
+a native CUDA graph (@c cudaGraph_t).
+A cudaGraph object is move-only.
+*/
+class cudaGraph :
+  public cudaObject<cudaGraph_t, cudaGraphCreator, cudaGraphDeleter> {
+
+  public:
+
+  /**
+  @brief constructs an RAII-styled object from the given native CUDA graph
+
+  Constructs a cudaGraph object from the given CUDA graph @c native.
+  */
+  explicit cudaGraph(cudaGraph_t native) : cudaObject(native) { }
+  
+  /**
+  @brief constructs a cudaGraph object with a new CUDA graph
+  */
+  cudaGraph() = default;
+};
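+
+// A minimal usage sketch of the RAII wrapper (illustrative only):
+//
+//   {
+//     tf::cudaGraph graph;                     // creates a new native cudaGraph_t
+//     tf::cuda_dump_graph(std::cout, graph);   // implicit conversion to cudaGraph_t
+//   }                                          // the native graph is destroyed here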
+
+// ----------------------------------------------------------------------------
+// cudaGraphExec
+// ----------------------------------------------------------------------------
+  
+/**
+@private
+*/
+struct cudaGraphExecCreator {
+  cudaGraphExec_t operator () () const { return nullptr; }
+};
+
+/**
+@private
+*/
+struct cudaGraphExecDeleter {
+  void operator () (cudaGraphExec_t executable) const {
+    if(executable) {
+      cudaGraphExecDestroy(executable);
+    }
+  }
+};
+
+/**
+@class cudaGraphExec
+
+@brief class to create an RAII-styled wrapper over a CUDA executable graph
+
+A cudaGraphExec object is an RAII-styled wrapper over 
+a native CUDA executable graph (@c cudaGraphExec_t).
+A cudaGraphExec object is move-only.
+*/
+class cudaGraphExec : 
+  public cudaObject<cudaGraphExec_t, cudaGraphExecCreator, cudaGraphExecDeleter> {
+
+  public:
+
+  /**
+  @brief constructs an RAII-styled object from the given CUDA executable graph
+
+  Constructs a cudaGraphExec object which owns @c exec.
+  */
+  explicit cudaGraphExec(cudaGraphExec_t exec) : cudaObject(exec) { }
+  
+  /**
+  @brief default constructor
+  */
+  cudaGraphExec() = default;
+  
+  /**
+  @brief instantiates the executable from the given CUDA graph
+  */
+  void instantiate(cudaGraph_t graph) {
+    cudaGraphExecDeleter {} (object);
+    TF_CHECK_CUDA(
+      cudaGraphInstantiate(&object, graph, nullptr, nullptr, 0),
+      "failed to create an executable graph"
+    );
+  }
+  
+  /**
+  @brief updates the executable from the given CUDA graph
+  */
+  cudaGraphExecUpdateResult update(cudaGraph_t graph) {
+    cudaGraphNode_t error_node;
+    cudaGraphExecUpdateResult error_result;
+    cudaGraphExecUpdate(object, graph, &error_node, &error_result);
+    return error_result;
+  }
+  
+  /**
+  @brief launches the executable graph via the given stream
+  */
+  void launch(cudaStream_t stream) {
+    TF_CHECK_CUDA(
+      cudaGraphLaunch(object, stream), "failed to launch a CUDA executable graph"
+    );
+  }
+};
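+
+// A minimal sketch of the instantiate/launch workflow (assumptions: a populated
+// tf::cudaGraph `graph` and a valid cudaStream_t `stream`):
+//
+//   tf::cudaGraphExec exec;
+//   exec.instantiate(graph);          // builds the executable graph
+//   exec.launch(stream);              // enqueues it on the stream
+//   cudaStreamSynchronize(stream);    // wait for completion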
+
+// ----------------------------------------------------------------------------
+// cudaFlowGraph class
+// ----------------------------------------------------------------------------
+
+// class: cudaFlowGraph
+class cudaFlowGraph {
+
+  friend class cudaFlowNode;
+  friend class cudaTask;
+  friend class cudaFlowCapturer;
+  friend class cudaFlow;
+  friend class cudaFlowOptimizerBase;
+  friend class cudaFlowSequentialOptimizer;
+  friend class cudaFlowLinearOptimizer;
+  friend class cudaFlowRoundRobinOptimizer;
+  friend class Taskflow;
+  friend class Executor;
+
+  constexpr static int OFFLOADED = 0x01;
+  constexpr static int CHANGED   = 0x02;
+  constexpr static int UPDATED   = 0x04;
+
+  public:
+
+    cudaFlowGraph() = default;
+    ~cudaFlowGraph() = default;
+
+    cudaFlowGraph(const cudaFlowGraph&) = delete;
+    cudaFlowGraph(cudaFlowGraph&&) = default;
+
+    cudaFlowGraph& operator = (const cudaFlowGraph&) = delete;
+    cudaFlowGraph& operator = (cudaFlowGraph&&) = default;
+
+    template <typename... ArgsT>
+    cudaFlowNode* emplace_back(ArgsT&&...);
+
+    bool empty() const;
+
+    void clear();
+    void dump(std::ostream&, const void*, const std::string&) const;
+
+  private:
+
+    int _state{CHANGED};
+    cudaGraph _native_handle {nullptr};
+    std::vector<std::unique_ptr<cudaFlowNode>> _nodes;
+};
+
+// ----------------------------------------------------------------------------
+// cudaFlowNode class
+// ----------------------------------------------------------------------------
+
+/**
+@private
+@class: cudaFlowNode
+*/
+class cudaFlowNode {
+
+  friend class cudaFlowGraph;
+  friend class cudaTask;
+  friend class cudaFlow;
+  friend class cudaFlowCapturer;
+  friend class cudaFlowOptimizerBase;
+  friend class cudaFlowSequentialOptimizer;
+  friend class cudaFlowLinearOptimizer;
+  friend class cudaFlowRoundRobinOptimizer;
+  friend class Taskflow;
+  friend class Executor;
+
+  // Empty handle
+  struct Empty {
+  };
+
+  // Host handle
+  struct Host {
+
+    template <typename C>
+    Host(C&&);
+
+    std::function<void()> func;
+
+    static void callback(void*);
+  };
+
+  // Memset handle
+  struct Memset {
+  };
+
+  // Memcpy handle
+  struct Memcpy {
+  };
+
+  // Kernel handle
+  struct Kernel {
+
+    template <typename F>
+    Kernel(F&& f);
+
+    void* func {nullptr};
+  };
+
+  // Subflow handle
+  struct Subflow {
+    cudaFlowGraph cfg;
+  };
+
+  // Capture
+  struct Capture {
+
+    template <typename C>
+    Capture(C&&);
+
+    std::function<void(cudaStream_t)> work;
+
+    cudaEvent_t event;
+    size_t level;
+    size_t lid;
+    size_t idx;
+  };
+
+  using handle_t = std::variant<
+    Empty,
+    Host,
+    Memset,
+    Memcpy,
+    Kernel,
+    Subflow,
+    Capture
+  >;
+
+  public:
+
+  // variant index
+  constexpr static auto EMPTY   = get_index_v<Empty, handle_t>;
+  constexpr static auto HOST    = get_index_v<Host, handle_t>;
+  constexpr static auto MEMSET  = get_index_v<Memset, handle_t>;
+  constexpr static auto MEMCPY  = get_index_v<Memcpy, handle_t>;
+  constexpr static auto KERNEL  = get_index_v<Kernel, handle_t>;
+  constexpr static auto SUBFLOW = get_index_v<Subflow, handle_t>;
+  constexpr static auto CAPTURE = get_index_v<Capture, handle_t>;
+
+    cudaFlowNode() = delete;
+
+    template <typename... ArgsT>
+    cudaFlowNode(cudaFlowGraph&, ArgsT&&...);
+
+  private:
+
+    cudaFlowGraph& _cfg;
+
+    std::string _name;
+
+    handle_t _handle;
+
+    cudaGraphNode_t _native_handle {nullptr};
+
+    SmallVector<cudaFlowNode*> _successors;
+    SmallVector<cudaFlowNode*> _dependents;
+
+    void _precede(cudaFlowNode*);
+};
+
+// ----------------------------------------------------------------------------
+// cudaFlowNode definitions
+// ----------------------------------------------------------------------------
+
+// Host handle constructor
+template <typename C>
+cudaFlowNode::Host::Host(C&& c) : func {std::forward<C>(c)} {
+}
+
+// Host callback
+inline void cudaFlowNode::Host::callback(void* data) {
+  static_cast<Host*>(data)->func();
+};
+
+// Kernel handle constructor
+template <typename F>
+cudaFlowNode::Kernel::Kernel(F&& f) :
+  func {std::forward<F>(f)} {
+}
+
+// Capture handle constructor
+template <typename C>
+cudaFlowNode::Capture::Capture(C&& c) :
+  work {std::forward<C>(c)} {
+}
+
+// Constructor
+template <typename... ArgsT>
+cudaFlowNode::cudaFlowNode(cudaFlowGraph& graph, ArgsT&&... args) :
+  _cfg {graph},
+  _handle {std::forward<ArgsT>(args)...} {
+}
+
+// Procedure: _precede
+inline void cudaFlowNode::_precede(cudaFlowNode* v) {
+
+  _cfg._state |= cudaFlowGraph::CHANGED;
+
+  _successors.push_back(v);
+  v->_dependents.push_back(this);
+
+  // capture node doesn't have the native graph yet
+  if(_handle.index() != cudaFlowNode::CAPTURE) {
+    TF_CHECK_CUDA(
+      cudaGraphAddDependencies(
+        _cfg._native_handle, &_native_handle, &v->_native_handle, 1
+      ),
+      "failed to add a preceding link ", this, "->", v
+    );
+  }
+}
+
+// ----------------------------------------------------------------------------
+// cudaGraph definitions
+// ----------------------------------------------------------------------------
+
+// Function: empty
+inline bool cudaFlowGraph::empty() const {
+  return _nodes.empty();
+}
+
+// Procedure: clear
+inline void cudaFlowGraph::clear() {
+  _state |= cudaFlowGraph::CHANGED;
+  _nodes.clear();
+  _native_handle.clear();
+}
+
+// Function: emplace_back
+template <typename... ArgsT>
+cudaFlowNode* cudaFlowGraph::emplace_back(ArgsT&&... args) {
+
+  _state |= cudaFlowGraph::CHANGED;
+
+  auto node = std::make_unique<cudaFlowNode>(std::forward<ArgsT>(args)...);
+  _nodes.emplace_back(std::move(node));
+  return _nodes.back().get();
+
+  // TODO: use object pool to save memory
+  //auto node = new cudaFlowNode(std::forward<ArgsT>(args)...);
+  //_nodes.push_back(node);
+  //return node;
+}
+
+// Procedure: dump the graph to a DOT format
+inline void cudaFlowGraph::dump(
+  std::ostream& os, const void* root, const std::string& root_name
+) const {
+
+  // recursive dump with stack
+  std::stack<std::tuple<const cudaFlowGraph*, const cudaFlowNode*, int>> stack;
+  stack.push(std::make_tuple(this, nullptr, 1));
+
+  int pl = 0;
+
+  while(!stack.empty()) {
+
+    auto [graph, parent, l] = stack.top();
+    stack.pop();
+
+    for(int i=0; i<pl-l+1; i++) {
+      os << "}\n";
+    }
+
+    if(parent == nullptr) {
+      if(root) {
+        os << "subgraph cluster_p" << root << " {\nlabel=\"cudaFlow: ";
+        if(root_name.empty()) os << 'p' << root;
+        else os << root_name;
+        os << "\";\n" << "color=\"purple\"\n";
+      }
+      else {
+        os << "digraph cudaFlow {\n";
+      }
+    }
+    else {
+      os << "subgraph cluster_p" << parent << " {\nlabel=\"cudaSubflow: ";
+      if(parent->_name.empty()) os << 'p' << parent;
+      else os << parent->_name;
+      os << "\";\n" << "color=\"purple\"\n";
+    }
+
+    for(auto& node : graph->_nodes) {
+
+      auto v = node.get();
+
+      os << 'p' << v << "[label=\"";
+      if(v->_name.empty()) {
+        os << 'p' << v << "\"";
+      }
+      else {
+        os << v->_name << "\"";
+      }
+
+      switch(v->_handle.index()) {
+        case cudaFlowNode::KERNEL:
+          os << " style=\"filled\""
+             << " color=\"white\" fillcolor=\"black\""
+             << " fontcolor=\"white\""
+             << " shape=\"box3d\"";
+        break;
+
+        case cudaFlowNode::SUBFLOW:
+          stack.push(std::make_tuple(
+            &(std::get_if<cudaFlowNode::Subflow>(&v->_handle)->cfg), v, l+1)
+          );
+          os << " style=\"filled\""
+             << " color=\"black\" fillcolor=\"purple\""
+             << " fontcolor=\"white\""
+             << " shape=\"folder\"";
+        break;
+
+        default:
+        break;
+      }
+
+      os << "];\n";
+
+      for(const auto s : v->_successors) {
+        os << 'p' << v << " -> " << 'p' << s << ";\n";
+      }
+
+      if(v->_successors.size() == 0) {
+        if(parent == nullptr) {
+          if(root) {
+            os << 'p' << v << " -> p" << root << ";\n";
+          }
+        }
+        else {
+          os << 'p' << v << " -> p" << parent << ";\n";
+        }
+      }
+    }
+
+    // set the previous level
+    pl = l;
+  }
+
+  for(int i=0; i<pl; i++) {
+    os << "}\n";
+  }
+
+}
+
+
+}  // end of namespace tf -----------------------------------------------------
+
+
+
+
diff --git a/myxpcs/include/taskflow_/cuda/cuda_memory.hpp b/myxpcs/include/taskflow_/cuda/cuda_memory.hpp
new file mode 100644
index 0000000..0740d49
--- /dev/null
+++ b/myxpcs/include/taskflow_/cuda/cuda_memory.hpp
@@ -0,0 +1,855 @@
+#pragma once
+
+#include "cuda_device.hpp"
+
+/**
+@file cuda_memory.hpp
+@brief CUDA memory utilities include file
+*/
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// memory
+// ----------------------------------------------------------------------------
+
+/**
+@brief queries the free memory (expensive call)
+*/
+inline size_t cuda_get_free_mem(int d) {
+  cudaScopedDevice ctx(d);
+  size_t free, total;
+  TF_CHECK_CUDA(
+    cudaMemGetInfo(&free, &total), "failed to get mem info on device ", d
+  );
+  return free;
+}
+
+/**
+@brief queries the total available memory (expensive call)
+*/
+inline size_t cuda_get_total_mem(int d) {
+  cudaScopedDevice ctx(d);
+  size_t free, total;
+  TF_CHECK_CUDA(
+    cudaMemGetInfo(&free, &total), "failed to get mem info on device ", d
+  );
+  return total;
+}
+
+/**
+@brief allocates memory on the given device for holding @c N elements of type @c T
+
+The function calls @c cudaMalloc to allocate <tt>N*sizeof(T)</tt> bytes of memory
+on the given device @c d and returns a pointer to the starting address of
+the device memory.
+*/
+template <typename T>
+T* cuda_malloc_device(size_t N, int d) {
+  cudaScopedDevice ctx(d);
+  T* ptr {nullptr};
+  TF_CHECK_CUDA(
+    cudaMalloc(&ptr, N*sizeof(T)),
+    "failed to allocate memory (", N*sizeof(T), "bytes) on device ", d
+  )
+  return ptr;
+}
+
+/**
+@brief allocates memory on the current device associated with the caller
+
+The function calls @c cudaMalloc to allocate <tt>N*sizeof(T)</tt> bytes of memory
+on the current device associated with the caller and returns a pointer to the
+starting address of the device memory.
+*/
+template <typename T>
+T* cuda_malloc_device(size_t N) {
+  T* ptr {nullptr};
+  TF_CHECK_CUDA(
+    cudaMalloc(&ptr, N*sizeof(T)), 
+    "failed to allocate memory (", N*sizeof(T), "bytes)"
+  )
+  return ptr;
+}
+
+/**
+@brief allocates shared memory for holding @c N elements of type @c T
+
+The function calls @c cudaMallocManaged to allocate <tt>N*sizeof(T)</tt> bytes
+of memory and returns a pointer to the starting address of the shared memory.
+*/
+template <typename T>
+T* cuda_malloc_shared(size_t N) {
+  T* ptr {nullptr};
+  TF_CHECK_CUDA(
+    cudaMallocManaged(&ptr, N*sizeof(T)),
+    "failed to allocate shared memory (", N*sizeof(T), "bytes)"
+  )
+  return ptr;
+}
+
+/**
+@brief frees memory on the GPU device
+
+@tparam T pointer type
+@param ptr device pointer to memory to free
+@param d device context identifier
+
+This method calls @c cudaFree to free the memory space pointed to by @c ptr
+using the given device context.
+*/
+template <typename T>
+void cuda_free(T* ptr, int d) {
+  cudaScopedDevice ctx(d);
+  TF_CHECK_CUDA(cudaFree(ptr), "failed to free memory ", ptr, " on GPU ", d);
+}
+
+/**
+@brief frees memory on the GPU device
+
+@tparam T pointer type
+@param ptr device pointer to memory to free
+
+This method calls @c cudaFree to free the memory space pointed to by @c ptr
+using the current device context of the caller.
+*/
+template <typename T>
+void cuda_free(T* ptr) {
+  TF_CHECK_CUDA(cudaFree(ptr), "failed to free memory ", ptr);
+}
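+
+// A minimal allocation round trip using the helpers above (assumptions:
+// device 0 exists and N is a small element count):
+//
+//   auto* d_buf = tf::cuda_malloc_device<float>(N, 0);   // N floats on device 0
+//   auto* s_buf = tf::cuda_malloc_shared<float>(N);      // N floats in managed memory
+//   // ... use the buffers ...
+//   tf::cuda_free(d_buf, 0);
+//   tf::cuda_free(s_buf);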
+
+/**
+@brief copies data between host and device asynchronously through a stream
+
+@param stream stream identifier
+@param dst destination memory address
+@param src source memory address
+@param count size in bytes to copy
+
+The method calls @c cudaMemcpyAsync with the given @c stream
+using @c cudaMemcpyDefault to infer the memory space of the source and
+the destination pointers. The memory areas may not overlap.
+*/
+inline void cuda_memcpy_async(
+  cudaStream_t stream, void* dst, const void* src, size_t count
+) {
+  TF_CHECK_CUDA(
+    cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream),
+    "failed to perform cudaMemcpyAsync"
+  );
+}
+
+/**
+@brief initializes or sets GPU memory to the given value byte by byte
+
+@param stream stream identifier
+@param devPtr pointer to GPU memory
+@param value value to set for each byte of the specified memory
+@param count size in bytes to set
+
+The method calls @c cudaMemsetAsync with the given @c stream
+to fill the first @c count bytes of the memory area pointed to by @c devPtr
+with the constant byte value @c value.
+*/
+inline void cuda_memset_async(
+  cudaStream_t stream, void* devPtr, int value, size_t count
+){
+  TF_CHECK_CUDA(
+    cudaMemsetAsync(devPtr, value, count, stream),
+    "failed to perform cudaMemsetAsync"
+  );
+}
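+
+// A minimal sketch combining the two asynchronous helpers (assumptions: a valid
+// cudaStream_t `stream`, a device buffer d_buf, and host buffers h_buf/h_out of
+// `bytes` bytes each):
+//
+//   tf::cuda_memset_async(stream, d_buf, 0, bytes);      // zero the device buffer
+//   tf::cuda_memcpy_async(stream, d_buf, h_buf, bytes);  // host -> device
+//   tf::cuda_memcpy_async(stream, h_out, d_buf, bytes);  // device -> host
+//   cudaStreamSynchronize(stream);                       // wait for completion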
+
+// ----------------------------------------------------------------------------
+// Shared Memory
+// ----------------------------------------------------------------------------
+//
+// Because dynamically sized shared memory arrays are declared "extern",
+// we can't templatize them directly.  To get around this, we declare a
+// simple wrapper struct that will declare the extern array with a different
+// name depending on the type.  This avoids compiler errors about duplicate
+// definitions.
+//
+// To use dynamically allocated shared memory in a templatized __global__ or
+// __device__ function, just replace code like this:
+//
+//  template<class T>
+//  __global__ void
+//  foo( T* g_idata, T* g_odata)
+//  {
+//      // Shared mem size is determined by the host app at run time
+//      extern __shared__  T sdata[];
+//      ...
+//      doStuff(sdata);
+//      ...
+//   }
+//
+//  With this:
+//
+//  template<class T>
+//  __global__ void
+//  foo( T* g_idata, T* g_odata)
+//  {
+//      // Shared mem size is determined by the host app at run time
+//      cudaSharedMemory<T> smem;
+//      T* sdata = smem.get();
+//      ...
+//      doStuff(sdata);
+//      ...
+//   }
+// ----------------------------------------------------------------------------
+
+// This is the un-specialized struct.  Note that we prevent instantiation of this
+// struct by putting an undefined symbol in the function body so it won't compile.
+/**
+@private
+*/
+template <typename T>
+struct cudaSharedMemory
+{
+  // Ensure that we won't compile any un-specialized types
+  __device__ T *get()
+  {
+    extern __device__ void error(void);
+    error();
+    return NULL;
+  }
+};
+
+// Below are the specializations for the following types:
+// int, uint, char, uchar, short, ushort, long, ulong, bool, float, and double
+// One could also specialize it for user-defined types.
+
+/**
+@private
+*/
+template <>
+struct cudaSharedMemory <int>
+{
+  __device__ int *get()
+  {
+    extern __shared__ int s_int[];
+    return s_int;
+  }
+};
+
+/**
+@private
+*/
+template <>
+struct cudaSharedMemory <unsigned int>
+{
+  __device__ unsigned int *get()
+  {
+    extern __shared__ unsigned int s_uint[];
+    return s_uint;
+  }
+};
+
+/**
+@private
+*/
+template <>
+struct cudaSharedMemory <char>
+{
+  __device__ char *get()
+  {
+    extern __shared__ char s_char[];
+    return s_char;
+  }
+};
+
+/**
+@private
+*/
+template <>
+struct cudaSharedMemory <unsigned char>
+{
+  __device__ unsigned char *get()
+  {
+    extern __shared__ unsigned char s_uchar[];
+    return s_uchar;
+  }
+};
+
+/**
+@private
+*/
+template <>
+struct cudaSharedMemory <short>
+{
+  __device__ short *get()
+  {
+    extern __shared__ short s_short[];
+    return s_short;
+  }
+};
+
+/**
+@private
+*/
+template <>
+struct cudaSharedMemory <unsigned short>
+{
+  __device__ unsigned short *get()
+  {
+    extern __shared__ unsigned short s_ushort[];
+    return s_ushort;
+  }
+};
+
+/**
+@private
+*/
+template <>
+struct cudaSharedMemory <long>
+{
+  __device__ long *get()
+  {
+    extern __shared__ long s_long[];
+    return s_long;
+  }
+};
+
+/**
+@private
+*/
+template <>
+struct cudaSharedMemory <unsigned long>
+{
+  __device__ unsigned long *get()
+  {
+    extern __shared__ unsigned long s_ulong[];
+    return s_ulong;
+  }
+};
+
+//template <>
+//struct cudaSharedMemory <size_t>
+//{
+//  __device__ size_t *get()
+//  {
+//    extern __shared__ size_t s_sizet[];
+//    return s_sizet;
+//  }
+//};
+
+/**
+@private
+*/
+template <>
+struct cudaSharedMemory <bool>
+{
+  __device__ bool *get()
+  {
+    extern __shared__ bool s_bool[];
+    return s_bool;
+  }
+};
+
+/**
+@private
+*/
+template <>
+struct cudaSharedMemory <float>
+{
+  __device__ float *get()
+  {
+    extern __shared__ float s_float[];
+    return s_float;
+  }
+};
+
+/**
+@private
+*/
+template <>
+struct cudaSharedMemory <double>
+{
+  __device__ double *get()
+  {
+    extern __shared__ double s_double[];
+    return s_double;
+  }
+};
+
+
+
+// ----------------------------------------------------------------------------
+// cudaDeviceAllocator
+// ----------------------------------------------------------------------------
+
+/**
+@class cudaDeviceAllocator
+
+@brief class to create a CUDA device allocator 
+
+@tparam T element type
+
+A %cudaDeviceAllocator enables device-specific allocation for 
+standard library containers. It is typically passed as a template parameter
+when declaring standard library containers (e.g. std::vector).
+*/
+template<typename T>
+class cudaDeviceAllocator {
+
+  public:
+
+  /**
+  @brief element type
+  */
+  using value_type = T;
+
+  /**
+  @brief element pointer type
+  */
+  using pointer = T*;
+
+  /**
+  @brief element reference type
+  */
+  using reference = T&;
+
+  /**
+  @brief const element pointer type
+  */
+  using const_pointer = const T*;
+
+  /**
+  @brief constant element reference type
+  */
+  using const_reference = const T&;
+
+  /**
+  @brief size type
+  */
+  using size_type = std::size_t;
+  
+  /**
+  @brief pointer difference type
+  */
+  using difference_type = std::ptrdiff_t;
+
+  /**
+  @brief its member type @c U is the equivalent allocator type to allocate elements of type U
+  */
+  template<typename U> 
+  struct rebind { 
+    /**
+    @brief allocator of a different data type
+    */
+    using other = cudaDeviceAllocator<U>; 
+  }; 
+
+  /** 
+  @brief Constructs a device allocator object.
+  */
+  cudaDeviceAllocator() noexcept {}
+
+  /**
+  @brief Constructs a device allocator object from another device allocator object.
+  */
+  cudaDeviceAllocator( const cudaDeviceAllocator& ) noexcept {}
+
+  /**
+  @brief Constructs a device allocator object from another device allocator 
+         object with a different element type.
+  */
+  template<typename U>
+  cudaDeviceAllocator( const cudaDeviceAllocator<U>& ) noexcept {}
+
+  /**
+  @brief Destructs the device allocator object.
+  */
+  ~cudaDeviceAllocator() noexcept {}
+
+  /**
+  @brief Returns the address of x.
+  
+  This effectively means returning &x.
+  
+  @param x reference to an object
+  @return a pointer to the object
+  */
+  pointer address( reference x ) { return &x; }
+
+  /**
+  @brief Returns the address of x.
+  
+  This effectively means returning &x.
+  
+  @param x reference to an object
+  @return a pointer to the object
+  */
+  const_pointer address( const_reference x ) const { return &x; }
+
+  /** 
+  @brief allocates block of storage.
+  
+  Attempts to allocate a block of storage with a size large enough to contain 
+  @c n elements of member type, @c value_type, and returns a pointer 
+  to the first element.
+  
+  The storage is aligned appropriately for objects of type @c value_type, 
+  but the objects are not constructed.
+  
+  The block of storage is allocated using @c cudaMalloc, and an exception is 
+  thrown if the total amount of storage requested cannot be allocated.
+  
+  @param n number of elements (each of size sizeof(value_type)) to be allocated
+  @return a pointer to the initial element in the block of storage.
+  */
+  pointer allocate( size_type n, std::allocator<void>::const_pointer = 0 )
+  {
+    void* ptr = NULL;
+    TF_CHECK_CUDA(
+      cudaMalloc( &ptr, n*sizeof(T) ),
+      "failed to allocate ", n, " elements (", n*sizeof(T), "bytes)"
+    )
+    return static_cast<pointer>(ptr);
+  }
+
+  /** 
+  @brief Releases a block of storage previously allocated with member allocate and not yet released
+  
+  The elements in the array are not destroyed by a call to this member function.
+  
+  @param ptr pointer to a block of storage previously allocated with allocate
+  */
+  void deallocate( pointer ptr, size_type )
+  {
+    if(ptr){
+      cudaFree(ptr);
+    }
+  }
+
+  /**
+  @brief returns the maximum number of elements that could potentially 
+         be allocated by this allocator
+  
+  A call to member allocate with the value returned by this function 
+  can still fail to allocate the requested storage.
+  
+  @return the maximum number of elements that might be allocated 
+          by a call to member allocate
+  */
+  size_type max_size() const noexcept { return size_type {-1}; }
+
+  /**
+  @brief ignored to avoid de-referencing device pointer from the host
+  */
+  void construct( pointer, const_reference) { }
+
+  /**
+  @brief ignored to avoid de-referencing device pointer from the host
+  */
+  void destroy( pointer) { }
+  
+  /**
+  @brief compares two allocators of different types using @c ==
+
+  Device allocators of different types are always equal to each other
+  because the storage allocated by the allocator @c a1 can be deallocated 
+  through @c a2. 
+  */
+  template <typename U>
+  bool operator == (const cudaDeviceAllocator<U>&) const noexcept {
+    return true;
+  }
+  
+  /**
+  @brief compares two allocators of different types using @c !=
+
+  Device allocators of different types are always equal to each other
+  because the storage allocated by the allocator @c a1 can be deallocated 
+  through @c a2. 
+  */
+  template <typename U>
+  bool operator != (const cudaDeviceAllocator<U>&) const noexcept {
+    return false;
+  }
+
+};
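+
+// A minimal usage sketch (illustrative only). The elements live in device
+// memory, so they must never be dereferenced from the host:
+//
+//   // the fill value is ignored because construct() is a no-op for device memory
+//   std::vector<float, tf::cudaDeviceAllocator<float>> dvec(1024, 0.0f);
+//   float* d_ptr = dvec.data();   // device pointer: pass it to kernels or
+//                                 // cudaMemcpy, never read/write it on the host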
+
+// ----------------------------------------------------------------------------
+// cudaUSMAllocator
+// ----------------------------------------------------------------------------
+
+/**
+@class cudaUSMAllocator
+
+@brief class to create a unified shared memory (USM) allocator 
+
+@tparam T element type
+
+A %cudaUSMAllocator enables using unified shared memory (USM) allocation for 
+standard library containers. It is typically passed as a template parameter
+when declaring standard library containers (e.g. std::vector).
+*/
+template<typename T>
+class cudaUSMAllocator {
+
+  public:
+
+  /**
+  @brief element type
+  */
+  using value_type = T;
+
+  /**
+  @brief element pointer type
+  */
+  using pointer = T*;
+
+  /**
+  @brief element reference type
+  */
+  using reference = T&;
+
+  /**
+  @brief const element pointer type
+  */
+  using const_pointer = const T*;
+
+  /**
+  @brief constant element reference type
+  */
+  using const_reference = const T&;
+
+  /**
+  @brief size type
+  */
+  using size_type = std::size_t;
+  
+  /**
+  @brief pointer difference type
+  */
+  using difference_type = std::ptrdiff_t;
+
+  /**
+  @brief its member type @c U is the equivalent allocator type to allocate elements of type U
+  */
+  template<typename U> 
+  struct rebind { 
+    /**
+    @brief allocator of a different data type
+    */
+    using other = cudaUSMAllocator<U>; 
+  }; 
+
+  /** 
+  @brief Constructs a USM allocator object.
+  */
+  cudaUSMAllocator() noexcept {}
+
+  /**
+  @brief Constructs a USM allocator object from another USM allocator object.
+  */
+  cudaUSMAllocator( const cudaUSMAllocator& ) noexcept {}
+
+  /**
+  @brief Constructs a USM allocator object from another USM allocator 
+         object with a different element type.
+  */
+  template<typename U>
+  cudaUSMAllocator( const cudaUSMAllocator<U>& ) noexcept {}
+
+  /**
+  @brief Destructs the USM allocator object.
+  */
+  ~cudaUSMAllocator() noexcept {}
+
+  /**
+  @brief Returns the address of x.
+  
+  This effectively means returning &x.
+  
+  @param x reference to an object
+  @return a pointer to the object
+  */
+  pointer address( reference x ) { return &x; }
+
+  /**
+  @brief Returns the address of x.
+  
+  This effectively means returning &x.
+  
+  @param x reference to an object
+  @return a pointer to the object
+  */
+  const_pointer address( const_reference x ) const { return &x; }
+
+  /** 
+  @brief allocates block of storage.
+  
+  Attempts to allocate a block of storage with a size large enough to contain 
+  @c n elements of member type, @c value_type, and returns a pointer 
+  to the first element.
+  
+  The storage is aligned appropriately for objects of type @c value_type, 
+  but the objects are not constructed.
+  
+  The block of storage is allocated using @c cudaMallocManaged, and an exception 
+  is thrown if the total amount of storage requested cannot be allocated.
+  
+  @param n number of elements (each of size sizeof(value_type)) to be allocated
+  @return a pointer to the initial element in the block of storage.
+  */
+  pointer allocate( size_type n, std::allocator<void>::const_pointer = 0 )
+  {
+    void* ptr {nullptr};
+    TF_CHECK_CUDA(
+      cudaMallocManaged( &ptr, n*sizeof(T) ),
+      "failed to allocate ", n, " elements (", n*sizeof(T), "bytes)"
+    )
+    return static_cast<pointer>(ptr);
+  }
+
+  /** 
+  @brief Releases a block of storage previously allocated with member allocate and not yet released
+  
+  The elements in the array are not destroyed by a call to this member function.
+  
+  @param ptr pointer to a block of storage previously allocated with allocate
+  */
+  void deallocate( pointer ptr, size_type )
+  {
+    if(ptr){
+      cudaFree(ptr);
+    }
+  }
+
+  /**
+  @brief returns the maximum number of elements that could potentially 
+         be allocated by this allocator
+  
+  A call to member allocate with the value returned by this function 
+  can still fail to allocate the requested storage.
+  
+  @return the maximum number of elements that might be allocated 
+          by a call to member allocate
+  */
+  size_type max_size() const noexcept { return size_type {-1}; }
+
+  /**
+  @brief Constructs an element object on the location pointed by ptr.
+  @param ptr pointer to a location with enough storage space to contain 
+             an element of type @c value_type
+
+  @param val value to initialize the constructed element to
+  */
+  void construct( pointer ptr, const_reference val ) {
+    new ((void*)ptr) value_type(val);
+  }
+
+  /**
+  @brief destroys in-place the object pointed by @c ptr
+  
+  Notice that this does not deallocate the storage for the element but calls
+  its destructor.
+
+  @param ptr pointer to the object to be destroyed
+  */
+  void destroy( pointer ptr ) {
+    ptr->~value_type();
+  }
+
+  /**
+  @brief compares two allocators of different types using @c ==
+
+  USM allocators of different types are always equal to each other
+  because the storage allocated by the allocator @c a1 can be deallocated 
+  through @c a2. 
+  */
+  template <typename U>
+  bool operator == (const cudaUSMAllocator<U>&) const noexcept {
+    return true;
+  }
+  
+  /**
+  @brief compares two allocators of different types using @c !=
+
+  USM allocators of different types are always equal to each other
+  because the storage allocated by the allocator @c a1 can be deallocated 
+  through @c a2. 
+  */
+  template <typename U>
+  bool operator != (const cudaUSMAllocator<U>&) const noexcept {
+    return false;
+  }
+
+};
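+
+// A minimal usage sketch (illustrative only). Managed memory is accessible from
+// both host and device; synchronize before host access if a kernel wrote to it:
+//
+//   std::vector<int, tf::cudaUSMAllocator<int>> uvec(1024, 0);
+//   uvec[0] = 42;              // host access is valid for managed memory
+//   int* ptr = uvec.data();    // the same pointer can be passed to kernels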
+
+// ----------------------------------------------------------------------------
+// GPU vector object
+// ----------------------------------------------------------------------------
+
+//template <typename T>
+//using cudaDeviceVector = std::vector<NoInit<T>, cudaDeviceAllocator<NoInit<T>>>;
+
+//template <typename T>
+//using cudaUSMVector = std::vector<T, cudaUSMAllocator<T>>;
+
+/**
+@private
+*/
+template <typename T>
+class cudaDeviceVector {
+  
+  public:
+
+    cudaDeviceVector() = default;
+
+    cudaDeviceVector(size_t N) : _N {N} {
+      if(N) {
+        TF_CHECK_CUDA(
+          cudaMalloc(&_data, N*sizeof(T)),
+          "failed to allocate device memory (", N*sizeof(T), " bytes)"
+        );
+      }
+    }
+    
+    cudaDeviceVector(cudaDeviceVector&& rhs) : 
+      _data{rhs._data}, _N {rhs._N} {
+      rhs._data = nullptr;
+      rhs._N    = 0;
+    }
+
+    ~cudaDeviceVector() {
+      if(_data) {
+        cudaFree(_data);
+      }
+    }
+
+    cudaDeviceVector& operator = (cudaDeviceVector&& rhs) {
+      if(_data) {
+        cudaFree(_data);
+      }
+      _data = rhs._data;
+      _N    = rhs._N;
+      rhs._data = nullptr;
+      rhs._N    = 0;
+      return *this;
+    }
+
+    size_t size() const { return _N; }
+
+    T* data() { return _data; }
+    const T* data() const { return _data; }
+    
+    cudaDeviceVector(const cudaDeviceVector&) = delete;
+    cudaDeviceVector& operator = (const cudaDeviceVector&) = delete;
+
+  private:
+
+    T* _data  {nullptr};
+    size_t _N {0};
+}; 
+
+
+}  // end of namespace tf -----------------------------------------------------
+
+
+
+
+
+
diff --git a/myxpcs/include/taskflow_/cuda/cuda_meta.hpp b/myxpcs/include/taskflow_/cuda/cuda_meta.hpp
new file mode 100644
index 0000000..b08eb29
--- /dev/null
+++ b/myxpcs/include/taskflow_/cuda/cuda_meta.hpp
@@ -0,0 +1,452 @@
+#pragma once
+
+#include "cuda_execution_policy.hpp"
+
+namespace tf {
+
+// default warp size
+inline constexpr unsigned CUDA_WARP_SIZE = 32;
+
+// empty type
+struct cudaEmpty { };
+
+// ----------------------------------------------------------------------------
+// iterator unrolling
+// ----------------------------------------------------------------------------
+
+// Template unrolled looping construct.
+template<unsigned i, unsigned count, bool valid = (i < count)>
+struct cudaIterate {
+  template<typename F>
+  __device__ static void eval(F f) {
+    f(i);
+    cudaIterate<i + 1, count>::eval(f);
+  }
+};
+
+template<unsigned i, unsigned count>
+struct cudaIterate<i, count, false> {
+  template<typename F>
+  __device__ static void eval(F) { }
+};
+
+template<unsigned begin, unsigned end, typename F>
+__device__ void cuda_iterate(F f) {
+  cudaIterate<begin, end>::eval(f);
+}
+
+template<unsigned count, typename F>
+__device__ void cuda_iterate(F f) {
+  cuda_iterate<0, count>(f);
+}
+
+template<unsigned count, typename T>
+__device__ T reduce(const T(&x)[count]) {
+  T y;
+  cuda_iterate<count>([&](auto i) { y = i ? x[i] + y : x[i]; });
+  return y;
+}
+
+template<unsigned count, typename T>
+__device__ void fill(T(&x)[count], T val) {
+  cuda_iterate<count>([&](auto i) { x[i] = val; });
+}
+
+// Invoke unconditionally.
+template<unsigned nt, unsigned vt, typename F>
+__device__ void cuda_strided_iterate(F f, unsigned tid) {
+  cuda_iterate<vt>([=](auto i) { f(i, nt * i + tid); });
+}
+
+// Check range.
+template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename F>
+__device__ void cuda_strided_iterate(F f, unsigned tid, unsigned count) {
+  // Unroll the first vt0 elements of each thread.
+  if(vt0 > 1 && count >= nt * vt0) {
+    cuda_strided_iterate<nt, vt0>(f, tid);    // No checking
+  } else {
+    cuda_iterate<vt0>([=](auto i) {
+      auto j = nt * i + tid;
+      if(j < count) f(i, j);
+    });
+  }
+
+  // TODO: seems dummy when vt0 == vt
+  cuda_iterate<vt0, vt>([=](auto i) {
+    auto j = nt * i + tid;
+    if(j < count) f(i, j);
+  });
+}
+
+template<unsigned vt, typename F>
+__device__ void cuda_thread_iterate(F f, unsigned tid) {
+  cuda_iterate<vt>([=](auto i) { f(i, vt * tid + i); });
+}
+
+// ----------------------------------------------------------------------------
+// cudaRange
+// ----------------------------------------------------------------------------
+
+// cudaRange
+struct cudaRange {
+  unsigned begin, end;
+  __device__ unsigned size() const { return end - begin; }
+  __device__ unsigned count() const { return size(); }
+  __device__ bool valid() const { return end > begin; }
+};
+
+inline __device__ cudaRange cuda_get_tile(unsigned b, unsigned nv, unsigned count) {
+  return cudaRange { nv * b, min(count, nv * (b + 1)) };
+}
+
+
+// ----------------------------------------------------------------------------
+// cudaArray
+// ----------------------------------------------------------------------------
+
+template<typename T, unsigned size>
+struct cudaArray {
+  T data[size];
+
+  __device__ T operator[](unsigned i) const { return data[i]; }
+  __device__ T& operator[](unsigned i) { return data[i]; }
+
+  cudaArray() = default;
+  cudaArray(const cudaArray&) = default;
+  cudaArray& operator=(const cudaArray&) = default;
+
+  // Fill the array with x.
+  __device__ cudaArray(T x) {
+    cuda_iterate<size>([&](unsigned i) { data[i] = x; });
+  }
+};
+
+template<typename T>
+struct cudaArray<T, 0> {
+  __device__ T operator[](unsigned) const { return T(); }
+  __device__ T& operator[](unsigned) { return *(T*)nullptr; }
+};
+
+template<typename T, typename V, unsigned size>
+struct cudaKVArray {
+  cudaArray<T, size> keys;
+  cudaArray<V, size> vals;
+};
+
+// ----------------------------------------------------------------------------
+// thread reg <-> global mem
+// ----------------------------------------------------------------------------
+
+template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename I>
+__device__ auto cuda_mem_to_reg_strided(I mem, unsigned tid, unsigned count) {
+  using T = typename std::iterator_traits<I>::value_type;
+  cudaArray<T, vt> x;
+  cuda_strided_iterate<nt, vt, vt0>(
+    [&](auto i, auto j) { x[i] = mem[j]; }, tid, count
+  );
+  return x;
+}
+
+template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename T, typename it_t>
+__device__ void cuda_reg_to_mem_strided(
+  cudaArray<T, vt> x, unsigned tid, unsigned count, it_t mem) {
+
+  cuda_strided_iterate<nt, vt, vt0>(
+    [=](auto i, auto j) { mem[j] = x[i]; }, tid, count
+  );
+}
+
+template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename I, typename O>
+__device__ auto cuda_transform_mem_to_reg_strided(
+  I mem, unsigned tid, unsigned count, O op
+) {
+  using T = std::invoke_result_t<O, typename std::iterator_traits<I>::value_type>;
+  cudaArray<T, vt> x;
+  cuda_strided_iterate<nt, vt, vt0>(
+    [&](auto i, auto j) { x[i] = op(mem[j]); }, tid, count
+  );
+  return x;
+}
+
+// ----------------------------------------------------------------------------
+// thread reg <-> shared
+// ----------------------------------------------------------------------------
+
+template<unsigned nt, unsigned vt, typename T, unsigned shared_size>
+__device__ void cuda_reg_to_shared_thread(
+  cudaArray<T, vt> x, unsigned tid, T (&shared)[shared_size], bool sync = true
+) {
+
+  static_assert(shared_size >= nt * vt,
+    "reg_to_shared_thread must have at least nt * vt storage");
+
+  cuda_thread_iterate<vt>([&](auto i, auto j) { shared[j] = x[i]; }, tid);
+
+  if(sync) __syncthreads();
+}
+
+template<unsigned nt, unsigned vt, typename T, unsigned shared_size>
+__device__ auto cuda_shared_to_reg_thread(
+  const T (&shared)[shared_size], unsigned tid, bool sync = true
+) {
+
+  static_assert(shared_size >= nt * vt,
+    "reg_to_shared_thread must have at least nt * vt storage");
+
+  cudaArray<T, vt> x;
+  cuda_thread_iterate<vt>([&](auto i, auto j) {
+    x[i] = shared[j];
+  }, tid);
+
+  if(sync) __syncthreads();
+
+  return x;
+}
+
+template<unsigned nt, unsigned vt, typename T, unsigned shared_size>
+__device__ void cuda_reg_to_shared_strided(
+  cudaArray<T, vt> x, unsigned tid, T (&shared)[shared_size], bool sync = true
+) {
+
+  static_assert(shared_size >= nt * vt,
+    "reg_to_shared_strided must have at least nt * vt storage");
+
+  cuda_strided_iterate<nt, vt>(
+    [&](auto i, auto j) { shared[j] = x[i]; }, tid
+  );
+
+  if(sync) __syncthreads();
+}
+
+template<unsigned nt, unsigned vt, typename T, unsigned shared_size>
+__device__ auto cuda_shared_to_reg_strided(
+  const T (&shared)[shared_size], unsigned tid, bool sync = true
+) {
+
+  static_assert(shared_size >= nt * vt,
+    "shared_to_reg_strided must have at least nt * vt storage");
+
+  cudaArray<T, vt> x;
+  cuda_strided_iterate<nt, vt>([&](auto i, auto j) { x[i] = shared[j]; }, tid);
+  if(sync) __syncthreads();
+
+  return x;
+}
+
+template<
+  unsigned nt, unsigned vt, unsigned vt0 = vt, typename T, typename it_t,
+  unsigned shared_size
+>
+__device__ auto cuda_reg_to_mem_thread(
+  cudaArray<T, vt> x, unsigned tid,
+  unsigned count, it_t mem, T (&shared)[shared_size]
+) {
+  cuda_reg_to_shared_thread<nt>(x, tid, shared);
+  auto y = cuda_shared_to_reg_strided<nt, vt>(shared, tid);
+  cuda_reg_to_mem_strided<nt, vt, vt0>(y, tid, count, mem);
+}
+
+template<
+  unsigned nt, unsigned vt, unsigned vt0 = vt, typename T, typename it_t,
+  unsigned shared_size
+>
+__device__ auto cuda_mem_to_reg_thread(
+  it_t mem, unsigned tid, unsigned count, T (&shared)[shared_size]
+) {
+
+  auto x = cuda_mem_to_reg_strided<nt, vt, vt0>(mem, tid, count);
+  cuda_reg_to_shared_strided<nt, vt>(x, tid, shared);
+  auto y = cuda_shared_to_reg_thread<nt, vt>(shared, tid);
+  return y;
+}
+
+template<unsigned nt, unsigned vt, typename T, unsigned S>
+__device__ auto cuda_shared_gather(
+  const T(&data)[S], cudaArray<unsigned, vt> indices, bool sync = true
+) {
+
+  static_assert(S >= nt * vt,
+    "shared_gather must have at least nt * vt storage");
+
+  cudaArray<T, vt> x;
+  cuda_iterate<vt>([&](auto i) { x[i] = data[indices[i]]; });
+
+  if(sync) __syncthreads();
+
+  return x;
+}
+
+
+
+// ----------------------------------------------------------------------------
+// reg<->reg
+// ----------------------------------------------------------------------------
+
+template<unsigned nt, unsigned vt, typename T, unsigned S>
+__device__ auto cuda_reg_thread_to_strided(
+  cudaArray<T, vt> x, unsigned tid, T (&shared)[S]
+) {
+  cuda_reg_to_shared_thread<nt>(x, tid, shared);
+  return cuda_shared_to_reg_strided<nt, vt>(shared, tid);
+}
+
+template<unsigned nt, unsigned vt, typename T, unsigned S>
+__device__ auto cuda_reg_strided_to_thread(
+  cudaArray<T, vt> x, unsigned tid, T (&shared)[S]
+) {
+  cuda_reg_to_shared_strided<nt>(x, tid, shared);
+  return cuda_shared_to_reg_thread<nt, vt>(shared, tid);
+}
+
+// ----------------------------------------------------------------------------
+// cudaLoadStoreIterator
+// ----------------------------------------------------------------------------
+
+template<typename L, typename S, typename T, typename I>
+struct cudaLoadStoreIterator : std::iterator_traits<const T*> {
+
+  L load;
+  S store;
+  I base;
+
+  cudaLoadStoreIterator(L load_, S store_, I base_) :
+    load(load_), store(store_), base(base_) { }
+
+  struct assign_t {
+    L load;
+    S store;
+    I index;
+
+    __device__ assign_t& operator=(T rhs) {
+      static_assert(!std::is_same<S, cudaEmpty>::value,
+        "load_iterator is being stored to.");
+      store(rhs, index);
+      return *this;
+    }
+    __device__ operator T() const {
+      static_assert(!std::is_same<L, cudaEmpty>::value,
+        "store_iterator is being loaded from.");
+      return load(index);
+    }
+  };
+
+  __device__ assign_t operator[](I index) const {
+    return assign_t { load, store, base + index };
+  }
+
+  __device__ assign_t operator*() const {
+    return assign_t { load, store, base };
+  }
+
+  __device__ cudaLoadStoreIterator operator+(I offset) const {
+    cudaLoadStoreIterator cp = *this;
+    cp += offset;
+    return cp;
+  }
+
+  __device__ cudaLoadStoreIterator& operator+=(I offset) {
+    base += offset;
+    return *this;
+  }
+
+  __device__ cudaLoadStoreIterator operator-(I offset) const {
+    cudaLoadStoreIterator cp = *this;
+    cp -= offset;
+    return cp;
+  }
+
+  __device__ cudaLoadStoreIterator& operator-=(I offset) {
+    base -= offset;
+    return *this;
+  }
+};
+
+//template<typename T>
+//struct trivial_load_functor {
+//  template<typename I>
+//  __device__ T operator()(I index) const {
+//    return T();
+//  }
+//};
+
+//template<typename T>
+//struct trivial_store_functor {
+//  template<typename I>
+//  __device__ void operator()(T v, I index) const { }
+//};
+
+template <typename T, typename I = unsigned, typename L, typename S>
+auto cuda_make_load_store_iterator(L load, S store, I base = 0) {
+  return cudaLoadStoreIterator<L, S, T, I>(load, store, base);
+}
+
+template <typename T, typename I = unsigned, typename L>
+auto cuda_make_load_iterator(L load, I base = 0) {
+  return cuda_make_load_store_iterator<T>(load, cudaEmpty(), base);
+}
+
+template <typename T, typename I = unsigned, typename S>
+auto cuda_make_store_iterator(S store, I base = 0) {
+  return cuda_make_load_store_iterator<T>(cudaEmpty(), store, base);
+}
+
+// ----------------------------------------------------------------------------
+// swap
+// ----------------------------------------------------------------------------
+
+template<typename T>
+__device__ void cuda_swap(T& a, T& b) {
+  auto c = a;
+  a = b;
+  b = c;
+}
+
+// ----------------------------------------------------------------------------
+// launch kernel
+// ----------------------------------------------------------------------------
+
+template<typename F, typename... args_t>
+__global__ void cuda_kernel(F f, args_t... args) {
+  f(threadIdx.x, blockIdx.x, args...);
+}
+
+// ----------------------------------------------------------------------------
+// operators
+// ----------------------------------------------------------------------------
+
+template <class T>
+struct cuda_plus{
+  __device__ T operator()(T a, T b) const { return a + b; }
+};
+
+template <class T>
+struct cuda_minus{
+  __device__ T operator()(T a, T b) const { return a - b; }
+};
+
+template <class T>
+struct cuda_multiplies{
+  __device__ T operator()(T a, T b) const { return a * b; }
+};
+
+template <class T>
+struct cuda_maximum{
+  __device__ T operator()(T a, T b) const { return a > b ? a : b; }
+};
+
+template <class T>
+struct cuda_minimum{
+  __device__ T operator()(T a, T b) const { return a < b ? a : b; }
+};
+
+template <class T>
+struct cuda_less{
+  __device__ T operator()(T a, T b) const { return a < b; }
+};
+
+template <class T>
+struct cuda_greater{
+  __device__ T operator()(T a, T b) const { return a > b; }
+};
+
+}  // end of namespace tf -----------------------------------------------------
diff --git a/myxpcs/include/taskflow_/cuda/cuda_object.hpp b/myxpcs/include/taskflow_/cuda/cuda_object.hpp
new file mode 100644
index 0000000..e30d3a5
--- /dev/null
+++ b/myxpcs/include/taskflow_/cuda/cuda_object.hpp
@@ -0,0 +1,287 @@
+#pragma once
+
+#include "cuda_error.hpp"
+
+namespace tf {
+
+/**
+@brief per-thread object pool to manage CUDA device objects
+
+@tparam H object type
+@tparam C function object to create a library object
+@tparam D function object to delete a library object
+
+A CUDA device object has a lifetime associated with a device,
+for example, @c cudaStream_t, @c cublasHandle_t, etc.
+Creating a device object is typically expensive (e.g., 10-200 ms)
+and destroying it may trigger implicit device synchronization.
+For applications that intensively make use of device objects,
+it is desirable to reuse them as much as possible.
+
+There exists a one-to-one relationship between CUDA devices in the CUDA Runtime API
+and CUcontexts in the CUDA Driver API within a process.
+The specific context which the CUDA Runtime API uses for a device
+is called the device's primary context.
+From the perspective of the CUDA Runtime API,
+a device and its primary context are synonymous.
+
+We design the device object pool in a decentralized fashion by keeping
+(1) a global pool that tracks potentially reusable objects and
+(2) a per-thread pool that footprints objects acquired with shared ownership.
+The global pool does not own the objects and therefore never destructs any of them.
+The per-thread pool keeps the footprints of objects with shared ownership
+and destructs them if the thread holds the last reference count when it joins.
+The motivation for this decentralized control is to avoid destroying device objects
+after their context has already been torn down during driver shutdown.
+
+*/
+template <typename H, typename C, typename D>
+class cudaPerThreadDeviceObjectPool {
+
+  public:
+
+  /**
+  @brief structure to store a context object
+   */
+  struct Object {
+
+    int device;
+    H value;
+
+    Object(int);
+    ~Object();
+
+    Object(const Object&) = delete;
+    Object(Object&&) = delete;
+  };
+
+  private:
+
+  // The master thread holds the storage of the pool.
+  // Due to static destruction ordering, the CUDA context may already be
+  // destroyed by the time the master thread destroys its CUDA objects.
+  // Therefore, we use a decentralized approach: child threads destroy the
+  // CUDA objects they own, while the master thread only keeps weak references
+  // to those objects for reuse.
+  struct cudaGlobalDeviceObjectPool {
+
+    std::shared_ptr<Object> acquire(int);
+    void release(int, std::weak_ptr<Object>);
+
+    std::mutex mutex;
+    std::unordered_map<int, std::vector<std::weak_ptr<Object>>> pool;
+  };
+
+  public:
+
+    /**
+    @brief default constructor
+     */
+    cudaPerThreadDeviceObjectPool() = default;
+
+    /**
+    @brief acquires a device object with shared ownership
+     */
+    std::shared_ptr<Object> acquire(int);
+
+    /**
+    @brief releases a device object with moved ownership
+    */
+    void release(std::shared_ptr<Object>&&);
+
+    /**
+    @brief queries the number of device objects with shared ownership
+     */
+    size_t footprint_size() const;
+
+  private:
+
+    inline static cudaGlobalDeviceObjectPool _shared_pool;
+
+    std::unordered_set<std::shared_ptr<Object>> _footprint;
+};
+
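+// A minimal sketch of how this pool template is typically specialized (the
+// creator/deleter functor names below are illustrative, not part of this header):
+//
+//   struct StreamCreator {
+//     cudaStream_t operator () () const {
+//       cudaStream_t s;
+//       TF_CHECK_CUDA(cudaStreamCreate(&s), "failed to create a stream");
+//       return s;
+//     }
+//   };
+//   struct StreamDeleter {
+//     void operator () (cudaStream_t s) const { if(s) cudaStreamDestroy(s); }
+//   };
+//
+//   using StreamPool = tf::cudaPerThreadDeviceObjectPool<
+//     cudaStream_t, StreamCreator, StreamDeleter
+//   >;
+//
+//   StreamPool pool;
+//   auto obj = pool.acquire(0);      // stream on device 0 (reused if available)
+//   // ... use obj->value ...
+//   pool.release(std::move(obj));    // return it to the global pool
+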
+// ----------------------------------------------------------------------------
+// cudaPerThreadDeviceObjectPool::Object definition
+// ----------------------------------------------------------------------------
+
+template <typename H, typename C, typename D>
+cudaPerThreadDeviceObjectPool<H, C, D>::Object::Object(int d) :
+  device {d} {
+  cudaScopedDevice ctx(device);
+  value = C{}();
+}
+
+template <typename H, typename C, typename D>
+cudaPerThreadDeviceObjectPool<H, C, D>::Object::~Object() {
+  cudaScopedDevice ctx(device);
+  D{}(value);
+}
+
+// ----------------------------------------------------------------------------
+// cudaPerThreadDeviceObjectPool::cudaGlobalDeviceObjectPool definition
+// ----------------------------------------------------------------------------
+
+template <typename H, typename C, typename D>
+std::shared_ptr<typename cudaPerThreadDeviceObjectPool<H, C, D>::Object>
+cudaPerThreadDeviceObjectPool<H, C, D>::cudaGlobalDeviceObjectPool::acquire(int d) {
+  std::scoped_lock<std::mutex> lock(mutex);
+  if(auto itr = pool.find(d); itr != pool.end()) {
+    while(!itr->second.empty()) {
+      auto sptr = itr->second.back().lock();
+      itr->second.pop_back();
+      if(sptr) {
+        return sptr;
+      }
+    }
+  }
+  return nullptr;
+}
+
+template <typename H, typename C, typename D>
+void cudaPerThreadDeviceObjectPool<H, C, D>::cudaGlobalDeviceObjectPool::release(
+  int d, std::weak_ptr<Object> ptr
+) {
+  std::scoped_lock<std::mutex> lock(mutex);
+  pool[d].push_back(ptr);
+}
+
+// ----------------------------------------------------------------------------
+// cudaPerThreadDeviceObjectPool definition
+// ----------------------------------------------------------------------------
+
+template <typename H, typename C, typename D>
+std::shared_ptr<typename cudaPerThreadDeviceObjectPool<H, C, D>::Object>
+cudaPerThreadDeviceObjectPool<H, C, D>::acquire(int d) {
+
+  auto ptr = _shared_pool.acquire(d);
+
+  if(!ptr) {
+    ptr = std::make_shared<Object>(d);
+  }
+
+  return ptr;
+}
+
+template <typename H, typename C, typename D>
+void cudaPerThreadDeviceObjectPool<H, C, D>::release(
+  std::shared_ptr<Object>&& ptr
+) {
+  _shared_pool.release(ptr->device, ptr);
+  _footprint.insert(std::move(ptr));
+}
+
+template <typename H, typename C, typename D>
+size_t cudaPerThreadDeviceObjectPool<H, C, D>::footprint_size() const {
+  return _footprint.size();
+}
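+// A minimal usage sketch of the per-thread pool (illustrative only; the
+// creator/deleter functors streamCreator/streamDeleter below are hypothetical
+// placeholders, not types defined in this file):
+//
+//   using StreamPool =
+//     cudaPerThreadDeviceObjectPool<cudaStream_t, streamCreator, streamDeleter>;
+//
+//   thread_local StreamPool pool;
+//   auto obj = pool.acquire(0);      // reuse a pooled stream on device 0, or create one
+//   // ... use obj->value ...
+//   pool.release(std::move(obj));    // hand it back to the shared pool for reuse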
+
+// ----------------------------------------------------------------------------
+// cudaObject
+// ----------------------------------------------------------------------------
+
+/**
+@class cudaObject
+
+@brief class to create an RAII-styled and move-only wrapper for CUDA objects
+*/
+template <typename T, typename C, typename D>
+class cudaObject {
+  
+  public:
+
+  /**
+  @brief constructs a CUDA object from the given one
+  */
+  explicit cudaObject(T obj) : object(obj) {}
+  
+  /**
+  @brief constructs a new CUDA object
+  */
+  cudaObject() : object{ C{}() } {}
+    
+  /**
+  @brief disabled copy constructor
+  */
+  cudaObject(const cudaObject&) = delete;
+  
+  /**
+  @brief move constructor
+  */
+  cudaObject(cudaObject&& rhs) : object{rhs.object} {
+    rhs.object = nullptr;
+  }
+
+  /**
+  @brief destructs the CUDA object
+  */
+  ~cudaObject() { D{}(object); }
+  
+  /**
+  @brief disabled copy assignment
+  */
+  cudaObject& operator = (const cudaObject&) = delete;
+
+  /**
+  @brief move assignment
+  */
+  cudaObject& operator = (cudaObject&& rhs) {
+    D {} (object);
+    object = rhs.object;
+    rhs.object = nullptr;
+    return *this;
+  }
+  
+  /**
+  @brief implicit conversion to the native CUDA stream (cudaObject_t)
+
+  Returns the underlying stream of type @c cudaObject_t.
+  */
+  operator T () const {
+    return object;
+  }
+    
+  /**
+  @brief deletes the current CUDA object (if any) and creates a new one
+  */
+  void create() {
+    D {} (object);
+    object = C{}();
+  }
+  
+  /**
+  @brief resets this CUDA object to the given one
+  */
+  void reset(T new_obj) {
+    D {} (object);
+    object = new_obj;
+  }
+  
+  /**
+  @brief deletes the current CUDA object
+  */
+  void clear() {
+    reset(nullptr);
+  }
+
+  /**
+  @brief releases the ownership of the CUDA object
+  */
+  T release() {
+    auto tmp = object;
+    object = nullptr;
+    return tmp;
+  }
+  
+  protected:
+
+  /**
+  @brief the CUDA object
+  */
+  T object;
+};
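+// A minimal sketch of how this wrapper is specialized (cudaStreamCreator and
+// cudaStreamDeleter live in cuda_stream.hpp; the alias name is illustrative):
+//
+//   using StreamWrapper =
+//     cudaObject<cudaStream_t, cudaStreamCreator, cudaStreamDeleter>;
+//
+//   StreamWrapper s;              // creates a stream via cudaStreamCreator
+//   cudaStream_t native = s;      // implicit conversion to the native handle
+//   // the stream is destroyed by cudaStreamDeleter when s goes out of scope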
+
+}  // end of namespace tf -----------------------------------------------------
+
+
+
diff --git a/myxpcs/include/taskflow_/cuda/cuda_optimizer.hpp b/myxpcs/include/taskflow_/cuda/cuda_optimizer.hpp
new file mode 100644
index 0000000..60efed1
--- /dev/null
+++ b/myxpcs/include/taskflow_/cuda/cuda_optimizer.hpp
@@ -0,0 +1,404 @@
+#pragma once
+
+#include "cuda_graph.hpp"
+
+/**
+@file cuda_optimizer.hpp
+@brief %cudaFlow capturing algorithms include file
+*/
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// cudaFlowOptimizerBase
+// ----------------------------------------------------------------------------
+
+/**
+@private
+
+@brief class to provide helper common methods for optimization algorithms
+*/
+class cudaFlowOptimizerBase {
+
+  protected:
+
+    std::vector<cudaFlowNode*> _toposort(cudaFlowGraph&);
+    std::vector<std::vector<cudaFlowNode*>> _levelize(cudaFlowGraph&);
+};
+
+// Function: _toposort
+inline std::vector<cudaFlowNode*> cudaFlowOptimizerBase::_toposort(cudaFlowGraph& graph) {
+
+  std::vector<cudaFlowNode*> res;
+  std::queue<cudaFlowNode*> bfs;
+
+  res.reserve(graph._nodes.size());
+
+  // insert the first level of nodes into the queue
+  for(auto& u : graph._nodes) {
+
+    auto hu = std::get_if<cudaFlowNode::Capture>(&u->_handle);
+    hu->level = u->_dependents.size();
+
+    if(hu->level == 0) {
+      bfs.push(u.get());
+    }
+  }
+
+  // levelize the graph using bfs
+  while(!bfs.empty()) {
+
+    auto u = bfs.front();
+    bfs.pop();
+
+    res.push_back(u);
+
+    for(auto v : u->_successors) {
+      auto hv = std::get_if<cudaFlowNode::Capture>(&v->_handle);
+      if(--hv->level == 0) {
+        bfs.push(v);
+      }
+    }
+  }
+
+  return res;
+}
+
+// Function: _levelize
+inline std::vector<std::vector<cudaFlowNode*>>
+cudaFlowOptimizerBase::_levelize(cudaFlowGraph& graph) {
+
+  std::queue<cudaFlowNode*> bfs;
+
+  size_t max_level = 0;
+
+  // insert the first level of nodes into the queue
+  for(auto& u : graph._nodes) {
+
+    auto hu = std::get_if<cudaFlowNode::Capture>(&u->_handle);
+    hu->level = u->_dependents.size();
+
+    if(hu->level == 0) {
+      bfs.push(u.get());
+    }
+  }
+
+  // levelize the graph using bfs
+  while(!bfs.empty()) {
+
+    auto u = bfs.front();
+    bfs.pop();
+
+    auto hu = std::get_if<cudaFlowNode::Capture>(&u->_handle);
+
+    for(auto v : u->_successors) {
+      auto hv = std::get_if<cudaFlowNode::Capture>(&v->_handle);
+      if(--hv->level == 0) {
+        hv->level = hu->level + 1;
+        if(hv->level > max_level) {
+          max_level = hv->level;
+        }
+        bfs.push(v);
+      }
+    }
+  }
+
+  // set level_graph and each node's idx
+  std::vector<std::vector<cudaFlowNode*>> level_graph(max_level+1);
+  for(auto& u : graph._nodes) {
+    auto hu = std::get_if<cudaFlowNode::Capture>(&u->_handle);
+    hu->lid = level_graph[hu->level].size();
+    level_graph[hu->level].emplace_back(u.get());
+
+    //for(auto s : u->_successors) {
+    //  assert(hu.level < std::get_if<cudaFlowNode::Capture>(&s->_handle)->level);
+    //}
+  }
+
+  return level_graph;
+}
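+// Illustration: on a diamond graph where A precedes B and C, and both precede D,
+// the BFS above assigns level 0 to A, level 1 to B and C, and level 2 to D, i.e.,
+//
+//   level_graph == { {A}, {B, C}, {D} }
+//
+// and each node's lid is its index within its level (here B:0, C:1).  The
+// round-robin optimizer below uses lid to pick a capturing stream per node.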
+
+// ----------------------------------------------------------------------------
+// class definition: cudaFlowSequentialOptimizer
+// ----------------------------------------------------------------------------
+
+/**
+@class cudaFlowSequentialOptimizer
+
+@brief class to capture a CUDA graph using a sequential stream
+
+A sequential capturing algorithm finds a topological order of
+the described graph and captures dependent GPU tasks using a single stream.
+All GPU tasks run sequentially without breaking inter-task dependencies.
+*/
+class cudaFlowSequentialOptimizer : public cudaFlowOptimizerBase {
+
+  friend class cudaFlowCapturer;
+
+  public:
+
+    /**
+    @brief constructs a sequential optimizer
+    */
+    cudaFlowSequentialOptimizer() = default;
+
+  private:
+
+    cudaGraph_t _optimize(cudaFlowGraph& graph);
+};
+
+inline cudaGraph_t cudaFlowSequentialOptimizer::_optimize(cudaFlowGraph& graph) {
+
+  // acquire per-thread stream and turn it into capture mode
+  // we must use ThreadLocal mode to avoid clashing with CUDA global states
+  
+  cudaStream stream;
+
+  stream.begin_capture(cudaStreamCaptureModeThreadLocal);
+
+  auto ordered = _toposort(graph);
+  for(auto node : ordered) {
+    std::get_if<cudaFlowNode::Capture>(&node->_handle)->work(stream);
+  }
+  
+  return stream.end_capture();
+}
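+// Usage sketch (assuming tf::cudaFlowCapturer exposes make_optimizer as in
+// upstream Taskflow; the capture callable is a placeholder):
+//
+//   tf::cudaFlowCapturer capturer;
+//   capturer.make_optimizer<tf::cudaFlowSequentialOptimizer>();
+//   capturer.on([](cudaStream_t s){ /* enqueue work on s */ });
+//   cudaGraph_t g = capturer.capture();   // all tasks captured on one stream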
+
+// ----------------------------------------------------------------------------
+// class definition: cudaFlowLinearOptimizer
+// ----------------------------------------------------------------------------
+
+/**
+@class cudaFlowLinearOptimizer
+
+@brief class to capture a linear CUDA graph using a sequential stream
+
+A linear capturing algorithm is a special case of tf::cudaFlowSequentialOptimizer
+and assumes the input task graph to be a single linear chain of tasks
+(i.e., a straight line).
+This assumption allows faster optimization during the capturing process.
+If the input task graph is not a linear chain, the behavior is undefined.
+*/
+class cudaFlowLinearOptimizer : public cudaFlowOptimizerBase {
+
+  friend class cudaFlowCapturer;
+
+  public:
+
+    /**
+    @brief constructs a linear optimizer
+    */
+    cudaFlowLinearOptimizer() = default;
+
+  private:
+
+    cudaGraph_t _optimize(cudaFlowGraph& graph);
+};
+
+inline cudaGraph_t cudaFlowLinearOptimizer::_optimize(cudaFlowGraph& graph) {
+
+  // acquire per-thread stream and turn it into capture mode
+  // we must use ThreadLocal mode to avoid clashing with CUDA global states
+  cudaStream stream;
+
+  stream.begin_capture(cudaStreamCaptureModeThreadLocal);
+
+  // find the source node
+  cudaFlowNode* src {nullptr};
+  for(auto& u : graph._nodes) {
+    if(u->_dependents.size() == 0) {
+      src = u.get();
+      while(src) {
+        std::get_if<cudaFlowNode::Capture>(&src->_handle)->work(stream);
+        src = src->_successors.empty() ? nullptr : src->_successors[0];
+      }
+      break;
+    }
+    // ideally, there should be only one source
+  }
+
+  return stream.end_capture();
+}
+
+// ----------------------------------------------------------------------------
+// class definition: cudaFlowRoundRobinOptimizer
+// ----------------------------------------------------------------------------
+
+/**
+@class cudaFlowRoundRobinOptimizer
+
+@brief class to capture a CUDA graph using a round-robin algorithm
+
+A round-robin capturing algorithm levelizes the user-described graph
+and assigns streams to nodes in a round-robin order, level by level.
+The algorithm is based on the following paper published in Euro-Par 2021:
+  + Dian-Lun Lin and Tsung-Wei Huang, &quot;Efficient GPU Computation using %Task Graph Parallelism,&quot; <i>European Conference on Parallel and Distributed Computing (Euro-Par)</i>, 2021
+
+The round-robin optimization algorithm is best suited for large %cudaFlow graphs
+that comprise hundreds or thousands of GPU operations
+(e.g., kernels and memory copies), many of which can run in parallel.
+You can configure the number of streams used by the optimizer to adjust the
+maximum kernel concurrency in the captured CUDA graph.
+*/
+class cudaFlowRoundRobinOptimizer : public cudaFlowOptimizerBase {
+
+  friend class cudaFlowCapturer;
+
+  public:
+
+    /**
+    @brief constructs a round-robin optimizer with 4 streams by default
+     */
+    cudaFlowRoundRobinOptimizer() = default;
+
+    /**
+    @brief constructs a round-robin optimizer with the given number of streams
+     */
+    explicit cudaFlowRoundRobinOptimizer(size_t num_streams);
+    
+    /**
+    @brief queries the number of streams used by the optimizer
+     */
+    size_t num_streams() const;
+
+    /**
+    @brief sets the number of streams used by the optimizer
+     */
+    void num_streams(size_t n);
+
+  private:
+
+    size_t _num_streams {4};
+
+    cudaGraph_t _optimize(cudaFlowGraph& graph);
+
+    void _reset(std::vector<std::vector<cudaFlowNode*>>& graph);
+
+};
+
+// Constructor
+inline cudaFlowRoundRobinOptimizer::cudaFlowRoundRobinOptimizer(size_t num_streams) :
+  _num_streams {num_streams} {
+
+  if(num_streams == 0) {
+    TF_THROW("number of streams must be at least one");
+  }
+}
+
+// Function: num_streams
+inline size_t cudaFlowRoundRobinOptimizer::num_streams() const {
+  return _num_streams;
+}
+
+// Procedure: num_streams
+inline void cudaFlowRoundRobinOptimizer::num_streams(size_t n) {
+  if(n == 0) {
+    TF_THROW("number of streams must be at least one");
+  }
+  _num_streams = n;
+}
+
+inline void cudaFlowRoundRobinOptimizer::_reset(
+  std::vector<std::vector<cudaFlowNode*>>& graph
+) {
+  //level == global id
+  //idx == stream id we want to skip
+  size_t id{0};
+  for(auto& each_level: graph) {
+    for(auto& node: each_level) {
+      auto hn = std::get_if<cudaFlowNode::Capture>(&node->_handle);
+      hn->level = id++;
+      hn->idx = _num_streams;
+      hn->event = nullptr;
+    }
+  }
+}
+
+// Function: _optimize
+inline cudaGraph_t cudaFlowRoundRobinOptimizer::_optimize(cudaFlowGraph& graph) {
+
+  // levelize the graph
+  auto levelized = _levelize(graph);
+
+  // initialize the data structure
+  _reset(levelized);
+
+  // begin to capture
+  std::vector<cudaStream> streams(_num_streams);
+
+  streams[0].begin_capture(cudaStreamCaptureModeThreadLocal);
+  
+  // reserve space for scoped events
+  std::vector<cudaEvent> events;
+  events.reserve((_num_streams >> 1) + levelized.size());
+
+  // fork
+  cudaEvent_t fork_event = events.emplace_back();
+  streams[0].record(fork_event);
+
+  for(size_t i = 1; i < streams.size(); ++i) {
+    streams[i].wait(fork_event);
+  }
+
+  // assign streams to levelized nodes in a round-robin manner
+  for(auto& each_level: levelized) {
+    for(auto& node: each_level) {
+      auto hn = std::get_if<cudaFlowNode::Capture>(&node->_handle);
+      size_t sid = hn->lid % _num_streams;
+
+      //wait events
+      cudaFlowNode* wait_node{nullptr};
+      for(auto& pn: node->_dependents) {
+        auto phn = std::get_if<cudaFlowNode::Capture>(&pn->_handle);
+        size_t psid = phn->lid % _num_streams;
+
+        //level == global id
+        //idx == stream id we want to skip
+        if(psid == hn->idx) {
+          if(wait_node == nullptr ||
+             std::get_if<cudaFlowNode::Capture>(&wait_node->_handle)->level < phn->level) {
+            wait_node = pn;
+          }
+        }
+        else if(psid != sid) {
+          streams[sid].wait(phn->event);
+        }
+      }
+
+      if(wait_node != nullptr) {
+        assert(std::get_if<cudaFlowNode::Capture>(&wait_node->_handle)->event); 
+        streams[sid].wait(std::get_if<cudaFlowNode::Capture>(&wait_node->_handle)->event);
+      }
+
+      //capture
+      hn->work(streams[sid]);
+
+      //create/record stream
+      for(auto& sn: node->_successors) {
+        auto shn = std::get_if<cudaFlowNode::Capture>(&sn->_handle);
+        size_t ssid = shn->lid % _num_streams;
+        if(ssid != sid) {
+          if(!hn->event) {
+            hn->event = events.emplace_back();
+            streams[sid].record(hn->event);
+          }
+          //idx == stream id we want to skip
+          shn->idx = sid;
+        }
+      }
+    }
+  }
+
+  // join
+  for(size_t i=1; i<_num_streams; ++i) {
+    cudaEvent_t join_event = events.emplace_back();
+    streams[i].record(join_event);
+    streams[0].wait(join_event);
+  }
+
+  return streams[0].end_capture();
+}
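+// Usage sketch (assuming tf::cudaFlowCapturer exposes make_optimizer as in
+// upstream Taskflow): capture with 8 streams so that independent tasks at the
+// same level can overlap.
+//
+//   tf::cudaFlowCapturer capturer;
+//   capturer.make_optimizer<tf::cudaFlowRoundRobinOptimizer>(8);
+//   // ... describe capture tasks and dependencies on the capturer ...
+//   cudaGraph_t g = capturer.capture();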
+
+
+}  // end of namespace tf -----------------------------------------------------
+
diff --git a/myxpcs/include/taskflow_/cuda/cuda_stream.hpp b/myxpcs/include/taskflow_/cuda/cuda_stream.hpp
new file mode 100644
index 0000000..f3e48f1
--- /dev/null
+++ b/myxpcs/include/taskflow_/cuda/cuda_stream.hpp
@@ -0,0 +1,226 @@
+#pragma once
+
+#include "cuda_object.hpp"
+
+/**
+@file cuda_stream.hpp
+@brief CUDA stream utilities include file
+*/
+
+namespace tf {
+
+
+
+// ----------------------------------------------------------------------------
+// cudaStream
+// ----------------------------------------------------------------------------
+
+/**
+@private
+*/
+struct cudaStreamCreator {
+  cudaStream_t operator () () const {
+    cudaStream_t stream;
+    TF_CHECK_CUDA(cudaStreamCreate(&stream), "failed to create a CUDA stream");
+    return stream;
+  }
+};
+
+/**
+@private
+*/
+struct cudaStreamDeleter {
+  void operator () (cudaStream_t stream) const {
+    if(stream) {
+      cudaStreamDestroy(stream);
+    }
+  }
+};
+
+/**
+@class cudaStream
+
+@brief class to create an RAII-styled wrapper over a native CUDA stream
+
+A cudaStream object is an RAII-styled wrapper over a native CUDA stream
+(@c cudaStream_t).
+A cudaStream object is move-only.
+*/
+class cudaStream :
+  public cudaObject<cudaStream_t, cudaStreamCreator, cudaStreamDeleter> {
+  
+  public:
+
+    /**
+    @brief constructs an RAII-styled object from the given CUDA stream
+
+    Constructs a cudaStream object which owns @c stream.
+    */
+    explicit cudaStream(cudaStream_t stream) : cudaObject(stream) {
+    }
+    
+    /**
+    @brief default constructor
+    */
+    cudaStream() = default;
+    
+    /**
+    @brief synchronizes the associated stream
+
+    Equivalent to calling @c cudaStreamSynchronize to block
+    until this stream has completed all operations.
+    */
+    void synchronize() const {
+      TF_CHECK_CUDA(
+        cudaStreamSynchronize(object), "failed to synchronize a CUDA stream"
+      );
+    }
+    
+    /**
+    @brief begins graph capturing on the stream
+
+    When a stream is in capture mode, all operations pushed into the stream 
+    will not be executed, but will instead be captured into a graph, 
+    which will be returned via cudaStream::end_capture. 
+
+    A thread's mode can be one of the following:
+    + @c cudaStreamCaptureModeGlobal: This is the default mode. 
+      If the local thread has an ongoing capture sequence that was not initiated 
+      with @c cudaStreamCaptureModeRelaxed at @c cuStreamBeginCapture, 
+      or if any other thread has a concurrent capture sequence initiated with 
+      @c cudaStreamCaptureModeGlobal, this thread is prohibited from potentially 
+      unsafe API calls.
+
+    + @c cudaStreamCaptureModeThreadLocal: If the local thread has an ongoing capture 
+      sequence not initiated with @c cudaStreamCaptureModeRelaxed, 
+      it is prohibited from potentially unsafe API calls. 
+      Concurrent capture sequences in other threads are ignored.
+
+    + @c cudaStreamCaptureModeRelaxed: The local thread is not prohibited 
+      from potentially unsafe API calls. Note that the thread is still prohibited 
+      from API calls which necessarily conflict with stream capture, for example, 
+      attempting @c cudaEventQuery on an event that was last recorded 
+      inside a capture sequence.
+    */
+    void begin_capture(cudaStreamCaptureMode m = cudaStreamCaptureModeGlobal) const {
+      TF_CHECK_CUDA(
+        cudaStreamBeginCapture(object, m), 
+        "failed to begin capture on stream ", object, " with thread mode ", m
+      );
+    }
+
+    /**
+    @brief ends graph capturing on the stream
+    
+    Equivalent to calling @c cudaStreamEndCapture to
+    end capture on the stream and return the captured graph.
+    Capture must have been initiated on stream via a call to cudaStream::begin_capture. 
+    If capture was invalidated, due to a violation of the rules of stream capture, 
+    then a NULL graph will be returned.
+    */
+    cudaGraph_t end_capture() const {
+      cudaGraph_t native_g;
+      TF_CHECK_CUDA(
+        cudaStreamEndCapture(object, &native_g), 
+        "failed to end capture on stream ", object
+      );
+      return native_g;
+    }
+    
+    /**
+    @brief records an event on the stream
+
+    Equivalent to calling @c cudaEventRecord to record an event on this stream;
+    the event and the stream must belong to the same CUDA context.
+    */
+    void record(cudaEvent_t event) const {
+      TF_CHECK_CUDA(
+        cudaEventRecord(event, object), 
+        "failed to record event ", event, " on stream ", object
+      );
+    }
+    
+    /**
+    @brief waits on an event
+
+    Equivalent to calling @c cudaStreamWaitEvent to make all future work
+    submitted to this stream wait for all work captured in the event.
+    */
+    void wait(cudaEvent_t event) const {
+      TF_CHECK_CUDA(
+        cudaStreamWaitEvent(object, event, 0), 
+        "failed to wait for event ", event, " on stream ", object
+      );
+    }
+};
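+// A minimal capture sketch with the wrapper (my_kernel and its launch
+// configuration are hypothetical placeholders):
+//
+//   tf::cudaStream stream;
+//   stream.begin_capture(cudaStreamCaptureModeThreadLocal);
+//   // my_kernel<<<grid, block, 0, stream>>>(args...);  // captured, not executed
+//   cudaGraph_t graph = stream.end_capture();           // NULL if capture was invalidated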
+
+// ----------------------------------------------------------------------------
+// cudaEvent
+// ----------------------------------------------------------------------------
+  
+/**
+@private
+*/
+struct cudaEventCreator {
+
+  cudaEvent_t operator () () const {
+    cudaEvent_t event;
+    TF_CHECK_CUDA(cudaEventCreate(&event), "failed to create a CUDA event");
+    return event;
+  }
+  
+  cudaEvent_t operator () (unsigned int flag) const {
+    cudaEvent_t event;
+    TF_CHECK_CUDA(
+      cudaEventCreateWithFlags(&event, flag),
+      "failed to create a CUDA event with flag=", flag
+    );
+    return event;
+  }
+};
+
+/**
+@private
+*/
+struct cudaEventDeleter {
+  void operator () (cudaEvent_t event) const {
+    cudaEventDestroy(event);
+  }
+};
+
+/**
+@class cudaEvent
+
+@brief class to create an RAII-styled wrapper over a native CUDA event
+
+A cudaEvent object is an RAII-styled wrapper over a native CUDA event 
+(@c cudaEvent_t).
+A cudaEvent object is move-only.
+*/
+class cudaEvent :
+  public cudaObject<cudaEvent_t, cudaEventCreator, cudaEventDeleter> {
+
+  public:
+
+    /**
+    @brief constructs an RAII-styled CUDA event object from the given CUDA event
+    */
+    explicit cudaEvent(cudaEvent_t event) : cudaObject(event) { }   
+
+    /**
+    @brief constructs an RAII-styled CUDA event object
+    */
+    cudaEvent() = default;
+    
+    /**
+    @brief constructs an RAII-styled CUDA event object with the given flag
+    */
+    explicit cudaEvent(unsigned int flag) : cudaObject(cudaEventCreator{}(flag)) { }
+};
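+// A minimal cross-stream synchronization sketch using the two wrappers above:
+//
+//   tf::cudaStream s0, s1;
+//   tf::cudaEvent  e;
+//   s0.record(e);        // mark the completion point of prior work on s0
+//   s1.wait(e);          // future work on s1 waits for that point
+//   s1.synchronize();    // block the host until s1 drains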
+
+
+}  // end of namespace tf -----------------------------------------------------
+
+
+
diff --git a/myxpcs/include/taskflow_/cuda/cuda_task.hpp b/myxpcs/include/taskflow_/cuda/cuda_task.hpp
new file mode 100644
index 0000000..92fac9c
--- /dev/null
+++ b/myxpcs/include/taskflow_/cuda/cuda_task.hpp
@@ -0,0 +1,274 @@
+#pragma once
+
+#include "cuda_graph.hpp"
+
+/**
+@file cuda_task.hpp
+@brief cudaTask include file
+*/
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// cudaTask Types
+// ----------------------------------------------------------------------------
+
+/**
+@enum cudaTaskType
+
+@brief enumeration of all %cudaTask types
+*/
+enum class cudaTaskType : int {
+  /** @brief empty task type */
+  EMPTY = 0,
+  /** @brief host task type */
+  HOST,
+  /** @brief memory set task type */
+  MEMSET,
+  /** @brief memory copy task type */
+  MEMCPY,
+  /** @brief kernel task type */
+  KERNEL,
+  /** @brief subflow (child graph) task type */
+  SUBFLOW,
+  /** @brief capture task type */
+  CAPTURE,
+  /** @brief undefined task type */
+  UNDEFINED
+};
+
+/**
+@brief converts a cudaTaskType to a human-readable string
+*/
+constexpr const char* to_string(cudaTaskType type) {
+  switch(type) {
+    case cudaTaskType::EMPTY:   return "empty";
+    case cudaTaskType::HOST:    return "host";
+    case cudaTaskType::MEMSET:  return "memset";
+    case cudaTaskType::MEMCPY:  return "memcpy";
+    case cudaTaskType::KERNEL:  return "kernel";
+    case cudaTaskType::SUBFLOW: return "subflow";
+    case cudaTaskType::CAPTURE: return "capture";
+    default:                    return "undefined";
+  }
+}
+
+// ----------------------------------------------------------------------------
+// cudaTask
+// ----------------------------------------------------------------------------
+
+/**
+@class cudaTask
+
+@brief class to create a task handle over an internal node of a %cudaFlow graph
+*/
+class cudaTask {
+
+  friend class cudaFlow;
+  friend class cudaFlowCapturer;
+  friend class cudaFlowCapturerBase;
+
+  friend std::ostream& operator << (std::ostream&, const cudaTask&);
+
+  public:
+
+    /**
+    @brief constructs an empty cudaTask
+    */
+    cudaTask() = default;
+
+    /**
+    @brief copy-constructs a cudaTask
+    */
+    cudaTask(const cudaTask&) = default;
+
+    /**
+    @brief copy-assigns a cudaTask
+    */
+    cudaTask& operator = (const cudaTask&) = default;
+
+    /**
+    @brief adds precedence links from this to other tasks
+
+    @tparam Ts parameter pack
+
+    @param tasks one or multiple tasks
+
+    @return @c *this
+    */
+    template <typename... Ts>
+    cudaTask& precede(Ts&&... tasks);
+
+    /**
+    @brief adds precedence links from other tasks to this
+
+    @tparam Ts parameter pack
+
+    @param tasks one or multiple tasks
+
+    @return @c *this
+    */
+    template <typename... Ts>
+    cudaTask& succeed(Ts&&... tasks);
+
+    /**
+    @brief assigns a name to the task
+
+    @param name a @std_string acceptable string
+
+    @return @c *this
+    */
+    cudaTask& name(const std::string& name);
+
+    /**
+    @brief queries the name of the task
+    */
+    const std::string& name() const;
+
+    /**
+    @brief queries the number of successors
+    */
+    size_t num_successors() const;
+
+    /**
+    @brief queries the number of dependents
+    */
+    size_t num_dependents() const;
+
+    /**
+    @brief queries if the task is empty (i.e., not associated with a cudaFlowNode)
+    */
+    bool empty() const;
+
+    /**
+    @brief queries the task type
+    */
+    cudaTaskType type() const;
+
+    /**
+    @brief dumps the task through an output stream
+
+    @tparam T output stream type with insertion operator (<<) defined
+    @param ostream an output stream target
+    */
+    template <typename T>
+    void dump(T& ostream) const;
+
+    /**
+    @brief applies a visitor callable to each successor of the task
+    */
+    template <typename V>
+    void for_each_successor(V&& visitor) const;
+
+    /**
+    @brief applies a visitor callable to each dependent of the task
+    */
+    template <typename V>
+    void for_each_dependent(V&& visitor) const;
+
+  private:
+
+    cudaTask(cudaFlowNode*);
+
+    cudaFlowNode* _node {nullptr};
+};
+
+// Constructor
+inline cudaTask::cudaTask(cudaFlowNode* node) : _node {node} {
+}
+
+// Function: precede
+template <typename... Ts>
+cudaTask& cudaTask::precede(Ts&&... tasks) {
+  (_node->_precede(tasks._node), ...);
+  return *this;
+}
+
+// Function: succeed
+template <typename... Ts>
+cudaTask& cudaTask::succeed(Ts&&... tasks) {
+  (tasks._node->_precede(_node), ...);
+  return *this;
+}
+
+// Function: empty
+inline bool cudaTask::empty() const {
+  return _node == nullptr;
+}
+
+// Function: name
+inline cudaTask& cudaTask::name(const std::string& name) {
+  _node->_name = name;
+  return *this;
+}
+
+// Function: name
+inline const std::string& cudaTask::name() const {
+  return _node->_name;
+}
+
+// Function: num_successors
+inline size_t cudaTask::num_successors() const {
+  return _node->_successors.size();
+}
+
+// Function: num_dependents
+inline size_t cudaTask::num_dependents() const {
+  return _node->_dependents.size();
+}
+
+// Function: type
+inline cudaTaskType cudaTask::type() const {
+  switch(_node->_handle.index()) {
+    case cudaFlowNode::EMPTY:   return cudaTaskType::EMPTY;
+    case cudaFlowNode::HOST:    return cudaTaskType::HOST;
+    case cudaFlowNode::MEMSET:  return cudaTaskType::MEMSET;
+    case cudaFlowNode::MEMCPY:  return cudaTaskType::MEMCPY;
+    case cudaFlowNode::KERNEL:  return cudaTaskType::KERNEL;
+    case cudaFlowNode::SUBFLOW: return cudaTaskType::SUBFLOW;
+    case cudaFlowNode::CAPTURE: return cudaTaskType::CAPTURE;
+    default:                    return cudaTaskType::UNDEFINED;
+  }
+}
+
+// Procedure: dump
+template <typename T>
+void cudaTask::dump(T& os) const {
+  os << "cudaTask ";
+  if(_node->_name.empty()) os << _node;
+  else os << _node->_name;
+  os << " [type=" << to_string(type()) << ']';
+}
+
+// Function: for_each_successor
+template <typename V>
+void cudaTask::for_each_successor(V&& visitor) const {
+  for(size_t i=0; i<_node->_successors.size(); ++i) {
+    visitor(cudaTask(_node->_successors[i]));
+  }
+}
+
+// Function: for_each_dependent
+template <typename V>
+void cudaTask::for_each_dependent(V&& visitor) const {
+  for(size_t i=0; i<_node->_dependents.size(); ++i) {
+    visitor(cudaTask(_node->_dependents[i]));
+  }
+}
+
+// ----------------------------------------------------------------------------
+// global ostream
+// ----------------------------------------------------------------------------
+
+/**
+@brief overload of ostream inserter operator for cudaTask
+*/
+inline std::ostream& operator << (std::ostream& os, const cudaTask& ct) {
+  ct.dump(os);
+  return os;
+}
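+// A small usage sketch (the tasks come from a cudaFlow or cudaFlowCapturer
+// defined in the sibling headers; cf, d2h, d_res, and bytes are placeholders):
+//
+//   tf::cudaTask a = cf.kernel(grid, block, 0, my_kernel, args...).name("gemm");
+//   tf::cudaTask b = cf.memcpy(d2h, d_res, bytes);
+//   a.precede(b);                 // a runs before b
+//   std::cout << a << '\n';       // prints "cudaTask gemm [type=kernel]"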
+
+}  // end of namespace tf -----------------------------------------------------
+
+
+
diff --git a/myxpcs/include/taskflow_/cuda/cudaflow.hpp b/myxpcs/include/taskflow_/cuda/cudaflow.hpp
new file mode 100644
index 0000000..61d5c84
--- /dev/null
+++ b/myxpcs/include/taskflow_/cuda/cudaflow.hpp
@@ -0,0 +1,1024 @@
+#pragma once
+
+#include "../taskflow.hpp"
+#include "cuda_task.hpp"
+#include "cuda_capturer.hpp"
+
+/**
+@file taskflow/cuda/cudaflow.hpp
+@brief cudaFlow include file
+*/
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// class definition: cudaFlow
+// ----------------------------------------------------------------------------
+
+/**
+@class cudaFlow
+
+@brief class to create a %cudaFlow task dependency graph
+
+A %cudaFlow is a high-level interface over CUDA Graph to perform GPU operations
+using the task dependency graph model.
+The class provides a set of methods for creating and launching different tasks
+on one or multiple CUDA devices,
+for instance, kernel tasks, data transfer tasks, and memory operation tasks.
+The following example creates a %cudaFlow of two kernel tasks, @c task1 and
+@c task2, where @c task1 runs before @c task2.
+
+@code{.cpp}
+tf::Taskflow taskflow;
+tf::Executor executor;
+
+taskflow.emplace([&](tf::cudaFlow& cf){
+  // create two kernel tasks
+  tf::cudaTask task1 = cf.kernel(grid1, block1, shm_size1, kernel1, args1);
+  tf::cudaTask task2 = cf.kernel(grid2, block2, shm_size2, kernel2, args2);
+
+  // kernel1 runs before kernel2
+  task1.precede(task2);
+});
+
+executor.run(taskflow).wait();
+@endcode
+
+A %cudaFlow is a task (tf::Task) created from tf::Taskflow
+and will be run by @em one worker thread in the executor.
+That is, the callable that describes a %cudaFlow
+will be executed sequentially.
+Inside a %cudaFlow task, different GPU tasks (tf::cudaTask) may run
+in parallel scheduled by the CUDA runtime.
+
+Please refer to @ref GPUTaskingcudaFlow for details.
+*/
+class cudaFlow {
+  
+  public:
+
+    /**
+    @brief constructs a %cudaFlow
+    */
+    cudaFlow();
+
+    /**
+    @brief destroys the %cudaFlow and its associated native CUDA graph
+           and executable graph
+     */
+    ~cudaFlow() = default;
+
+    /**
+    @brief default move constructor
+    */
+    cudaFlow(cudaFlow&&) = default;
+    
+    /**
+    @brief default move assignment operator
+    */
+    cudaFlow& operator = (cudaFlow&&) = default;
+
+    /**
+    @brief queries the emptiness of the graph
+    */
+    bool empty() const;
+
+    /**
+    @brief queries the number of tasks
+    */
+    size_t num_tasks() const;
+
+    /**
+    @brief clears the %cudaFlow object
+    */
+    void clear();
+
+    /**
+    @brief dumps the %cudaFlow graph into a DOT format through an
+           output stream
+    */
+    void dump(std::ostream& os) const;
+
+    /**
+    @brief dumps the native CUDA graph into a DOT format through an
+           output stream
+
+    The native CUDA graph may be different from the upper-level %cudaFlow
+    graph when flow capture is involved.
+    */
+    void dump_native_graph(std::ostream& os) const;
+
+    // ------------------------------------------------------------------------
+    // Graph building routines
+    // ------------------------------------------------------------------------
+
+    /**
+    @brief creates a no-operation task
+
+    @return a tf::cudaTask handle
+
+    An empty node performs no operation during execution,
+    but can be used for transitive ordering.
+    For example, a phased execution graph with 2 groups of @c n nodes
+    with a barrier between them can be represented using an empty node
+    and @c 2*n dependency edges,
+    rather than no empty node and @c n^2 dependency edges.
+    */
+    cudaTask noop();
+
+    /**
+    @brief creates a host task that runs a callable on the host
+
+    @tparam C callable type
+
+    @param callable a callable object with neither arguments nor return
+    (i.e., constructible from @c std::function<void()>)
+
+    @return a tf::cudaTask handle
+
+    A host task can only execute CPU-specific functions and cannot do any CUDA calls
+    (e.g., @c cudaMalloc).
+    */
+    template <typename C>
+    cudaTask host(C&& callable);
+
+    /**
+    @brief updates parameters of a host task
+
+    The method is similar to tf::cudaFlow::host but operates on a task
+    of type tf::cudaTaskType::HOST.
+    */
+    template <typename C>
+    void host(cudaTask task, C&& callable);
+
+    /**
+    @brief creates a kernel task
+
+    @tparam F kernel function type
+    @tparam ArgsT kernel function parameters type
+
+    @param g configured grid
+    @param b configured block
+    @param s configured shared memory size in bytes
+    @param f kernel function
+    @param args arguments to forward to the kernel function by copy
+
+    @return a tf::cudaTask handle
+    */
+    template <typename F, typename... ArgsT>
+    cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args);
+
+    /**
+    @brief updates parameters of a kernel task
+
+    The method is similar to tf::cudaFlow::kernel but operates on a task
+    of type tf::cudaTaskType::KERNEL.
+    The kernel function name must NOT change.
+    */
+    template <typename F, typename... ArgsT>
+    void kernel(
+      cudaTask task, dim3 g, dim3 b, size_t shm, F f, ArgsT... args
+    );
+
+    /**
+    @brief creates a memset task that fills untyped data with a byte value
+
+    @param dst pointer to the destination device memory area
+    @param v value to set for each byte of specified memory
+    @param count size in bytes to set
+
+    @return a tf::cudaTask handle
+
+    A memset task fills the first @c count bytes of device memory area
+    pointed by @c dst with the byte value @c v.
+    */
+    cudaTask memset(void* dst, int v, size_t count);
+
+    /**
+    @brief updates parameters of a memset task
+
+    The method is similar to tf::cudaFlow::memset but operates on a task
+    of type tf::cudaTaskType::MEMSET.
+    The source/destination memory may have different address values but
+    must be allocated from the same contexts as the original
+    source/destination memory.
+    */
+    void memset(cudaTask task, void* dst, int ch, size_t count);
+
+    /**
+    @brief creates a memcpy task that copies untyped data in bytes
+
+    @param tgt pointer to the target memory block
+    @param src pointer to the source memory block
+    @param bytes bytes to copy
+
+    @return a tf::cudaTask handle
+
+    A memcpy task transfers @c bytes of data from a source location
+    to a target location. Direction can be arbitrary among CPUs and GPUs.
+    */
+    cudaTask memcpy(void* tgt, const void* src, size_t bytes);
+
+    /**
+    @brief updates parameters of a memcpy task
+
+    The method is similar to tf::cudaFlow::memcpy but operates on a task
+    of type tf::cudaTaskType::MEMCPY.
+    The source/destination memory may have different address values but
+    must be allocated from the same contexts as the original
+    source/destination memory.
+    */
+    void memcpy(cudaTask task, void* tgt, const void* src, size_t bytes);
+
+    /**
+    @brief creates a memset task that sets a typed memory block to zero
+
+    @tparam T element type (size of @c T must be either 1, 2, or 4)
+    @param dst pointer to the destination device memory area
+    @param count number of elements
+
+    @return a tf::cudaTask handle
+
+    A zero task zeroes the first @c count elements of type @c T
+    in a device memory area pointed by @c dst.
+    */
+    template <typename T, std::enable_if_t<
+      is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
+    >
+    cudaTask zero(T* dst, size_t count);
+
+    /**
+    @brief updates parameters of a memset task to a zero task
+
+    The method is similar to tf::cudaFlow::zero but operates on
+    a task of type tf::cudaTaskType::MEMSET.
+
+    The source/destination memory may have different address values but
+    must be allocated from the same contexts as the original
+    source/destination memory.
+    */
+    template <typename T, std::enable_if_t<
+      is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
+    >
+    void zero(cudaTask task, T* dst, size_t count);
+
+    /**
+    @brief creates a memset task that fills a typed memory block with a value
+
+    @tparam T element type (size of @c T must be either 1, 2, or 4)
+
+    @param dst pointer to the destination device memory area
+    @param value value to fill for each element of type @c T
+    @param count number of elements
+
+    @return a tf::cudaTask handle
+
+    A fill task fills the first @c count elements of type @c T with @c value
+    in a device memory area pointed by @c dst.
+    The value to fill is interpreted in type @c T rather than byte.
+    */
+    template <typename T, std::enable_if_t<
+      is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
+    >
+    cudaTask fill(T* dst, T value, size_t count);
+
+    /**
+    @brief updates parameters of a memset task to a fill task
+
+    The method is similar to tf::cudaFlow::fill but operates on a task
+    of type tf::cudaTaskType::MEMSET.
+
+    The source/destination memory may have different address values but
+    must be allocated from the same contexts as the original
+    source/destination memory.
+    */
+    template <typename T, std::enable_if_t<
+      is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
+    >
+    void fill(cudaTask task, T* dst, T value, size_t count);
+
+    /**
+    @brief creates a memcopy task that copies typed data
+
+    @tparam T element type (non-void)
+
+    @param tgt pointer to the target memory block
+    @param src pointer to the source memory block
+    @param num number of elements to copy
+
+    @return a tf::cudaTask handle
+
+    A copy task transfers <tt>num*sizeof(T)</tt> bytes of data from a source location
+    to a target location. Direction can be arbitrary among CPUs and GPUs.
+    */
+    template <typename T,
+      std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
+    >
+    cudaTask copy(T* tgt, const T* src, size_t num);
+
+    /**
+    @brief updates parameters of a memcpy task to a copy task
+
+    The method is similar to tf::cudaFlow::copy but operates on a task
+    of type tf::cudaTaskType::MEMCPY.
+    The source/destination memory may have different address values but
+    must be allocated from the same contexts as the original
+    source/destination memory.
+    */
+    template <typename T,
+      std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
+    >
+    void copy(cudaTask task, T* tgt, const T* src, size_t num);
+
+    // ------------------------------------------------------------------------
+    // run method
+    // ------------------------------------------------------------------------
+    /**
+    @brief offloads the %cudaFlow onto a GPU asynchronously via a stream
+
+    @param stream stream for performing this operation
+
+    Offloads the present %cudaFlow onto a GPU asynchronously via
+    the given stream.
+
+    An offloaded %cudaFlow forces the underlying graph to be instantiated.
+    After the instantiation, you should not modify the graph topology;
+    you may only update node parameters.
+    */
+    void run(cudaStream_t stream);
+
+    /**
+    @brief acquires a reference to the underlying CUDA graph
+    */
+    cudaGraph_t native_graph();
+
+    /**
+    @brief acquires a reference to the underlying CUDA graph executable
+    */
+    cudaGraphExec_t native_executable();
+
+    // ------------------------------------------------------------------------
+    // generic algorithms
+    // ------------------------------------------------------------------------
+
+    /**
+    @brief runs a callable with only a single kernel thread
+
+    @tparam C callable type
+
+    @param c callable to run by a single kernel thread
+
+    @return a tf::cudaTask handle
+    */
+    template <typename C>
+    cudaTask single_task(C c);
+
+    /**
+    @brief updates a single-threaded kernel task
+
+    This method is similar to cudaFlow::single_task but operates
+    on an existing task.
+    */
+    template <typename C>
+    void single_task(cudaTask task, C c);
+
+    /**
+    @brief applies a callable to each dereferenced element of the data array
+
+    @tparam I iterator type
+    @tparam C callable type
+
+    @param first iterator to the beginning (inclusive)
+    @param last iterator to the end (exclusive)
+    @param callable a callable object to apply to the dereferenced iterator
+
+    @return a tf::cudaTask handle
+
+    This method is equivalent to the parallel execution of the following loop on a GPU:
+
+    @code{.cpp}
+    for(auto itr = first; itr != last; itr++) {
+      callable(*itr);
+    }
+    @endcode
+    */
+    template <typename I, typename C>
+    cudaTask for_each(I first, I last, C callable);
+
+    /**
+    @brief updates parameters of a kernel task created from
+           tf::cudaFlow::for_each
+
+    The type of the iterators and the callable must be the same as
+    the task created from tf::cudaFlow::for_each.
+    */
+    template <typename I, typename C>
+    void for_each(cudaTask task, I first, I last, C callable);
+
+    /**
+    @brief applies a callable to each index in the range with the step size
+
+    @tparam I index type
+    @tparam C callable type
+
+    @param first beginning index
+    @param last last index
+    @param step step size
+    @param callable the callable to apply to each element in the data array
+
+    @return a tf::cudaTask handle
+
+    This method is equivalent to the parallel execution of the following loop on a GPU:
+
+    @code{.cpp}
+    // step is positive [first, last)
+    for(auto i=first; i<last; i+=step) {
+      callable(i);
+    }
+
+    // step is negative [first, last)
+    for(auto i=first; i>last; i+=step) {
+      callable(i);
+    }
+    @endcode
+    */
+    template <typename I, typename C>
+    cudaTask for_each_index(I first, I last, I step, C callable);
+
+    /**
+    @brief updates parameters of a kernel task created from
+           tf::cudaFlow::for_each_index
+
+    The type of the iterators and the callable must be the same as
+    the task created from tf::cudaFlow::for_each_index.
+    */
+    template <typename I, typename C>
+    void for_each_index(
+      cudaTask task, I first, I last, I step, C callable
+    );
+
+    /**
+    @brief applies a callable to a source range and stores the result in a target range
+
+    @tparam I input iterator type
+    @tparam O output iterator type
+    @tparam C unary operator type
+
+    @param first iterator to the beginning of the input range
+    @param last iterator to the end of the input range
+    @param output iterator to the beginning of the output range
+    @param op the operator to apply to transform each element in the range
+
+    @return a tf::cudaTask handle
+
+    This method is equivalent to the parallel execution of the following loop on a GPU:
+
+    @code{.cpp}
+    while (first != last) {
+      *output++ = callable(*first++);
+    }
+    @endcode
+    */
+    template <typename I, typename O, typename C>
+    cudaTask transform(I first, I last, O output, C op);
+
+    /**
+    @brief updates parameters of a kernel task created from
+           tf::cudaFlow::transform
+
+    The type of the iterators and the callable must be the same as
+    the task created from tf::cudaFlow::transform.
+    */
+    template <typename I, typename O, typename C>
+    void transform(cudaTask task, I first, I last, O output, C c);
+
+    /**
+    @brief creates a task to perform parallel transforms over two ranges of items
+
+    @tparam I1 first input iterator type
+    @tparam I2 second input iterator type
+    @tparam O output iterator type
+    @tparam C unary operator type
+
+    @param first1 iterator to the beginning of the input range
+    @param last1 iterator to the end of the input range
+    @param first2 iterator to the beginning of the second input range
+    @param output iterator to the beginning of the output range
+    @param op binary operator to apply to transform each pair of items in the
+              two input ranges
+
+    @return cudaTask handle
+
+    This method is equivalent to the parallel execution of the following loop on a GPU:
+
+    @code{.cpp}
+    while (first1 != last1) {
+      *output++ = op(*first1++, *first2++);
+    }
+    @endcode
+    */
+    template <typename I1, typename I2, typename O, typename C>
+    cudaTask transform(I1 first1, I1 last1, I2 first2, O output, C op);
+
+    /**
+    @brief updates parameters of a kernel task created from
+           tf::cudaFlow::transform
+
+    The type of the iterators and the callable must be the same as
+    the task created from tf::cudaFlow::transform.
+    */
+    template <typename I1, typename I2, typename O, typename C>
+    void transform(
+      cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c
+    );
+
+    // ------------------------------------------------------------------------
+    // subflow
+    // ------------------------------------------------------------------------
+
+    /**
+    @brief constructs a subflow graph through tf::cudaFlowCapturer
+
+    @tparam C callable type constructible from
+              @c std::function<void(tf::cudaFlowCapturer&)>
+
+    @param callable the callable to construct a capture flow
+
+    @return a tf::cudaTask handle
+
+    A captured subflow forms a sub-graph to the %cudaFlow and can be used to
+    capture custom (or third-party) kernels that cannot be directly constructed
+    from the %cudaFlow.
+
+    Example usage:
+
+    @code{.cpp}
+    taskflow.emplace([&](tf::cudaFlow& cf){
+
+      tf::cudaTask my_kernel = cf.kernel(my_arguments);
+
+      // create a flow capturer to capture custom kernels
+      tf::cudaTask my_subflow = cf.capture([&](tf::cudaFlowCapturer& capturer){
+        capturer.on([&](cudaStream_t stream){
+          invoke_custom_kernel_with_stream(stream, custom_arguments);
+        });
+      });
+
+      my_kernel.precede(my_subflow);
+    });
+    @endcode
+    */
+    template <typename C>
+    cudaTask capture(C&& callable);
+
+    /**
+    @brief updates the captured child graph
+
+    The method is similar to tf::cudaFlow::capture but operates on a task
+    of type tf::cudaTaskType::SUBFLOW.
+    The new captured graph must be topologically identical to the original
+    captured graph.
+    */
+    template <typename C>
+    void capture(cudaTask task, C callable);
+
+  private:
+
+    cudaFlowGraph _cfg;
+    cudaGraphExec _exe {nullptr};
+};
+
+// Construct a standalone cudaFlow
+inline cudaFlow::cudaFlow() {
+  _cfg._native_handle.create();
+}
+
+// Procedure: clear
+inline void cudaFlow::clear() {
+  _exe.clear();
+  _cfg.clear();
+  _cfg._native_handle.create();
+}
+
+// Function: empty
+inline bool cudaFlow::empty() const {
+  return _cfg._nodes.empty();
+}
+
+// Function: num_tasks
+inline size_t cudaFlow::num_tasks() const {
+  return _cfg._nodes.size();
+}
+
+// Procedure: dump
+inline void cudaFlow::dump(std::ostream& os) const {
+  _cfg.dump(os, nullptr, "");
+}
+
+// Procedure: dump_native_graph
+inline void cudaFlow::dump_native_graph(std::ostream& os) const {
+  cuda_dump_graph(os, _cfg._native_handle);
+}
+
+// ----------------------------------------------------------------------------
+// Graph building methods
+// ----------------------------------------------------------------------------
+
+// Function: noop
+inline cudaTask cudaFlow::noop() {
+
+  auto node = _cfg.emplace_back(
+    _cfg, std::in_place_type_t<cudaFlowNode::Empty>{}
+  );
+
+  TF_CHECK_CUDA(
+    cudaGraphAddEmptyNode(
+      &node->_native_handle, _cfg._native_handle, nullptr, 0
+    ),
+    "failed to create a no-operation (empty) node"
+  );
+
+  return cudaTask(node);
+}
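+// A sketch of the barrier pattern described above: one empty node joins a
+// group of producer tasks to a group of consumer tasks with 2*n edges
+// (the producers/consumers containers are illustrative):
+//
+//   tf::cudaTask barrier = cf.noop();
+//   for(auto& p : producers) p.precede(barrier);
+//   for(auto& c : consumers) barrier.precede(c);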
+
+// Function: host
+template <typename C>
+cudaTask cudaFlow::host(C&& c) {
+
+  auto node = _cfg.emplace_back(
+    _cfg, std::in_place_type_t<cudaFlowNode::Host>{}, std::forward<C>(c)
+  );
+
+  auto h = std::get_if<cudaFlowNode::Host>(&node->_handle);
+
+  cudaHostNodeParams p;
+  p.fn = cudaFlowNode::Host::callback;
+  p.userData = h;
+
+  TF_CHECK_CUDA(
+    cudaGraphAddHostNode(
+      &node->_native_handle, _cfg._native_handle, nullptr, 0, &p
+    ),
+    "failed to create a host node"
+  );
+
+  return cudaTask(node);
+}
+
+// Function: kernel
+template <typename F, typename... ArgsT>
+cudaTask cudaFlow::kernel(
+  dim3 g, dim3 b, size_t s, F f, ArgsT... args
+) {
+
+  auto node = _cfg.emplace_back(
+    _cfg, std::in_place_type_t<cudaFlowNode::Kernel>{}, (void*)f
+  );
+
+  cudaKernelNodeParams p;
+  void* arguments[sizeof...(ArgsT)] = { (void*)(&args)... };
+  p.func = (void*)f;
+  p.gridDim = g;
+  p.blockDim = b;
+  p.sharedMemBytes = s;
+  p.kernelParams = arguments;
+  p.extra = nullptr;
+
+  TF_CHECK_CUDA(
+    cudaGraphAddKernelNode(
+      &node->_native_handle, _cfg._native_handle, nullptr, 0, &p
+    ),
+    "failed to create a kernel task"
+  );
+
+  return cudaTask(node);
+}
+
+// Function: zero
+template <typename T, std::enable_if_t<
+  is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>*
+>
+cudaTask cudaFlow::zero(T* dst, size_t count) {
+
+  auto node = _cfg.emplace_back(
+    _cfg, std::in_place_type_t<cudaFlowNode::Memset>{}
+  );
+
+  auto p = cuda_get_zero_parms(dst, count);
+
+  TF_CHECK_CUDA(
+    cudaGraphAddMemsetNode(
+      &node->_native_handle, _cfg._native_handle, nullptr, 0, &p
+    ),
+    "failed to create a memset (zero) task"
+  );
+
+  return cudaTask(node);
+}
+
+// Function: fill
+template <typename T, std::enable_if_t<
+  is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>*
+>
+cudaTask cudaFlow::fill(T* dst, T value, size_t count) {
+
+  auto node = _cfg.emplace_back(
+    _cfg, std::in_place_type_t<cudaFlowNode::Memset>{}
+  );
+
+  auto p = cuda_get_fill_parms(dst, value, count);
+
+  TF_CHECK_CUDA(
+    cudaGraphAddMemsetNode(
+      &node->_native_handle, _cfg._native_handle, nullptr, 0, &p
+    ),
+    "failed to create a memset (fill) task"
+  );
+
+  return cudaTask(node);
+}
+
+// Function: copy
+template <
+  typename T,
+  std::enable_if_t<!std::is_same_v<T, void>, void>*
+>
+cudaTask cudaFlow::copy(T* tgt, const T* src, size_t num) {
+
+  auto node = _cfg.emplace_back(
+    _cfg, std::in_place_type_t<cudaFlowNode::Memcpy>{}
+  );
+
+  auto p = cuda_get_copy_parms(tgt, src, num);
+
+  TF_CHECK_CUDA(
+    cudaGraphAddMemcpyNode(
+      &node->_native_handle, _cfg._native_handle, nullptr, 0, &p
+    ),
+    "failed to create a memcpy (copy) task"
+  );
+
+  return cudaTask(node);
+}
+
+// Function: memset
+inline cudaTask cudaFlow::memset(void* dst, int ch, size_t count) {
+
+  auto node = _cfg.emplace_back(
+    _cfg, std::in_place_type_t<cudaFlowNode::Memset>{}
+  );
+
+  auto p = cuda_get_memset_parms(dst, ch, count);
+
+  TF_CHECK_CUDA(
+    cudaGraphAddMemsetNode(
+      &node->_native_handle, _cfg._native_handle, nullptr, 0, &p
+    ),
+    "failed to create a memset task"
+  );
+
+  return cudaTask(node);
+}
+
+// Function: memcpy
+inline cudaTask cudaFlow::memcpy(void* tgt, const void* src, size_t bytes) {
+
+  auto node = _cfg.emplace_back(
+    _cfg, std::in_place_type_t<cudaFlowNode::Memcpy>{}
+  );
+
+  auto p = cuda_get_memcpy_parms(tgt, src, bytes);
+
+  TF_CHECK_CUDA(
+    cudaGraphAddMemcpyNode(
+      &node->_native_handle, _cfg._native_handle, nullptr, 0, &p
+    ),
+    "failed to create a memcpy task"
+  );
+
+  return cudaTask(node);
+}
+
+// ------------------------------------------------------------------------
+// update methods
+// ------------------------------------------------------------------------
+
+// Function: host
+template <typename C>
+void cudaFlow::host(cudaTask task, C&& c) {
+
+  if(task.type() != cudaTaskType::HOST) {
+    TF_THROW(task, " is not a host task");
+  }
+
+  auto h = std::get_if<cudaFlowNode::Host>(&task._node->_handle);
+
+  h->func = std::forward<C>(c);
+}
+
+// Function: update kernel parameters
+template <typename F, typename... ArgsT>
+void cudaFlow::kernel(
+  cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT... args
+) {
+
+  if(task.type() != cudaTaskType::KERNEL) {
+    TF_THROW(task, " is not a kernel task");
+  }
+
+  cudaKernelNodeParams p;
+
+  void* arguments[sizeof...(ArgsT)] = { (void*)(&args)... };
+  p.func = (void*)f;
+  p.gridDim = g;
+  p.blockDim = b;
+  p.sharedMemBytes = s;
+  p.kernelParams = arguments;
+  p.extra = nullptr;
+
+  TF_CHECK_CUDA(
+    cudaGraphExecKernelNodeSetParams(_exe, task._node->_native_handle, &p),
+    "failed to update kernel parameters on ", task
+  );
+}
+
+// Function: update copy parameters
+template <typename T, std::enable_if_t<!std::is_same_v<T, void>, void>*>
+void cudaFlow::copy(cudaTask task, T* tgt, const T* src, size_t num) {
+
+  if(task.type() != cudaTaskType::MEMCPY) {
+    TF_THROW(task, " is not a memcpy task");
+  }
+
+  auto p = cuda_get_copy_parms(tgt, src, num);
+
+  TF_CHECK_CUDA(
+    cudaGraphExecMemcpyNodeSetParams(_exe, task._node->_native_handle, &p),
+    "failed to update memcpy parameters on ", task
+  );
+}
+
+// Function: update memcpy parameters
+inline void cudaFlow::memcpy(
+  cudaTask task, void* tgt, const void* src, size_t bytes
+) {
+
+  if(task.type() != cudaTaskType::MEMCPY) {
+    TF_THROW(task, " is not a memcpy task");
+  }
+
+  auto p = cuda_get_memcpy_parms(tgt, src, bytes);
+
+  TF_CHECK_CUDA(
+    cudaGraphExecMemcpyNodeSetParams(_exe, task._node->_native_handle, &p),
+    "failed to update memcpy parameters on ", task
+  );
+}
+
+// Procedure: memset
+inline void cudaFlow::memset(cudaTask task, void* dst, int ch, size_t count) {
+
+  if(task.type() != cudaTaskType::MEMSET) {
+    TF_THROW(task, " is not a memset task");
+  }
+
+  auto p = cuda_get_memset_parms(dst, ch, count);
+
+  TF_CHECK_CUDA(
+    cudaGraphExecMemsetNodeSetParams(_exe, task._node->_native_handle, &p),
+    "failed to update memset parameters on ", task
+  );
+}
+
+// Procedure: fill
+template <typename T, std::enable_if_t<
+  is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>*
+>
+void cudaFlow::fill(cudaTask task, T* dst, T value, size_t count) {
+
+  if(task.type() != cudaTaskType::MEMSET) {
+    TF_THROW(task, " is not a memset task");
+  }
+
+  auto p = cuda_get_fill_parms(dst, value, count);
+
+  TF_CHECK_CUDA(
+    cudaGraphExecMemsetNodeSetParams(_exe, task._node->_native_handle, &p),
+    "failed to update memset parameters on ", task
+  );
+}
+
+// Procedure: zero
+template <typename T, std::enable_if_t<
+  is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>*
+>
+void cudaFlow::zero(cudaTask task, T* dst, size_t count) {
+
+  if(task.type() != cudaTaskType::MEMSET) {
+    TF_THROW(task, " is not a memset task");
+  }
+
+  auto p = cuda_get_zero_parms(dst, count);
+
+  TF_CHECK_CUDA(
+    cudaGraphExecMemsetNodeSetParams(_exe, task._node->_native_handle, &p),
+    "failed to update memset parameters on ", task
+  );
+}
+
+// Function: capture
+template <typename C>
+void cudaFlow::capture(cudaTask task, C c) {
+
+  if(task.type() != cudaTaskType::SUBFLOW) {
+    TF_THROW(task, " is not a subflow task");
+  }
+
+  // insert a subflow node
+  // construct a captured flow from the callable
+  auto node_handle = std::get_if<cudaFlowNode::Subflow>(&task._node->_handle);
+  //node_handle->graph.clear();
+
+  cudaFlowCapturer capturer;
+  c(capturer);
+
+  // obtain the optimized captured graph
+  capturer._cfg._native_handle.reset(capturer.capture());
+  node_handle->cfg = std::move(capturer._cfg);
+
+  TF_CHECK_CUDA(
+    cudaGraphExecChildGraphNodeSetParams(
+      _exe, 
+      task._node->_native_handle, 
+      node_handle->cfg._native_handle
+    ),
+    "failed to update a captured child graph"
+  );
+}
+
+// ----------------------------------------------------------------------------
+// captured flow
+// ----------------------------------------------------------------------------
+
+// Function: capture
+template <typename C>
+cudaTask cudaFlow::capture(C&& c) {
+
+  // insert a subflow node
+  auto node = _cfg.emplace_back(
+    _cfg, std::in_place_type_t<cudaFlowNode::Subflow>{}
+  );
+
+  // construct a captured flow from the callable
+  auto node_handle = std::get_if<cudaFlowNode::Subflow>(&node->_handle);
+
+  // perform capturing
+  cudaFlowCapturer capturer;
+  c(capturer);
+
+  // obtain the optimized captured graph
+  capturer._cfg._native_handle.reset(capturer.capture());
+
+  // move capturer's cudaFlow graph into node
+  node_handle->cfg = std::move(capturer._cfg);
+
+  TF_CHECK_CUDA(
+    cudaGraphAddChildGraphNode(
+      &node->_native_handle, 
+      _cfg._native_handle, 
+      nullptr, 
+      0, 
+      node_handle->cfg._native_handle
+    ), 
+    "failed to add a cudaFlow capturer task"
+  );
+
+  return cudaTask(node);
+}
+
+// ----------------------------------------------------------------------------
+// run method
+// ----------------------------------------------------------------------------
+
+// Procedure: run
+inline void cudaFlow::run(cudaStream_t stream) {
+  if(!_exe) {
+    _exe.instantiate(_cfg._native_handle);
+  }
+  _exe.launch(stream);
+  _cfg._state = cudaFlowGraph::OFFLOADED;
+}
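+// A standalone usage sketch of run() outside an executor (kernel name and
+// launch parameters are placeholders; the stream uses the cudaStream wrapper):
+//
+//   tf::cudaFlow cf;
+//   cf.kernel(grid, block, 0, my_kernel, args...);
+//   tf::cudaStream stream;
+//   cf.run(stream);          // instantiates the executable graph on first run
+//   stream.synchronize();    // wait for the offloaded graph to finish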
+
+// Function: native_graph
+inline cudaGraph_t cudaFlow::native_graph() {
+  return _cfg._native_handle;
+}
+
+// Function: native_executable
+inline cudaGraphExec_t cudaFlow::native_executable() {
+  return _exe;
+}
+
+}  // end of namespace tf -----------------------------------------------------
+
+
diff --git a/myxpcs/include/taskflow_/dsl/connection.hpp b/myxpcs/include/taskflow_/dsl/connection.hpp
new file mode 100644
index 0000000..e4dad72
--- /dev/null
+++ b/myxpcs/include/taskflow_/dsl/connection.hpp
@@ -0,0 +1,53 @@
+// 2020/08/28 - Created by netcan: https://github.com/netcan
+#pragma once
+#include "../core/flow_builder.hpp"
+#include "task_trait.hpp"
+#include "tuple_utils.hpp"
+#include "type_list.hpp"
+
+namespace tf {
+namespace dsl {
+template <typename F, typename T> class Connection {
+  using FROMs = typename TaskTrait<F>::TaskList;
+  using TOs = typename TaskTrait<T>::TaskList;
+
+public:
+  using FromTaskList = Unique_t<Flatten_t<FROMs>>;
+  using ToTaskList = Unique_t<Flatten_t<TOs>>;
+};
+
+template <typename T, typename OUT = TypeList<>> struct Chain;
+
+template <typename F, typename OUT> struct Chain<auto (*)(F)->void, OUT> {
+  using From = F;
+  using type = OUT;
+};
+
+template <typename F, typename T, typename OUT>
+struct Chain<auto (*)(F)->T, OUT> {
+private:
+  using To = typename Chain<T, OUT>::From;
+
+public:
+  using From = F;
+  using type = typename Chain<
+      T, typename OUT::template appendTo<Connection<From, To>>>::type;
+};
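+
+// A hedged illustration (hypothetical tasks A and B): the chain type
+// auto(*)(A) -> auto(*)(B) -> void parses into a single connection, i.e.
+//   Chain<auto(*)(A)->auto(*)(B)->void>::type == TypeList<Connection<A, B>>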
+
+template <typename FROM, typename TO> struct OneToOneLink {
+  template <typename TasksCB> struct InstanceType {
+    constexpr void build(TasksCB &tasksCb) {
+      constexpr size_t TasksCBSize = std::tuple_size<TasksCB>::value;
+      constexpr size_t FromTaskIndex =
+          TupleElementByF_v<TasksCB, IsTask<FROM>::template apply>;
+      constexpr size_t ToTaskIndex =
+          TupleElementByF_v<TasksCB, IsTask<TO>::template apply>;
+      static_assert(FromTaskIndex < TasksCBSize && ToTaskIndex < TasksCBSize,
+                    "fatal: cannot find TaskCb in TasksCB");
+      std::get<FromTaskIndex>(tasksCb).task_.precede(
+          std::get<ToTaskIndex>(tasksCb).task_);
+    }
+  };
+};
+} // namespace dsl
+} // namespace tf
diff --git a/myxpcs/include/taskflow_/dsl/dsl.hpp b/myxpcs/include/taskflow_/dsl/dsl.hpp
new file mode 100644
index 0000000..e4130e8
--- /dev/null
+++ b/myxpcs/include/taskflow_/dsl/dsl.hpp
@@ -0,0 +1,13 @@
+// TaskflowDSL is an experimental project that leverages C++17 to
+// provide a dedicated interface for expressive taskflow programming
+//
+// Created by netcan: https://github.com/netcan
+
+#pragma once
+
+#include "dsl/task_dsl.hpp"
+
+namespace tf {
+
+
+}  // end of namespace tf -----------------------------------------------------
diff --git a/myxpcs/include/taskflow_/dsl/meta_macro.hpp b/myxpcs/include/taskflow_/dsl/meta_macro.hpp
new file mode 100644
index 0000000..758bf68
--- /dev/null
+++ b/myxpcs/include/taskflow_/dsl/meta_macro.hpp
@@ -0,0 +1,72 @@
+// 2020/08/30 - Created by netcan: https://github.com/netcan
+// ref https://github.com/Erlkoenig90/map-macro/
+#pragma once
+#ifdef _MSC_VER
+#define TF_EMPTY()
+#define TF_GET_ARG_COUNT_(...)                                                 \
+  TF_PASTE(TF_GET_ARG_COUNT_I(__VA_ARGS__, 64, 63, 62, 61, 60, 59, 58, 57, 56, \
+                              55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44,  \
+                              43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32,  \
+                              31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20,  \
+                              19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, \
+                              6, 5, 4, 3, 2, 1, 0, ),                          \
+           TF_EMPTY())
+
+#else
+#define TF_GET_ARG_COUNT_(...)                                                 \
+  TF_GET_ARG_COUNT_I(__VA_ARGS__, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54,  \
+                     53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40,   \
+                     39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26,   \
+                     25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12,   \
+                     11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, )
+#endif
+
+#define TF_GET_ARG_COUNT(...) TF_GET_ARG_COUNT_(__dummy__, ##__VA_ARGS__)
+#define TF_GET_ARG_COUNT_I(                                                    \
+    e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, \
+    e17, e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, \
+    e32, e33, e34, e35, e36, e37, e38, e39, e40, e41, e42, e43, e44, e45, e46, \
+    e47, e48, e49, e50, e51, e52, e53, e54, e55, e56, e57, e58, e59, e60, e61, \
+    e62, e63, e64, size, ...)                                                  \
+  size
+
+#define TF_GET_FIRST(a, ...) a
+#define TF_GET_SECOND(a, b, ...) b
+#define TF_CONCATE(x, y) x##y
+#define TF_PASTE(x, y) TF_CONCATE(x, y)
+
+#define TF_EVAL0(...) __VA_ARGS__
+#define TF_EVAL1(...) TF_EVAL0(TF_EVAL0(TF_EVAL0(__VA_ARGS__)))
+#define TF_EVAL2(...) TF_EVAL1(TF_EVAL1(TF_EVAL1(__VA_ARGS__)))
+#define TF_EVAL3(...) TF_EVAL2(TF_EVAL2(TF_EVAL2(__VA_ARGS__)))
+#define TF_EVAL4(...) TF_EVAL3(TF_EVAL3(TF_EVAL3(__VA_ARGS__)))
+#define TF_EVAL5(...) TF_EVAL4(TF_EVAL4(TF_EVAL4(__VA_ARGS__)))
+
+#ifdef _MSC_VER
+// MSVC needs more evaluations
+#define TF_EVAL6(...) TF_EVAL5(TF_EVAL5(TF_EVAL5(__VA_ARGS__)))
+#define TF_EVAL(...) TF_EVAL6(TF_EVAL6(__VA_ARGS__))
+#else
+#define TF_EVAL(...) TF_EVAL5(__VA_ARGS__)
+#endif
+
+#define TF_MAP_END(...)
+#define TF_MAP_OUT
+
+#define EMPTY()
+#define DEFER(id) id EMPTY()
+
+#define TF_MAP_GET_END2() 0, TF_MAP_END
+#define TF_MAP_GET_END1(...) TF_MAP_GET_END2
+#define TF_MAP_GET_END(...) TF_MAP_GET_END1
+#define TF_MAP_NEXT0(test, next, ...) next TF_MAP_OUT
+#define TF_MAP_NEXT1(test, next) DEFER(TF_MAP_NEXT0)(test, next, 0)
+#define TF_MAP_NEXT(test, next) TF_MAP_NEXT1(TF_MAP_GET_END test, next)
+
+#define TF_MAP0(f, x, peek, ...)                                               \
+  f(x) DEFER(TF_MAP_NEXT(peek, TF_MAP1))(f, peek, __VA_ARGS__)
+#define TF_MAP1(f, x, peek, ...)                                               \
+  f(x) DEFER(TF_MAP_NEXT(peek, TF_MAP0))(f, peek, __VA_ARGS__)
+
+#define TF_MAP(f, ...)                                                         \
+  TF_EVAL(TF_MAP1(f, __VA_ARGS__, ()()(), ()()(), ()()(), 0))
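+
+// Usage sketch: TF_MAP applies a macro to each argument in turn, e.g.
+//   TF_MAP(F, a, b, c)   // expands to F(a) F(b) F(c)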
diff --git a/myxpcs/include/taskflow_/dsl/task_analyzer.hpp b/myxpcs/include/taskflow_/dsl/task_analyzer.hpp
new file mode 100644
index 0000000..295c50b
--- /dev/null
+++ b/myxpcs/include/taskflow_/dsl/task_analyzer.hpp
@@ -0,0 +1,40 @@
+// 2020/08/28 - Created by netcan: https://github.com/netcan
+#pragma once
+#include "connection.hpp"
+#include "type_list.hpp"
+#include <type_traits>
+
+namespace tf {
+namespace dsl {
+template <typename... Links> class TaskAnalyzer {
+  template <typename FROMs, typename TOs, typename = void>
+  struct BuildOneToOneLink;
+
+  template <typename... Fs, typename Ts>
+  struct BuildOneToOneLink<TypeList<Fs...>, Ts> {
+    using type = Concat_t<typename BuildOneToOneLink<Fs, Ts>::type...>;
+  };
+
+  template <typename F, typename... Ts>
+  struct BuildOneToOneLink<F, TypeList<Ts...>,
+                           std::enable_if_t<!IsTypeList_v<F>>> {
+    using type = TypeList<OneToOneLink<F, Ts>...>;
+  };
+
+  template <typename Link> class OneToOneLinkSetF {
+    using FromTaskList = typename Link::FromTaskList;
+    using ToTaskList = typename Link::ToTaskList;
+
+  public:
+    using type = typename BuildOneToOneLink<FromTaskList, ToTaskList>::type;
+  };
+
+public:
+  using AllTasks = Unique_t<
+      Concat_t<typename Links::FromTaskList..., typename Links::ToTaskList...>>;
+  using OneToOneLinkSet =
+      Unique_t<Flatten_t<Map_t<TypeList<Links...>, OneToOneLinkSetF>>>;
+};
+
+} // namespace dsl
+} // namespace tf
diff --git a/myxpcs/include/taskflow_/dsl/task_dsl.hpp b/myxpcs/include/taskflow_/dsl/task_dsl.hpp
new file mode 100644
index 0000000..9b362cf
--- /dev/null
+++ b/myxpcs/include/taskflow_/dsl/task_dsl.hpp
@@ -0,0 +1,104 @@
+// 2020/08/28 - Created by netcan: https://github.com/netcan
+#pragma once
+#include "../core/flow_builder.hpp"
+#include "meta_macro.hpp"
+#include "task_analyzer.hpp"
+#include "task_trait.hpp"
+
+namespace tf {
+namespace dsl {
+struct EmptyContext {};
+template <typename CONTEXT = EmptyContext, typename... Chains> class TaskDsl {
+  using Links = Unique_t<Flatten_t<TypeList<typename Chain<Chains>::type...>>>;
+  using Analyzer = typename Links::template exportTo<TaskAnalyzer>;
+
+  using AllTasks = typename Analyzer::AllTasks;
+
+  template <typename TASK> struct TaskCbWithContext {
+    using type = TaskCb<TASK, CONTEXT>;
+  };
+  using TasksCB =
+      typename Map_t<AllTasks,
+                     TaskCbWithContext>::template exportTo<std::tuple>;
+
+  using OneToOneLinkSet = typename Analyzer::OneToOneLinkSet;
+  template <typename OneToOneLink> struct OneToOneLinkInstanceType {
+    using type = typename OneToOneLink::template InstanceType<TasksCB>;
+  };
+  using OneToOneLinkInstances =
+      typename Map_t<OneToOneLinkSet,
+                     OneToOneLinkInstanceType>::template exportTo<std::tuple>;
+
+public:
+  constexpr TaskDsl(FlowBuilder &flow_builder, const CONTEXT &context = {}) {
+    build_tasks_cb(flow_builder, context,
+                   std::make_index_sequence<AllTasks::size>{});
+    build_links(std::make_index_sequence<OneToOneLinkSet::size>{});
+  }
+
+  template <typename TASK> Task &get_task() {
+    constexpr size_t TasksCBSize = std::tuple_size<TasksCB>::value;
+    constexpr size_t TaskIndex =
+        TupleElementByF_v<TasksCB, IsTask<TASK>::template apply>;
+    static_assert(TaskIndex < TasksCBSize, "fatal: cannot find TaskCb in TasksCB");
+    return std::get<TaskIndex>(tasksCb_).task_;
+  }
+
+private:
+  template <size_t... Is>
+  void build_tasks_cb(FlowBuilder &flow_builder, const CONTEXT &context,
+                      std::index_sequence<Is...>) {
+    auto _ = {0, (std::get<Is>(tasksCb_).build(flow_builder, context), 0)...};
+    (void)_;
+  }
+
+  template <size_t... Is> void build_links(std::index_sequence<Is...>) {
+    auto _ = {0, (std::get<Is>(links_).build(tasksCb_), 0)...};
+    (void)_;
+  }
+
+private:
+  TasksCB tasksCb_;
+  OneToOneLinkInstances links_;
+};
+
+template <typename = void, typename... Chains, typename CONTEXT = EmptyContext>
+constexpr TaskDsl<CONTEXT, Chains...> taskDsl(FlowBuilder &flow_builder,
+                                              CONTEXT &&context = {}) {
+  return {flow_builder, context};
+}
+
+} // namespace dsl
+} // namespace tf
+
+///////////////////////////////////////////////////////////////////////////////
+#define TF_CHAIN(link) , link->void
+#define TF_CONTEXT_1(name) tf::dsl::EmptyContext
+#define TF_CONTEXT_2(name, context) context
+#define TF_CAPTURE_THIS_1
+#define TF_CAPTURE_THIS_2 *this
+
+///////////////////////////////////////////////////////////////////////////////
+// make_task(TASK_NAME, { return an action lambda })
+#define make_task(name, ...)                                                    \
+  struct TF_GET_FIRST name : tf::dsl::TaskSignature,                           \
+                             TF_PASTE(TF_CONTEXT_, TF_GET_ARG_COUNT name)      \
+                                 name {                                        \
+    using _ContextType = TF_PASTE(TF_CONTEXT_, TF_GET_ARG_COUNT name) name;    \
+    TF_GET_FIRST name(const _ContextType &context) : _ContextType(context) {}  \
+    auto operator()() {                                                        \
+      return [TF_PASTE(TF_CAPTURE_THIS_, TF_GET_ARG_COUNT name)] __VA_ARGS__;  \
+    }                                                                          \
+  }
+
+// some_tasks(A, B, C) means a group of tasks A, B, C (i.e., SomeTask<A, B, C>)
+#define some_tasks(...) auto (*)(tf::dsl::SomeTask<__VA_ARGS__>)
+// same as some_tasks
+#define fork_tasks(...) some_tasks(__VA_ARGS__)
+// same as some_tasks
+#define merge_tasks(...) some_tasks(__VA_ARGS__)
+// task(A) means a task A
+#define task(Task) auto (*)(Task)
+// build_taskflow(...) builds a task DSL graph
+#define build_taskflow(...) tf::dsl::taskDsl<void TF_MAP(TF_CHAIN, __VA_ARGS__)>
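+
+// A usage sketch (hypothetical task names A, B, C; not part of this header):
+//
+//   make_task((A), { std::cout << "TaskA\n"; });
+//   make_task((B), { std::cout << "TaskB\n"; });
+//   make_task((C), { std::cout << "TaskC\n"; });
+//
+//   tf::Taskflow taskflow;
+//   build_taskflow(              // A precedes both B and C
+//     task(A) -> fork_tasks(B, C)
+//   )(taskflow);
+//
+//   tf::Executor().run(taskflow).wait();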
+
diff --git a/myxpcs/include/taskflow_/dsl/task_trait.hpp b/myxpcs/include/taskflow_/dsl/task_trait.hpp
new file mode 100644
index 0000000..bc8eeb6
--- /dev/null
+++ b/myxpcs/include/taskflow_/dsl/task_trait.hpp
@@ -0,0 +1,46 @@
+// 2020/08/28 - Created by netcan: https://github.com/netcan
+#pragma once
+#include "../core/flow_builder.hpp"
+#include "../core/task.hpp"
+#include "type_list.hpp"
+#include <type_traits>
+
+namespace tf {
+namespace dsl {
+struct TaskSignature {};
+
+template <typename TASK, typename CONTEXT> struct TaskCb {
+  using TaskType = TASK;
+  void build(FlowBuilder &build, const CONTEXT &context) {
+    task_ = build.emplace(TaskType{context}());
+  }
+
+  Task task_;
+};
+
+template <typename TASK> struct IsTask {
+  template <typename TaskCb> struct apply {
+    constexpr static bool value =
+        std::is_same<typename TaskCb::TaskType, TASK>::value;
+  };
+};
+
+template <typename TASK, typename = void> struct TaskTrait;
+
+template <typename... TASK> struct SomeTask {
+  using TaskList =
+      Unique_t<Flatten_t<TypeList<typename TaskTrait<TASK>::TaskList...>>>;
+};
+
+// a task self
+template <typename TASK>
+struct TaskTrait<
+    TASK, std::enable_if_t<std::is_base_of<TaskSignature, TASK>::value>> {
+  using TaskList = TypeList<TASK>;
+};
+
+template <typename... TASK> struct TaskTrait<SomeTask<TASK...>> {
+  using TaskList = typename SomeTask<TASK...>::TaskList;
+};
+} // namespace dsl
+} // namespace tf
diff --git a/myxpcs/include/taskflow_/dsl/tuple_utils.hpp b/myxpcs/include/taskflow_/dsl/tuple_utils.hpp
new file mode 100644
index 0000000..633ba0e
--- /dev/null
+++ b/myxpcs/include/taskflow_/dsl/tuple_utils.hpp
@@ -0,0 +1,43 @@
+// 2020/08/28 - Created by netcan: https://github.com/netcan
+#pragma once
+#include <cstddef>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+namespace tf {
+namespace dsl {
+namespace detail {
+// get tuple element index by f, if not exists then index >= tuple_size
+template <typename TUP, template <typename> class F, typename = void>
+struct TupleElementByF {
+  constexpr static size_t Index = 0;
+};
+
+template <template <typename> class F, typename H, typename... Ts>
+struct TupleElementByF<std::tuple<H, Ts...>, F, std::enable_if_t<F<H>::value>> {
+  constexpr static size_t Index = 0;
+};
+
+template <template <typename> class F, typename H, typename... Ts>
+struct TupleElementByF<std::tuple<H, Ts...>, F,
+                       std::enable_if_t<!F<H>::value>> {
+  constexpr static size_t Index =
+      1 + TupleElementByF<std::tuple<Ts...>, F>::Index;
+};
+
+template <typename T, typename TUP, size_t... Is>
+constexpr inline T AggregationByTupImpl(TUP &&tup, std::index_sequence<Is...>) {
+  return T{std::get<Is>(tup)...};
+}
+} // namespace detail
+
+template <typename TUP, template <typename> class F>
+constexpr size_t TupleElementByF_v = detail::TupleElementByF<TUP, F>::Index;
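+
+// A hedged example: given a unary predicate F,
+//   TupleElementByF_v<std::tuple<int, float>, F>
+// is 0 if F<int>::value holds, 1 if only F<float>::value holds, and 2
+// (the tuple size, i.e. "not found") if neither does.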
+
+template <typename T, typename TUP>
+constexpr inline T AggregationByTup(TUP &&tup) {
+  return detail::AggregationByTupImpl<T>(
+      std::forward<TUP>(tup),
+      std::make_index_sequence<std::tuple_size<std::decay_t<TUP>>::value>{});
+}
+} // namespace dsl
+} // namespace tf
diff --git a/myxpcs/include/taskflow_/dsl/type_list.hpp b/myxpcs/include/taskflow_/dsl/type_list.hpp
new file mode 100644
index 0000000..c4af4a4
--- /dev/null
+++ b/myxpcs/include/taskflow_/dsl/type_list.hpp
@@ -0,0 +1,136 @@
+// 2020/08/28 - Created by netcan: https://github.com/netcan
+#pragma once
+#include <cstddef>
+#include <type_traits>
+
+namespace tf {
+namespace dsl {
+template <typename...> using void_t = void;
+
+template <typename... Ts> struct TypeList {
+  using type = TypeList<Ts...>;
+  static constexpr size_t size = 0;
+
+  template <typename... T> struct append { using type = TypeList<T...>; };
+  template <typename... T> using appendTo = typename append<T...>::type;
+
+  template <typename T> using prepend = typename TypeList<T>::type;
+
+  template <template <typename...> class T> using exportTo = T<Ts...>;
+};
+
+template <typename Head, typename... Tails> struct TypeList<Head, Tails...> {
+  using type = TypeList<Head, Tails...>;
+  using head = Head;
+  using tails = TypeList<Tails...>;
+  static constexpr size_t size = sizeof...(Tails) + 1;
+
+  template <typename... Ts> struct append {
+    using type = TypeList<Head, Tails..., Ts...>;
+  };
+  template <typename... Ts> using appendTo = typename append<Ts...>::type;
+
+  template <typename T>
+  using prepend = typename TypeList<T, Head, Tails...>::type;
+
+  template <template <typename...> class T> using exportTo = T<Head, Tails...>;
+};
+
+template <typename IN> struct IsTypeList {
+  constexpr static bool value = false;
+};
+
+template <typename IN> constexpr bool IsTypeList_v = IsTypeList<IN>::value;
+
+template <typename... Ts> struct IsTypeList<TypeList<Ts...>> {
+  constexpr static bool value = true;
+};
+
+template <typename... IN> struct Concat;
+
+template <typename... IN> using Concat_t = typename Concat<IN...>::type;
+
+template <> struct Concat<> { using type = TypeList<>; };
+template <typename IN> struct Concat<IN> { using type = IN; };
+
+template <typename IN, typename IN2> struct Concat<IN, IN2> {
+  using type = typename IN2::template exportTo<IN::template append>::type;
+};
+
+template <typename IN, typename IN2, typename... Rest>
+struct Concat<IN, IN2, Rest...> {
+  using type = Concat_t<Concat_t<IN, IN2>, Rest...>;
+};
+
+template <typename IN, typename OUT = TypeList<>, typename = void>
+struct Flatten {
+  using type = OUT;
+};
+
+template <typename IN> using Flatten_t = typename Flatten<IN>::type;
+
+template <typename IN, typename OUT>
+struct Flatten<IN, OUT, std::enable_if_t<IsTypeList_v<typename IN::head>>> {
+  using type =
+      typename Flatten<typename IN::tails,
+                       Concat_t<OUT, Flatten_t<typename IN::head>>>::type;
+};
+
+template <typename IN, typename OUT>
+struct Flatten<IN, OUT, std::enable_if_t<!IsTypeList_v<typename IN::head>>> {
+  using type = typename Flatten<
+      typename IN::tails,
+      typename OUT::template appendTo<typename IN::head>>::type;
+};
+
+template <typename IN, template <typename> class F> struct Map {
+  using type = TypeList<>;
+};
+
+template <typename IN, template <typename> class F>
+using Map_t = typename Map<IN, F>::type;
+
+template <template <typename> class F, typename... Ts>
+struct Map<TypeList<Ts...>, F> {
+  using type = TypeList<typename F<Ts>::type...>;
+};
+
+template <typename IN, template <typename> class F, typename OUT = TypeList<>,
+          typename = void>
+struct Filter {
+  using type = OUT;
+};
+
+template <typename IN, template <typename> class F>
+using Filter_t = typename Filter<IN, F>::type;
+
+template <typename IN, template <typename> class F, typename OUT>
+class Filter<IN, F, OUT, void_t<typename IN::head>> {
+  using H = typename IN::head;
+
+public:
+  using type = typename std::conditional_t<
+      F<H>::value,
+      Filter<typename IN::tails, F, typename OUT::template appendTo<H>>,
+      Filter<typename IN::tails, F, OUT>>::type;
+};
+
+template <typename IN, typename = void> struct Unique { using type = IN; };
+
+template <typename IN> using Unique_t = typename Unique<IN>::type;
+
+template <typename IN> class Unique<IN, void_t<typename IN::head>> {
+  template <typename T> struct IsDifferR {
+    template <typename R> struct apply {
+      static constexpr bool value = !std::is_same<T, R>::value;
+    };
+  };
+
+  using tails = Unique_t<typename IN::tails>;
+  using eraseHead =
+      Filter_t<tails, IsDifferR<typename IN::head>::template apply>;
+
+public:
+  using type = typename eraseHead::template prepend<typename IN::head>;
+};
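+
+// A few hedged sketches of these metafunctions (hypothetical types A, B, C):
+//   Concat_t<TypeList<A>, TypeList<B, C>>     ->  TypeList<A, B, C>
+//   Flatten_t<TypeList<A, TypeList<B, C>>>    ->  TypeList<A, B, C>
+//   Unique_t<TypeList<A, B, A, C>>            ->  TypeList<A, B, C>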
+} // namespace dsl
+} // namespace tf
diff --git a/myxpcs/include/taskflow_/sycl/algorithm/reduce.hpp b/myxpcs/include/taskflow_/sycl/algorithm/reduce.hpp
new file mode 100644
index 0000000..17dfa98
--- /dev/null
+++ b/myxpcs/include/taskflow_/sycl/algorithm/reduce.hpp
@@ -0,0 +1,487 @@
+#pragma once
+
+#include "../syclflow.hpp"
+
+namespace tf::detail {
+
+// ----------------------------------------------------------------------------
+// reduction helper functions
+// ----------------------------------------------------------------------------
+
+/** @private */
+template<unsigned nt, typename T>
+struct syclBlockReduce {
+
+  static const unsigned group_size = std::min(nt, SYCL_WARP_SIZE);
+  static const unsigned shm_size   = std::max(nt, 2* group_size);
+  static const unsigned num_passes = log2(group_size);
+  static const unsigned num_items  = nt / group_size;
+
+  static_assert(
+    nt && (0 == nt % SYCL_WARP_SIZE),
+    "syclBlockReduce requires num threads to be a multiple of warp_size (32)"
+  );
+
+  using shm_t = sycl::accessor<
+    T, 1, sycl::access::mode::read_write, sycl::access::target::local
+  >;
+
+  template<typename op_t>
+  T operator()(
+    sycl::nd_item<1>&, T, const shm_t&, unsigned, op_t, bool = true
+  ) const;
+};
+
+// function: reduce to be called from a block
+template<unsigned nt, typename T>
+template<typename op_t>
+T syclBlockReduce<nt, T>::operator ()(
+  sycl::nd_item<1>& item,
+  T x,
+  const shm_t& shm,
+  unsigned count,
+  op_t op,
+  bool ret
+) const {
+
+  auto tid = item.get_local_id(0);
+
+  // Store your data into shared memory.
+  shm[tid] = x;
+  item.barrier(sycl::access::fence_space::local_space);
+
+  if(tid < group_size) {
+    // Each thread scans within its lane.
+    sycl_strided_iterate<group_size, num_items>([&](auto i, auto j) {
+      if(i > 0) {
+        x = op(x, shm[j]);
+      }
+    }, tid, count);
+    shm[tid] = x;
+  }
+  item.barrier(sycl::access::fence_space::local_space);
+
+  auto count2 = count < group_size ? count : group_size;
+  auto first = (1 & num_passes) ? group_size : 0;
+  if(tid < group_size) {
+    shm[first + tid] = x;
+  }
+  item.barrier(sycl::access::fence_space::local_space);
+
+  sycl_iterate<num_passes>([&](auto pass) {
+    if(tid < group_size) {
+      if(auto offset = 1 << pass; tid + offset < count2) {
+        x = op(x, shm[first + offset + tid]);
+      }
+      first = group_size - first;
+      shm[first + tid] = x;
+    }
+    item.barrier(sycl::access::fence_space::local_space);
+  });
+
+  if(ret) {
+    x = shm[0];
+    item.barrier(sycl::access::fence_space::local_space);
+  }
+  return x;
+}
+
+/** @private */
+template <typename P, typename I, typename T, typename O>
+sycl::event sycl_reduce_loop(
+  P&& p,
+  I input,
+  unsigned count,
+  T* res,
+  O op,
+  bool incl,
+  void* ptr,
+  std::vector<sycl::event> evs
+) {
+
+  using E = std::decay_t<P>;
+  using R = syclBlockReduce<E::nt, T>;
+
+  auto buf = static_cast<T*>(ptr);
+  auto B   = (count + E::nv - 1) / E::nv;
+
+  auto e = p.queue().submit([=, evs=std::move(evs)](sycl::handler& h) {
+
+    h.depends_on(evs);
+
+    // create a shared memory
+    typename R::shm_t shm(sycl::range<1>(R::shm_size), h);
+
+    h.parallel_for(
+      sycl::nd_range<1>{sycl::range<1>(B*E::nt), sycl::range<1>(E::nt)},
+      [=](sycl::nd_item<1> item) {
+
+        auto tid = item.get_local_id(0);
+        auto bid = item.get_group(0);
+
+        // get the tile of this group
+        auto tile = sycl_get_tile(bid, E::nv, count);
+
+        // load data from input to register
+        auto x = sycl_mem_to_reg_strided<E::nt, E::vt>(
+          input + tile.begin, tid, tile.count()
+        );
+        // reduce multiple values per thread into a scalar.
+        T s;
+        sycl_strided_iterate<E::nt, E::vt>(
+          [&] (auto i, auto) { s = i ? op(s, x[i]) : x[0]; }, tid, tile.count()
+        );
+        // reduce to a scalar per block.
+        s = R()(
+          item, s, shm, (tile.count()<E::nt ? tile.count() : E::nt), op, false
+        );
+        if(!tid) {
+          (1 == B) ? *res = (incl ? op(*res, s) : s) : buf[bid] = s;
+        }
+      }
+    );
+  });
+
+  if(B > 1) {
+    return sycl_reduce_loop(p, buf, B, res, op, incl, buf+B, {e});
+  }
+  else {
+    return e;
+  }
+}
+
+}  // end of namespace detail -------------------------------------------------
+
+namespace tf {
+
+/**
+@brief queries the buffer size in bytes needed to call reduce kernels
+
+@tparam P execution policy type
+@tparam T value type
+
+@param count number of elements to reduce
+
+The function is used to size the temporary buffer required by the
+asynchronous reduce calls.
+Please refer to @ref SYCLSTDReduce for details.
+*/
+template <typename P, typename T>
+unsigned sycl_reduce_buffer_size(unsigned count) {
+  using E = std::decay_t<P>;
+  unsigned B = (count + E::nv - 1) / E::nv;
+  unsigned n = 0;
+  for(auto b=B; b>1; n += (b=(b+E::nv-1)/E::nv));
+  return n*sizeof(T);
+}
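+
+// A minimal sketch (the queue `q` and the count `N` are assumptions) pairing
+// the size query with a device allocation for the asynchronous variants below:
+//
+//   auto bytes = tf::sycl_reduce_buffer_size<tf::syclDefaultExecutionPolicy, int>(N);
+//   void* buf  = sycl::malloc_device(bytes, q);
+//   // ... pass buf to sycl_reduce_async / sycl_uninitialized_reduce_async ...
+//   sycl::free(buf, q);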
+
+//// sycl reduction
+//template <typename I, typename T, typename C, bool uninitialized>
+//auto syclFlow::_reduce_cgh(I first, I last, T* res, C&& op) {
+//
+//  // TODO: special case N == 0?
+//  size_t N = std::distance(first, last);
+//  size_t B = _default_group_size(N);
+//
+//  return [=, op=std::forward<C>(op)](sycl::handler& handler) mutable {
+//
+//    // create a shared memory
+//    sycl::accessor<
+//      T, 1, sycl::access::mode::read_write, sycl::access::target::local
+//    > shm(sycl::range<1>(B), handler);
+//
+//    // perform parallel reduction
+//    handler.parallel_for(
+//      sycl::nd_range<1>{sycl::range<1>(B), sycl::range<1>(B)},
+//      [=] (sycl::nd_item<1> item) {
+//
+//      size_t tid = item.get_global_id(0);
+//
+//      if(tid >= N) {
+//        return;
+//      }
+//
+//      shm[tid] = *(first+tid);
+//
+//      for(size_t i=tid+B; i<N; i+=B) {
+//        shm[tid] = op(shm[tid], *(first+i));
+//      }
+//
+//      item.barrier(sycl::access::fence_space::local_space);
+//
+//      for(size_t s = B / 2; s > 0; s >>= 1) {
+//        if(tid < s && tid + s < N) {
+//          shm[tid] = op(shm[tid], shm[tid+s]);
+//        }
+//        item.barrier(sycl::access::fence_space::local_space);
+//      }
+//
+//      if(tid == 0) {
+//        if constexpr (uninitialized) {
+//          *res = shm[0];
+//        }
+//        else {
+//          *res = op(*res, shm[0]);
+//        }
+//      }
+//    });
+//  };
+//}
+
+// ----------------------------------------------------------------------------
+// SYCL standard reduce algorithms
+// ----------------------------------------------------------------------------
+
+/**
+@brief performs parallel reduction over a range of items
+
+@tparam P execution policy type
+@tparam I input iterator type
+@tparam T value type
+@tparam O binary operator type
+
+@param p execution policy
+@param first iterator to the beginning of the range
+@param last iterator to the end of the range
+@param res pointer to the result
+@param op binary operator to apply to reduce elements
+
+This method is equivalent to the parallel execution of the following loop
+on a SYCL device:
+
+@code{.cpp}
+while (first != last) {
+  *result = op(*result, *first++);
+}
+@endcode
+ */
+template<typename P, typename I, typename T, typename O>
+void sycl_reduce(P&& p, I first, I last, T* res, O op) {
+
+  unsigned count = std::distance(first, last);
+
+  if(count == 0) {
+    return;
+  }
+
+  // allocate temporary buffer
+  auto tmp = sycl::malloc_device(
+    sycl_reduce_buffer_size<P, T>(count), p.queue()
+  );
+
+  // reduction loop
+  detail::sycl_reduce_loop(p, first, count, res, op, true, tmp, {}).wait();
+
+  // deallocate the temporary buffer
+  sycl::free(tmp, p.queue());
+}
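+
+// A minimal usage sketch (the queue `q`, the count `N`, and the device
+// pointers are assumptions, not part of this header):
+//
+//   sycl::queue q;
+//   int* data = sycl::malloc_device<int>(N, q);
+//   int* res  = sycl::malloc_device<int>(1, q);
+//   q.fill(data, 1, N).wait();                 // data[i] = 1
+//   q.memset(res, 0, sizeof(int)).wait();      // *res = 0
+//
+//   tf::syclDefaultExecutionPolicy p(q);
+//   tf::sycl_reduce(p, data, data + N, res, tf::sycl_plus<int>{});  // *res == N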
+
+/**
+@brief performs asynchronous parallel reduction over a range of items
+
+@tparam P execution policy type
+@tparam I input iterator type
+@tparam T value type
+@tparam O binary operator type
+
+@param p execution policy
+@param first iterator to the beginning of the range
+@param last iterator to the end of the range
+@param res pointer to the result
+@param op binary operator to apply to reduce elements
+@param buf pointer to the temporary buffer
+
+@return an SYCL event
+
+Please refer to @ref SYCLSTDReduce for details.
+ */
+template<typename P, typename I, typename T, typename O>
+sycl::event sycl_reduce_async(
+  P&& p, I first, I last, T* res, O op, void* buf, std::vector<sycl::event> dep
+) {
+
+  unsigned count = std::distance(first, last);
+
+  if(count == 0) {
+    return {};
+  }
+
+  // reduction loop
+  return detail::sycl_reduce_loop(
+    p, first, count, res, op, true, buf, std::move(dep)
+  );
+}
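+
+// A minimal asynchronous sketch (queue `q`, count `N`, and device pointers
+// `data`/`res` are assumptions); the caller owns the temporary buffer:
+//
+//   tf::syclDefaultExecutionPolicy p(q);
+//   auto bytes = tf::sycl_reduce_buffer_size<tf::syclDefaultExecutionPolicy, int>(N);
+//   void* buf  = sycl::malloc_device(bytes, q);
+//   auto event = tf::sycl_reduce_async(
+//     p, data, data + N, res, tf::sycl_plus<int>{}, buf, {}
+//   );
+//   event.wait();          // or hand the event to dependent submissions
+//   sycl::free(buf, q);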
+
+/**
+@brief performs parallel reduction over a range of items
+       without an initial value
+
+@tparam P execution policy type
+@tparam I input iterator type
+@tparam T value type
+@tparam O binary operator type
+
+@param p execution policy
+@param first iterator to the beginning of the range
+@param last iterator to the end of the range
+@param res pointer to the result
+@param op binary operator to apply to reduce elements
+
+This method is equivalent to the parallel execution of the following loop
+on a SYCL device:
+
+@code{.cpp}
+*result = *first++;  // no initial value participates in the loop
+while (first != last) {
+  *result = op(*result, *first++);
+}
+@endcode
+*/
+template<typename P, typename I, typename T, typename O>
+void sycl_uninitialized_reduce(P&& p, I first, I last, T* res, O op) {
+
+  unsigned count = std::distance(first, last);
+
+  if(count == 0) {
+    return;
+  }
+
+  // allocate temporary buffer
+  auto tmp = sycl::malloc_device(
+    sycl_reduce_buffer_size<P, T>(count), p.queue()
+  );
+
+  // reduction loop
+  detail::sycl_reduce_loop(p, first, count, res, op, false, tmp, {}).wait();
+
+  // deallocate the temporary buffer
+  sycl::free(tmp, p.queue());
+}
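+
+// A minimal sketch (queue `q`, count `N`, and device pointers are
+// assumptions); unlike sycl_reduce, *res is overwritten rather than combined
+// with an initial value:
+//
+//   tf::syclDefaultExecutionPolicy p(q);
+//   tf::sycl_uninitialized_reduce(p, data, data + N, res, tf::sycl_maximum<int>{});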
+
+/**
+@brief performs asynchronous parallel reduction over a range of items
+       without an initial value
+
+@tparam P execution policy type
+@tparam I input iterator type
+@tparam T value type
+@tparam O binary operator type
+
+@param p execution policy
+@param first iterator to the beginning of the range
+@param last iterator to the end of the range
+@param res pointer to the result
+@param op binary operator to apply to reduce elements
+@param buf pointer to the temporary buffer
+
+@return an SYCL event
+
+Please refer to @ref SYCLSTDReduce for details.
+*/
+template<typename P, typename I, typename T, typename O>
+sycl::event sycl_uninitialized_reduce_async(
+  P&& p, I first, I last, T* res, O op, void* buf, std::vector<sycl::event> dep
+) {
+
+  unsigned count = std::distance(first, last);
+
+  if(count == 0) {
+    return {};
+  }
+
+  // reduction loop
+  return detail::sycl_reduce_loop(
+    p, first, count, res, op, false, buf, std::move(dep)
+  );
+}
+
+// ----------------------------------------------------------------------------
+// syclFlow reduce
+// ----------------------------------------------------------------------------
+
+// Function: reduce
+template <typename I, typename T, typename C>
+syclTask syclFlow::reduce(I first, I last, T* res, C&& op) {
+
+  //return on(_reduce_cgh<I, T, C, false>(first, last, res, std::forward<C>(op)));
+
+  auto bufsz = sycl_reduce_buffer_size<syclDefaultExecutionPolicy, T>(
+    std::distance(first, last)
+  );
+
+  return on([=, buf=MoC{syclScopedDeviceMemory<std::byte>(bufsz, _queue)}]
+  (sycl::queue& queue, std::vector<sycl::event> events) mutable {
+    syclDefaultExecutionPolicy p(queue);
+    return sycl_reduce_async(
+      p, first, last, res, op, buf.get().data(), std::move(events)
+    );
+  });
+}
+
+// Function: uninitialized_reduce
+template <typename I, typename T, typename C>
+syclTask syclFlow::uninitialized_reduce(I first, I last, T* res, C&& op) {
+  //return on(_reduce_cgh<I, T, C, true>(first, last, res, std::forward<C>(op)));
+
+  auto bufsz = sycl_reduce_buffer_size<syclDefaultExecutionPolicy, T>(
+    std::distance(first, last)
+  );
+
+  return on([=, buf=MoC{syclScopedDeviceMemory<std::byte>(bufsz, _queue)}]
+  (sycl::queue& queue, std::vector<sycl::event> events) mutable {
+    syclDefaultExecutionPolicy p(queue);
+    return sycl_uninitialized_reduce_async(
+      p, first, last, res, op, buf.get().data(), std::move(events)
+    );
+  });
+
+}
+
+// ----------------------------------------------------------------------------
+// rebind methods
+// ----------------------------------------------------------------------------
+
+//// Function: reduce
+//template <typename I, typename T, typename C>
+//void syclFlow::reduce(syclTask task, I first, I last, T* res, C&& op) {
+//  //on(task, _reduce_cgh<I, T, C, false>(
+//  //  first, last, res, std::forward<C>(op)
+//  //));
+//
+//  auto bufsz = sycl_reduce_buffer_size<syclDefaultExecutionPolicy, T>(
+//    std::distance(first, last)
+//  );
+//
+//  on(task, [=, buf=MoC{syclScopedDeviceMemory<std::byte>(bufsz, _queue)}]
+//  (sycl::queue& queue, std::vector<sycl::event> events) mutable {
+//    syclDefaultExecutionPolicy p(queue);
+//    return sycl_reduce_async(
+//      p, first, last, res, op, buf.get().data(), std::move(events)
+//    );
+//  });
+//}
+//
+//// Function: uninitialized_reduce
+//template <typename I, typename T, typename C>
+//void syclFlow::uninitialized_reduce(
+//  syclTask task, I first, I last, T* res, C&& op
+//) {
+//  //on(task, _reduce_cgh<I, T, C, true>(
+//  //  first, last, res, std::forward<C>(op)
+//  //));
+//  auto bufsz = sycl_reduce_buffer_size<syclDefaultExecutionPolicy, T>(
+//    std::distance(first, last)
+//  );
+//
+//  on(task, [=, buf=MoC{syclScopedDeviceMemory<std::byte>(bufsz, _queue)}]
+//  (sycl::queue& queue, std::vector<sycl::event> events) mutable {
+//    syclDefaultExecutionPolicy p(queue);
+//    return sycl_uninitialized_reduce_async(
+//      p, first, last, res, op, buf.get().data(), std::move(events)
+//    );
+//  });
+//}
+
+
+}  // end of namespace tf -----------------------------------------------------
+
+
diff --git a/myxpcs/include/taskflow_/sycl/algorithm/sycl_for_each.hpp b/myxpcs/include/taskflow_/sycl/algorithm/sycl_for_each.hpp
new file mode 100644
index 0000000..e61fa62
--- /dev/null
+++ b/myxpcs/include/taskflow_/sycl/algorithm/sycl_for_each.hpp
@@ -0,0 +1,88 @@
+#pragma once
+
+#include "../sycl_flow.hpp"
+
+namespace tf {
+
+// command group function object of for_each
+template <typename I, typename C>
+auto syclFlow::_for_each_cgh(I first, I last, C&& op) {
+
+  // TODO: special case N == 0?
+  size_t N = std::distance(first, last);
+  size_t B = _default_group_size(N);
+
+  return [=, op=std::forward<C>(op)] (sycl::handler& handler) mutable {
+    size_t _N = (N % B == 0) ? N : (N + B - N % B);
+    handler.parallel_for(
+      sycl::nd_range<1>{sycl::range<1>(_N), sycl::range<1>(B)},
+      [=] (sycl::nd_item<1> item) {
+        size_t i = item.get_global_id(0);
+        if(i < N) {
+          op(*(first + i));
+        }
+      }
+    );
+  };
+}
+
+// command group function object of for_each_index
+template <typename I, typename C>
+auto syclFlow::_for_each_index_cgh(I first, I last, I step, C&& op) {
+
+  if(is_range_invalid(first, last, step)) {
+    TF_THROW("invalid range [", first, ", ", last, ") with step size ", step);
+  }
+
+  // TODO: special case when N is 0?
+  size_t N = distance(first, last, step);
+  size_t B = _default_group_size(N);
+
+  return [=, op=std::forward<C>(op)] (sycl::handler& handler) mutable {
+    size_t _N = (N % B == 0) ? N : (N + B - N % B);
+    handler.parallel_for(
+      sycl::nd_range<1>{sycl::range<1>(_N), sycl::range<1>(B)},
+      [=] (sycl::nd_item<1> item) {
+        size_t i = item.get_global_id(0);
+        if(i < N) {
+          op(static_cast<I>(i)*step + first);
+        }
+      }
+    );
+  };
+}
+
+// ----------------------------------------------------------------------------
+// for_each and for_each_index algorithms
+// ----------------------------------------------------------------------------
+
+// Function: for_each
+template <typename I, typename C>
+syclTask syclFlow::for_each(I first, I last, C&& op) {
+  return on(_for_each_cgh(first, last, std::forward<C>(op)));
+}
+
+// Function: for_each_index
+template <typename I, typename C>
+syclTask syclFlow::for_each_index(I beg, I end, I inc, C&& op) {
+  return on(_for_each_index_cgh(beg, end, inc, std::forward<C>(op)));
+}
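+
+// A minimal sketch (the syclFlow `sf`, the device pointer `data`, and the
+// count `N` are assumptions):
+//
+//   sf.for_each(data, data + N, [](int& x){ x = 1; });        // set every element
+//   sf.for_each_index(0, N, 2, [=](int i){ data[i] = -1; });  // indices 0, 2, 4, ...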
+
+// ----------------------------------------------------------------------------
+// rebind
+// ----------------------------------------------------------------------------
+
+// Function: for_each
+template <typename I, typename C>
+void syclFlow::for_each(syclTask task, I first, I last, C&& op) {
+  on(task, _for_each_cgh(first, last, std::forward<C>(op)));
+}
+
+// Function: for_each_index
+template <typename I, typename C>
+void syclFlow::for_each_index(syclTask task, I beg, I end, I inc, C&& op) {
+  on(task, _for_each_index_cgh(beg, end, inc, std::forward<C>(op)));
+}
+
+
+}  // end of namespace tf -----------------------------------------------------
diff --git a/myxpcs/include/taskflow_/sycl/algorithm/sycl_transform.hpp b/myxpcs/include/taskflow_/sycl/algorithm/sycl_transform.hpp
new file mode 100644
index 0000000..b4372e2
--- /dev/null
+++ b/myxpcs/include/taskflow_/sycl/algorithm/sycl_transform.hpp
@@ -0,0 +1,46 @@
+#pragma once
+
+#include "../sycl_flow.hpp"
+
+namespace tf {
+
+// Function: _transform_cgh
+template <typename I, typename C, typename... S>
+auto syclFlow::_transform_cgh(I first, I last, C&& op, S... srcs) {
+
+  // TODO: special case N == 0?
+  size_t N = std::distance(first, last);
+  size_t B = _default_group_size(N);
+
+  return [=, op=std::forward<C>(op)] (sycl::handler& handler) mutable {
+
+    size_t _N = (N % B == 0) ? N : (N + B - N % B);
+
+    handler.parallel_for(
+      sycl::nd_range<1>{sycl::range<1>(_N), sycl::range<1>(B)},
+      [=] (sycl::nd_item<1> item) {
+        size_t i = item.get_global_id(0);
+        if(i < N) {
+          *(first + i) = op(*(srcs + i)...);
+        }
+      }
+    );
+  };
+}
+
+// Function: transform
+template <typename I, typename C, typename... S>
+syclTask syclFlow::transform(I first, I last, C&& op, S... srcs) {
+  return on(_transform_cgh(first, last, std::forward<C>(op), srcs...));
+}
+
+// Procedure: transform
+template <typename I, typename C, typename... S>
+void syclFlow::transform(
+  syclTask task, I first, I last, C&& op, S... srcs
+) {
+  on(task, _transform_cgh(first, last, std::forward<C>(op), srcs...));
+}
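+
+// A minimal sketch (the syclFlow `sf`, the device pointers `out`, `a`, `b`,
+// and the count `N` are assumptions): element-wise out[i] = a[i] + b[i].
+//
+//   sf.transform(out, out + N, [](int x, int y){ return x + y; }, a, b);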
+
+
+}  // end of namespace tf -----------------------------------------------------
diff --git a/myxpcs/include/taskflow_/sycl/sycl_execution_policy.hpp b/myxpcs/include/taskflow_/sycl/sycl_execution_policy.hpp
new file mode 100644
index 0000000..ceee08a
--- /dev/null
+++ b/myxpcs/include/taskflow_/sycl/sycl_execution_policy.hpp
@@ -0,0 +1,70 @@
+#pragma once
+
+/**
+@file sycl_execution_policy.hpp
+@brief SYCL execution policy include file
+*/
+
+namespace tf {
+
+/**
+@class syclExecutionPolicy
+
+@brief class to define execution policy for SYCL standard algorithms
+
+@tparam NT number of threads per block
+@tparam VT number of work units per thread
+
+Execution policy configures the kernel execution parameters in SYCL algorithms.
+The first template argument, @c NT, is the number of threads per block and
+must always be a power of two.
+The second template argument, @c VT, is the number of work units per thread;
+an odd number is recommended to avoid bank conflicts.
+
+Details can be referred to @ref SYCLSTDExecutionPolicy.
+*/
+template<unsigned NT, unsigned VT>
+class syclExecutionPolicy {
+
+  static_assert(is_pow2(NT), "max # threads per block must be a power of two");
+
+  public:
+
+  /** @brief static constant for getting the number of threads per block */
+  const static unsigned nt = NT;
+
+  /** @brief static constant for getting the number of work units per thread */
+  const static unsigned vt = VT;
+
+  /** @brief static constant for getting the number of elements to process per block */
+  const static unsigned nv = NT*VT;
+
+  /**
+  @brief constructs an execution policy object with the given queue
+   */
+  syclExecutionPolicy(sycl::queue& queue) : _queue{queue} {}
+
+  /**
+  @brief returns a mutable reference to the associated queue
+   */
+  sycl::queue& queue() noexcept { return _queue; }
+
+  /**
+  @brief returns an immutable reference to the associated queue
+   */
+  const sycl::queue& queue() const noexcept { return _queue; }
+
+  private:
+
+  sycl::queue& _queue;
+};
+
+/**
+@brief default execution policy
+ */
+using syclDefaultExecutionPolicy = syclExecutionPolicy<512, 9>;
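+
+// A hedged sketch of a custom policy, assuming a live sycl::queue `q`:
+//
+//   tf::syclExecutionPolicy<256, 7> policy(q);  // 256 threads/block, 7 items/thread
+//   // policy.nv == 1792 elements are processed per block; policy.queue() is q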
+
+}  // end of namespace tf -----------------------------------------------------
+
+
+
diff --git a/myxpcs/include/taskflow_/sycl/sycl_graph.hpp b/myxpcs/include/taskflow_/sycl/sycl_graph.hpp
new file mode 100644
index 0000000..3a6f786
--- /dev/null
+++ b/myxpcs/include/taskflow_/sycl/sycl_graph.hpp
@@ -0,0 +1,255 @@
+#pragma once
+
+#include <CL/sycl.hpp>
+
+#include "sycl_meta.hpp"
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// syclGraph class
+// ----------------------------------------------------------------------------
+
+// class: syclGraph
+class syclGraph : public CustomGraphBase {
+
+  friend class syclNode;
+  friend class syclTask;
+  friend class syclFlow;
+  friend class Taskflow;
+  friend class Executor;
+
+  constexpr static int OFFLOADED = 0x01;
+  constexpr static int TOPOLOGY_CHANGED = 0x02;
+
+  public:
+
+    syclGraph() = default;
+    ~syclGraph() = default;
+
+    syclGraph(const syclGraph&) = delete;
+    syclGraph(syclGraph&&);
+
+    syclGraph& operator = (const syclGraph&) = delete;
+    syclGraph& operator = (syclGraph&&);
+
+    template <typename... ArgsT>
+    syclNode* emplace_back(ArgsT&&...);
+
+    bool empty() const;
+
+    void clear();
+    void dump(std::ostream&, const void*, const std::string&) const override final;
+
+  private:
+
+    int _state {0};
+
+    std::vector<std::unique_ptr<syclNode>> _nodes;
+};
+
+// ----------------------------------------------------------------------------
+// syclNode definitions
+// ----------------------------------------------------------------------------
+
+// class: syclNode
+class syclNode {
+
+  friend class syclGraph;
+  friend class syclTask;
+  friend class syclFlow;
+  friend class Taskflow;
+  friend class Executor;
+
+  struct Empty {
+  };
+
+  struct CGH {
+
+    std::function<void(sycl::handler&)> work;
+
+    template <typename F>
+    CGH(F&& func) : work {std::forward<F>(func)} {}
+  };
+
+  using handle_t = std::variant<
+    Empty,
+    CGH
+  >;
+
+  public:
+
+  // variant index
+  constexpr static auto EMPTY = get_index_v<Empty, handle_t>;
+  constexpr static auto COMMAND_GROUP_HANDLER = get_index_v<CGH, handle_t>;
+
+    syclNode() = delete;
+
+    template <typename... ArgsT>
+    syclNode(syclGraph&, ArgsT&&...);
+
+  private:
+
+    syclGraph& _graph;
+
+    std::string _name;
+
+    int _level;
+
+    sycl::event _event;
+
+    handle_t _handle;
+
+    SmallVector<syclNode*> _successors;
+    SmallVector<syclNode*> _dependents;
+
+    void _precede(syclNode*);
+};
+
+// ----------------------------------------------------------------------------
+// syclNode definitions
+// ----------------------------------------------------------------------------
+
+// Constructor
+template <typename... ArgsT>
+syclNode::syclNode(syclGraph& g, ArgsT&&... args) :
+  _graph  {g},
+  _handle {std::forward<ArgsT>(args)...} {
+}
+
+// Procedure: _precede
+inline void syclNode::_precede(syclNode* v) {
+  _graph._state |= syclGraph::TOPOLOGY_CHANGED;
+  _successors.push_back(v);
+  v->_dependents.push_back(this);
+}
+
+// ----------------------------------------------------------------------------
+// syclGraph definitions
+// ----------------------------------------------------------------------------
+
+// Move constructor
+inline syclGraph::syclGraph(syclGraph&& g) :
+  _nodes {std::move(g._nodes)} {
+
+  assert(g._nodes.empty());
+}
+
+// Move assignment
+inline syclGraph& syclGraph::operator = (syclGraph&& rhs) {
+
+  // lhs
+  _nodes = std::move(rhs._nodes);
+
+  assert(rhs._nodes.empty());
+
+  return *this;
+}
+
+// Function: empty
+inline bool syclGraph::empty() const {
+  return _nodes.empty();
+}
+
+// Procedure: clear
+inline void syclGraph::clear() {
+  _state = syclGraph::TOPOLOGY_CHANGED;
+  _nodes.clear();
+}
+
+// Function: emplace_back
+template <typename... ArgsT>
+syclNode* syclGraph::emplace_back(ArgsT&&... args) {
+
+  _state |= syclGraph::TOPOLOGY_CHANGED;
+
+  auto node = std::make_unique<syclNode>(std::forward<ArgsT>(args)...);
+  _nodes.emplace_back(std::move(node));
+  return _nodes.back().get();
+
+  // TODO: object pool
+
+  //auto node = new syclNode(std::forward<ArgsT>(args)...);
+  //_nodes.push_back(node);
+  //return node;
+}
+
+// Procedure: dump the graph to a DOT format
+inline void syclGraph::dump(
+  std::ostream& os, const void* root, const std::string& root_name
+) const {
+
+  // recursive dump with stack
+  std::stack<std::tuple<const syclGraph*, const syclNode*, int>> stack;
+  stack.push(std::make_tuple(this, nullptr, 1));
+
+  int pl = 0;
+
+  while(!stack.empty()) {
+
+    auto [graph, parent, l] = stack.top();
+    stack.pop();
+
+    for(int i=0; i<pl-l+1; i++) {
+      os << "}\n";
+    }
+
+    if(parent == nullptr) {
+      if(root) {
+        os << "subgraph cluster_p" << root << " {\nlabel=\"syclFlow: ";
+        if(root_name.empty()) os << 'p' << root;
+        else os << root_name;
+        os << "\";\n" << "color=\"red\"\n";
+      }
+      else {
+        os << "digraph syclFlow {\n";
+      }
+    }
+    else {
+      os << "subgraph cluster_p" << parent << " {\nlabel=\"syclSubflow: ";
+      if(parent->_name.empty()) os << 'p' << parent;
+      else os << parent->_name;
+      os << "\";\n" << "color=\"purple\"\n";
+    }
+
+    for(auto& v : graph->_nodes) {
+
+      os << 'p' << v.get() << "[label=\"";
+      if(v->_name.empty()) {
+        os << 'p' << v.get() << "\"";
+      }
+      else {
+        os << v->_name << "\"";
+      }
+      os << "];\n";
+
+      for(const auto s : v->_successors) {
+        os << 'p' << v.get() << " -> " << 'p' << s << ";\n";
+      }
+
+      if(v->_successors.size() == 0) {
+        if(parent == nullptr) {
+          if(root) {
+            os << 'p' << v.get() << " -> p" << root << ";\n";
+          }
+        }
+        else {
+          os << 'p' << v.get() << " -> p" << parent << ";\n";
+        }
+      }
+    }
+
+    // set the previous level
+    pl = l;
+  }
+
+  for(int i=0; i<pl; i++) {
+    os << "}\n";
+  }
+
+}
+
+
+}  // end of namespace tf -----------------------------------------------------
+
+
diff --git a/myxpcs/include/taskflow_/sycl/sycl_meta.hpp b/myxpcs/include/taskflow_/sycl/sycl_meta.hpp
new file mode 100644
index 0000000..b3c4af1
--- /dev/null
+++ b/myxpcs/include/taskflow_/sycl/sycl_meta.hpp
@@ -0,0 +1,517 @@
+#pragma once
+
+#include "sycl_execution_policy.hpp"
+
+namespace tf {
+
+// default warp size
+inline constexpr unsigned SYCL_WARP_SIZE = 32;
+
+// empty type
+struct syclEmpty { };
+
+// ----------------------------------------------------------------------------
+// iterator unrolling
+// ----------------------------------------------------------------------------
+
+// Template unrolled looping construct.
+template<unsigned i, unsigned count, bool valid = (i < count)>
+struct syclIterate {
+  template<typename F>
+  static void eval(F f) {
+    f(i);
+    syclIterate<i + 1, count>::eval(f);
+  }
+};
+
+template<unsigned i, unsigned count>
+struct syclIterate<i, count, false> {
+  template<typename F>
+  static void eval(F) { }
+};
+
+template<unsigned begin, unsigned end, typename F>
+void sycl_iterate(F f) {
+  syclIterate<begin, end>::eval(f);
+}
+
+template<unsigned count, typename F>
+void sycl_iterate(F f) {
+  sycl_iterate<0, count>(f);
+}
+
+template<unsigned count, typename T>
+T reduce(const T(&x)[count]) {
+  T y;
+  sycl_iterate<count>([&](auto i) { y = i ? x[i] + y : x[i]; });
+  return y;
+}
+
+template<unsigned count, typename T>
+void fill(T(&x)[count], T val) {
+  sycl_iterate<count>([&](auto i) { x[i] = val; });
+}
+
+// Invoke unconditionally.
+template<unsigned nt, unsigned vt, typename F>
+void sycl_strided_iterate(F f, unsigned tid) {
+  sycl_iterate<vt>([=](auto i) { f(i, nt * i + tid); });
+}
+
+// Check range.
+template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename F>
+void sycl_strided_iterate(F f, unsigned tid, unsigned count) {
+  // Unroll the first vt0 elements of each thread.
+  if(vt0 > 1 && count >= nt * vt0) {
+    sycl_strided_iterate<nt, vt0>(f, tid);    // No checking
+  } else {
+    sycl_iterate<vt0>([=](auto i) {
+      auto j = nt * i + tid;
+      if(j < count) f(i, j);
+    });
+  }
+
+  // TODO: seems dummy when vt0 == vt
+  sycl_iterate<vt0, vt>([=](auto i) {
+    auto j = nt * i + tid;
+    if(j < count) f(i, j);
+  });
+}
+
+template<unsigned vt, typename F>
+void sycl_thread_iterate(F f, unsigned tid) {
+  sycl_iterate<vt>([=](auto i) { f(i, vt * tid + i); });
+}
+
+// ----------------------------------------------------------------------------
+// syclRange
+// ----------------------------------------------------------------------------
+
+// syclRange
+struct syclRange {
+  unsigned begin, end;
+  unsigned size() const { return end - begin; }
+  unsigned count() const { return size(); }
+  bool valid() const { return end > begin; }
+};
+
+inline syclRange sycl_get_tile(unsigned b, unsigned nv, unsigned count) {
+  return syclRange { nv * b, std::min(count, nv * (b + 1)) };
+}
+
+
+// ----------------------------------------------------------------------------
+// syclArray
+// ----------------------------------------------------------------------------
+
+template<typename T, unsigned size>
+struct syclArray {
+  T data[size];
+
+  T operator[](unsigned i) const { return data[i]; }
+  T& operator[](unsigned i) { return data[i]; }
+
+  syclArray() = default;
+  syclArray(const syclArray&) = default;
+  syclArray& operator=(const syclArray&) = default;
+
+  // Fill the array with x.
+  syclArray(T x) {
+    sycl_iterate<size>([&](unsigned i) { data[i] = x; });
+  }
+};
+
+template<typename T>
+struct syclArray<T, 0> {
+  T operator[](unsigned) const { return T(); }
+  T& operator[](unsigned) { return *(T*)nullptr; }
+};
+
+template<typename T, typename V, unsigned size>
+struct syclKVArray {
+  syclArray<T, size> keys;
+  syclArray<V, size> vals;
+};
+
+// ----------------------------------------------------------------------------
+// thread reg <-> global mem
+// ----------------------------------------------------------------------------
+
+template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename I>
+auto sycl_mem_to_reg_strided(I mem, unsigned tid, unsigned count) {
+  using T = typename std::iterator_traits<I>::value_type;
+  syclArray<T, vt> x;
+  sycl_strided_iterate<nt, vt, vt0>(
+    [&](auto i, auto j) { x[i] = mem[j]; }, tid, count
+  );
+  return x;
+}
+
+template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename T, typename it_t>
+void sycl_reg_to_mem_strided(
+  syclArray<T, vt> x, unsigned tid, unsigned count, it_t mem) {
+
+  sycl_strided_iterate<nt, vt, vt0>(
+    [=](auto i, auto j) { mem[j] = x[i]; }, tid, count
+  );
+}
+
+template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename I, typename O>
+auto sycl_transform_mem_to_reg_strided(
+  I mem, unsigned tid, unsigned count, O op
+) {
+  using T = std::invoke_result_t<O, typename std::iterator_traits<I>::value_type>;
+  syclArray<T, vt> x;
+  sycl_strided_iterate<nt, vt, vt0>(
+    [&](auto i, auto j) { x[i] = op(mem[j]); }, tid, count
+  );
+  return x;
+}
+
+// ----------------------------------------------------------------------------
+// thread reg <-> shared
+// ----------------------------------------------------------------------------
+
+//template<unsigned nt, unsigned vt, typename T, unsigned shared_size>
+//void sycl_reg_to_shared_thread(
+//  syclArray<T, vt> x, unsigned tid, T (&shared)[shared_size], bool sync = true
+//) {
+//
+//  static_assert(shared_size >= nt * vt,
+//    "reg_to_shared_thread must have at least nt * vt storage");
+//
+//  sycl_thread_iterate<vt>([&](auto i, auto j) { shared[j] = x[i]; }, tid);
+//
+//  if(sync) __syncthreads();
+//}
+//
+//template<unsigned nt, unsigned vt, typename T, unsigned shared_size>
+//auto sycl_shared_to_reg_thread(
+//  const T (&shared)[shared_size], unsigned tid, bool sync = true
+//) {
+//
+//  static_assert(shared_size >= nt * vt,
+//    "reg_to_shared_thread must have at least nt * vt storage");
+//
+//  syclArray<T, vt> x;
+//  sycl_thread_iterate<vt>([&](auto i, auto j) {
+//    x[i] = shared[j];
+//  }, tid);
+//
+//  if(sync) __syncthreads();
+//
+//  return x;
+//}
+//
+//template<unsigned nt, unsigned vt, typename T, unsigned shared_size>
+//void sycl_reg_to_shared_strided(
+//  syclArray<T, vt> x, unsigned tid, T (&shared)[shared_size], bool sync = true
+//) {
+//
+//  static_assert(shared_size >= nt * vt,
+//    "reg_to_shared_strided must have at least nt * vt storage");
+//
+//  sycl_strided_iterate<nt, vt>(
+//    [&](auto i, auto j) { shared[j] = x[i]; }, tid
+//  );
+//
+//  if(sync) __syncthreads();
+//}
+//
+//template<unsigned nt, unsigned vt, typename T, unsigned shared_size>
+//auto sycl_shared_to_reg_strided(
+//  const T (&shared)[shared_size], unsigned tid, bool sync = true
+//) {
+//
+//  static_assert(shared_size >= nt * vt,
+//    "shared_to_reg_strided must have at least nt * vt storage");
+//
+//  syclArray<T, vt> x;
+//  sycl_strided_iterate<nt, vt>([&](auto i, auto j) { x[i] = shared[j]; }, tid);
+//  if(sync) __syncthreads();
+//
+//  return x;
+//}
+//
+//template<
+//  unsigned nt, unsigned vt, unsigned vt0 = vt, typename T, typename it_t,
+//  unsigned shared_size
+//>
+//auto sycl_reg_to_mem_thread(
+//  syclArray<T, vt> x, unsigned tid,
+//  unsigned count, it_t mem, T (&shared)[shared_size]
+//) {
+//  sycl_reg_to_shared_thread<nt>(x, tid, shared);
+//  auto y = sycl_shared_to_reg_strided<nt, vt>(shared, tid);
+//  sycl_reg_to_mem_strided<nt, vt, vt0>(y, tid, count, mem);
+//}
+//
+//template<
+//  unsigned nt, unsigned vt, unsigned vt0 = vt, typename T, typename it_t,
+//  unsigned shared_size
+//>
+//auto sycl_mem_to_reg_thread(
+//  it_t mem, unsigned tid, unsigned count, T (&shared)[shared_size]
+//) {
+//
+//  auto x = sycl_mem_to_reg_strided<nt, vt, vt0>(mem, tid, count);
+//  sycl_reg_to_shared_strided<nt, vt>(x, tid, shared);
+//  auto y = sycl_shared_to_reg_thread<nt, vt>(shared, tid);
+//  return y;
+//}
+//
+//template<unsigned nt, unsigned vt, typename T, unsigned S>
+//auto sycl_shared_gather(
+//  const T(&data)[S], syclArray<unsigned, vt> indices, bool sync = true
+//) {
+//
+//  static_assert(S >= nt * vt,
+//    "shared_gather must have at least nt * vt storage");
+//
+//  syclArray<T, vt> x;
+//  sycl_iterate<vt>([&](auto i) { x[i] = data[indices[i]]; });
+//
+//  if(sync) __syncthreads();
+//
+//  return x;
+//}
+//
+//
+//
+//// ----------------------------------------------------------------------------
+//// reg<->reg
+//// ----------------------------------------------------------------------------
+//
+//template<unsigned nt, unsigned vt, typename T, unsigned S>
+//auto sycl_reg_thread_to_strided(
+//  syclArray<T, vt> x, unsigned tid, T (&shared)[S]
+//) {
+//  sycl_reg_to_shared_thread<nt>(x, tid, shared);
+//  return sycl_shared_to_reg_strided<nt, vt>(shared, tid);
+//}
+//
+//template<unsigned nt, unsigned vt, typename T, unsigned S>
+//auto sycl_reg_strided_to_thread(
+//  syclArray<T, vt> x, unsigned tid, T (&shared)[S]
+//) {
+//  sycl_reg_to_shared_strided<nt>(x, tid, shared);
+//  return sycl_shared_to_reg_thread<nt, vt>(shared, tid);
+//}
+
+// ----------------------------------------------------------------------------
+// syclLoadStoreIterator
+// ----------------------------------------------------------------------------
+
+template<typename L, typename S, typename T, typename I>
+struct syclLoadStoreIterator : std::iterator_traits<const T*> {
+
+  L load;
+  S store;
+  I base;
+
+  syclLoadStoreIterator(L load_, S store_, I base_) :
+    load(load_), store(store_), base(base_) { }
+
+  struct assign_t {
+    L load;
+    S store;
+    I index;
+
+    assign_t& operator=(T rhs) {
+      static_assert(!std::is_same<S, syclEmpty>::value,
+        "load_iterator is being stored to.");
+      store(rhs, index);
+      return *this;
+    }
+    operator T() const {
+      static_assert(!std::is_same<L, syclEmpty>::value,
+        "store_iterator is being loaded from.");
+      return load(index);
+    }
+  };
+
+  assign_t operator[](I index) const {
+    return assign_t { load, store, base + index };
+  }
+  assign_t operator*() const {
+    return assign_t { load, store, base };
+  }
+
+  syclLoadStoreIterator operator+(I offset) const {
+    syclLoadStoreIterator cp = *this;
+    cp += offset;
+    return cp;
+  }
+
+  syclLoadStoreIterator& operator+=(I offset) {
+    base += offset;
+    return *this;
+  }
+
+  syclLoadStoreIterator operator-(I offset) const {
+    syclLoadStoreIterator cp = *this;
+    cp -= offset;
+    return cp;
+  }
+
+  syclLoadStoreIterator& operator-=(I offset) {
+    base -= offset;
+    return *this;
+  }
+};
+
+//template<typename T>
+//struct trivial_load_functor {
+//  template<typename I>
+//  T operator()(I index) const {
+//    return T();
+//  }
+//};
+
+//template<typename T>
+//struct trivial_store_functor {
+//  template<typename I>
+//  void operator()(T v, I index) const { }
+//};
+
+template <typename T, typename I = int, typename L, typename S>
+auto sycl_make_load_store_iterator(L load, S store, I base = 0) {
+  return syclLoadStoreIterator<L, S, T, I>(load, store, base);
+}
+
+template <typename T, typename I = int, typename L>
+auto sycl_make_load_iterator(L load, I base = 0) {
+  return sycl_make_load_store_iterator<T>(load, syclEmpty(), base);
+}
+
+template <typename T, typename I = int, typename S>
+auto sycl_make_store_iterator(S store, I base = 0) {
+  return sycl_make_load_store_iterator<T>(syclEmpty(), store, base);
+}
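+
+// A usage sketch (illustrative only; `buf` and the lambdas below are
+// assumptions, not part of this header). The adapters wrap arbitrary
+// load/store callables so algorithms can read and write through them as
+// if they were raw pointers:
+//
+//   std::vector<float> buf(16, 1.0f);
+//   auto in  = sycl_make_load_iterator<float>(
+//     [&](int i) { return 2.0f * buf[i]; });       // reads double the value
+//   auto out = sycl_make_store_iterator<float>(
+//     [&](float v, int i) { buf[i] += v; });       // writes accumulate
+//   out[3] = in[3];                                // buf[3] += 2 * buf[3]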
+
+// ----------------------------------------------------------------------------
+// swap
+// ----------------------------------------------------------------------------
+
+template<typename T>
+void sycl_swap(T& a, T& b) {
+  auto c = a;
+  a = b;
+  b = c;
+}
+
+// ----------------------------------------------------------------------------
+// launch kernel
+// ----------------------------------------------------------------------------
+
+//template<typename F, typename... args_t>
+//__global__ void sycl_kernel(F f, args_t... args) {
+//  f(threadIdx.x, blockIdx.x, args...);
+//}
+
+// ----------------------------------------------------------------------------
+// operators
+// ----------------------------------------------------------------------------
+
+template <typename T>
+struct sycl_plus {
+  T operator()(T a, T b) const { return a + b; }
+};
+
+template <typename T>
+struct sycl_minus {
+  T operator()(T a, T b) const { return a - b; }
+};
+
+template <typename T>
+struct sycl_multiplies {
+  T operator()(T a, T b) const { return a * b; }
+};
+
+template <typename T>
+struct sycl_maximum {
+  T operator()(T a, T b) const { return a > b ? a : b; }
+};
+
+template <typename T>
+struct sycl_minimum {
+  T operator()(T a, T b) const { return a < b ? a : b; }
+};
+
+template <typename T>
+struct sycl_less {
+  T operator()(T a, T b) const { return a < b; }
+};
+
+template <typename T>
+struct sycl_greater {
+  T operator()(T a, T b) const { return a > b; }
+};
+
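+// Example (illustrative): these functors can be passed wherever a binary
+// operator is expected, e.g. as the reduction operator of tf::syclFlow::reduce.
+//
+//   float s = sycl_plus<float>{}(1.0f, 2.0f);     // 3.0f
+//   float m = sycl_maximum<float>{}(1.0f, 2.0f);  // 2.0f
+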
+// ----------------------------------------------------------------------------
+// Memory Object
+// ----------------------------------------------------------------------------
+
+/**
+@private
+*/
+template <typename T>
+class syclScopedDeviceMemory {
+
+  public:
+
+    syclScopedDeviceMemory() = delete;
+
+    syclScopedDeviceMemory(size_t N, sycl::queue& queue) : 
+      _queue {queue},
+      _N {N} {
+      if(N) {
+        _data = sycl::malloc_device<T>(N, _queue);
+      }
+    }
+
+    syclScopedDeviceMemory(syclScopedDeviceMemory&& rhs) :
+      _queue{std::move(rhs._queue)}, _data{rhs._data}, _N {rhs._N} {
+      rhs._data = nullptr;
+      rhs._N    = 0;
+    }
+
+    ~syclScopedDeviceMemory() {
+      if(_data) {
+        sycl::free(_data, _queue);
+      }
+    }
+
+    syclScopedDeviceMemory& operator = (syclScopedDeviceMemory&& rhs) {
+      if(_data) {
+        sycl::free(_data, _queue);
+      }
+      _queue = std::move(rhs._queue);
+      _data  = rhs._data;
+      _N     = rhs._N;
+      rhs._data = nullptr;
+      rhs._N    = 0;
+      return *this;
+    }
+
+    size_t size() const { return _N; }
+
+    T* data() { return _data; }
+    const T* data() const { return _data; }
+
+    syclScopedDeviceMemory(const syclScopedDeviceMemory&) = delete;
+    syclScopedDeviceMemory& operator = (const syclScopedDeviceMemory&) = delete;
+
+  private:
+
+    sycl::queue& _queue;
+
+    T* _data  {nullptr};
+    size_t _N {0};
+};
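+
+// A usage sketch (illustrative; `host` is an assumed host-side buffer):
+//
+//   sycl::queue q;
+//   std::vector<float> host(1024, 1.0f);
+//   tf::syclScopedDeviceMemory<float> buf(1024, q);   // USM device allocation
+//   q.memcpy(buf.data(), host.data(), 1024*sizeof(float)).wait();
+//   // the device memory is freed automatically when buf goes out of scope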
+
+
+}  // end of namespace tf -----------------------------------------------------
+
+
+
diff --git a/myxpcs/include/taskflow_/sycl/sycl_task.hpp b/myxpcs/include/taskflow_/sycl/sycl_task.hpp
new file mode 100644
index 0000000..ed83ef4
--- /dev/null
+++ b/myxpcs/include/taskflow_/sycl/sycl_task.hpp
@@ -0,0 +1,209 @@
+#pragma once
+
+#include "sycl_graph.hpp"
+
+/**
+@file sycl_task.hpp
+@brief syclTask include file
+*/
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// syclTask
+// ----------------------------------------------------------------------------
+
+/**
+@class syclTask
+
+@brief handle to a node of the internal SYCL graph
+*/
+class syclTask {
+
+  friend class syclFlow;
+
+  friend std::ostream& operator << (std::ostream&, const syclTask&);
+
+  public:
+
+    /**
+    @brief constructs an empty syclTask
+    */
+    syclTask() = default;
+
+    /**
+    @brief copy-constructs a syclTask
+    */
+    syclTask(const syclTask&) = default;
+
+    /**
+    @brief copy-assigns a syclTask
+    */
+    syclTask& operator = (const syclTask&) = default;
+
+    /**
+    @brief adds precedence links from this to other tasks
+
+    @tparam Ts parameter pack
+
+    @param tasks one or multiple tasks
+
+    @return @c *this
+    */
+    template <typename... Ts>
+    syclTask& precede(Ts&&... tasks);
+
+    /**
+    @brief adds precedence links from other tasks to this
+
+    @tparam Ts parameter pack
+
+    @param tasks one or multiple tasks
+
+    @return @c *this
+    */
+    template <typename... Ts>
+    syclTask& succeed(Ts&&... tasks);
+
+    /**
+    @brief assigns a name to the task
+
+    @param name a @std_string acceptable string
+
+    @return @c *this
+    */
+    syclTask& name(const std::string& name);
+
+    /**
+    @brief queries the name of the task
+    */
+    const std::string& name() const;
+
+    /**
+    @brief queries the number of successors
+    */
+    size_t num_successors() const;
+
+    /**
+    @brief queries the number of dependents
+    */
+    size_t num_dependents() const;
+
+    /**
+    @brief queries if the task is associated with a syclNode
+    */
+    bool empty() const;
+
+    /**
+    @brief dumps the task through an output stream
+
+    @tparam T output stream type with insertion operator (<<) defined
+    @param ostream an output stream target
+    */
+    template <typename T>
+    void dump(T& ostream) const;
+
+    /**
+    @brief applies a visitor callable to each successor of the task
+    */
+    template <typename V>
+    void for_each_successor(V&& visitor) const;
+
+    /**
+    @brief applies a visitor callable to each dependent of the task
+    */
+    template <typename V>
+    void for_each_dependent(V&& visitor) const;
+
+  private:
+
+    syclTask(syclNode*);
+
+    syclNode* _node {nullptr};
+};
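+
+// A usage sketch (illustrative; assumes a syclFlow `sf` and valid USM
+// pointers `d_ptr`/`h_ptr` of `bytes` bytes):
+//
+//   tf::syclTask h2d  = sf.memcpy(d_ptr, h_ptr, bytes).name("h2d");
+//   tf::syclTask kern = sf.single_task([=](){ /* kernel body */ }).name("kernel");
+//   tf::syclTask d2h  = sf.memcpy(h_ptr, d_ptr, bytes).name("d2h");
+//   kern.succeed(h2d)    // h2d runs before kern
+//       .precede(d2h);   // kern runs before d2h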
+
+// Constructor
+inline syclTask::syclTask(syclNode* node) : _node {node} {
+}
+
+// Function: precede
+template <typename... Ts>
+syclTask& syclTask::precede(Ts&&... tasks) {
+  (_node->_precede(tasks._node), ...);
+  return *this;
+}
+
+// Function: succeed
+template <typename... Ts>
+syclTask& syclTask::succeed(Ts&&... tasks) {
+  (tasks._node->_precede(_node), ...);
+  return *this;
+}
+
+// Function: empty
+inline bool syclTask::empty() const {
+  return _node == nullptr;
+}
+
+// Function: name
+inline syclTask& syclTask::name(const std::string& name) {
+  _node->_name = name;
+  return *this;
+}
+
+// Function: name
+inline const std::string& syclTask::name() const {
+  return _node->_name;
+}
+
+// Function: num_successors
+inline size_t syclTask::num_successors() const {
+  return _node->_successors.size();
+}
+
+// Function: num_dependents
+inline size_t syclTask::num_dependents() const {
+  return _node->_dependents.size();
+}
+
+// Procedure: dump
+template <typename T>
+void syclTask::dump(T& os) const {
+  os << "syclTask ";
+  if(_node->_name.empty()) os << _node;
+  else os << _node->_name;
+}
+
+// Function: for_each_successor
+template <typename V>
+void syclTask::for_each_successor(V&& visitor) const {
+  for(size_t i=0; i<_node->_successors.size(); ++i) {
+    visitor(syclTask(_node->_successors[i]));
+  }
+}
+
+// Function: for_each_dependent
+template <typename V>
+void syclTask::for_each_dependent(V&& visitor) const {
+  for(size_t i=0; i<_node->_dependents.size(); ++i) {
+    visitor(syclTask(_node->_dependents[i]));
+  }
+}
+
+
+// ----------------------------------------------------------------------------
+// global ostream
+// ----------------------------------------------------------------------------
+
+/**
+@brief overload of ostream inserter operator for syclTask
+*/
+inline std::ostream& operator << (std::ostream& os, const syclTask& ct) {
+  ct.dump(os);
+  return os;
+}
+
+}  // end of namespace tf -----------------------------------------------------
+
+
+
diff --git a/myxpcs/include/taskflow_/sycl/syclflow.hpp b/myxpcs/include/taskflow_/sycl/syclflow.hpp
new file mode 100644
index 0000000..a2a0976
--- /dev/null
+++ b/myxpcs/include/taskflow_/sycl/syclflow.hpp
@@ -0,0 +1,684 @@
+#pragma once
+
+#include "../taskflow.hpp"
+#include "sycl_task.hpp"
+
+/**
+@file syclflow.hpp
+@brief main syclFlow include file
+*/
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// class definition: syclFlow
+// ----------------------------------------------------------------------------
+
+/**
+@class syclFlow
+
+@brief class for building a SYCL task dependency graph
+
+*/
+class syclFlow {
+
+  friend class Executor;
+
+  struct External {
+    syclGraph graph;
+  };
+
+  struct Internal {
+    Executor& executor;
+    Internal(Executor& e) : executor {e} {}
+  };
+
+  using handle_t = std::variant<External, Internal>;
+
+  public:
+   
+    /**
+    @brief constructs a standalone %syclFlow from the given queue
+
+    A standalone %syclFlow does not go through any taskflow and
+    can be run by the caller thread using explicit offload methods 
+    (e.g., tf::syclFlow::offload).
+    */
+    syclFlow(sycl::queue& queue);
+    
+    /**
+    @brief destroys the %syclFlow 
+     */
+    ~syclFlow() = default;
+
+    /**
+    @brief queries the emptiness of the graph
+    */
+    bool empty() const;
+
+    /**
+    @brief queries the number of tasks
+    */
+    size_t num_tasks() const;
+    
+    /**
+    @brief dumps the %syclFlow graph into a DOT format through an
+           output stream
+    */
+    void dump(std::ostream& os) const;
+
+    /**
+    @brief clear the associated graph
+    */
+    void clear();
+
+    // ------------------------------------------------------------------------
+    // Generic device operations
+    // ------------------------------------------------------------------------
+
+    /**
+    @brief creates a task that launches the given command group function object
+
+    @tparam F type of command group function object
+    @param func function object that is constructible from 
+                std::function<void(sycl::handler&)>
+    
+    Creates a task that is associated with the given command group.
+    In SYCL, each command group function object is given a unique 
+    command group handler object to perform all the necessary work 
+    required to correctly process data on a device using a kernel.
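+
+    A minimal example (illustrative; @c sf denotes a %syclFlow and
+    @c dst, @c src, and @c bytes are user-provided):
+
+    @code{.cpp}
+    tf::syclTask task = sf.on([=](sycl::handler& h){
+      h.memcpy(dst, src, bytes);  // any command-group work
+    });
+    @endcode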
+    */
+    template <typename F, std::enable_if_t<
+      std::is_invocable_r_v<void, F, sycl::handler&>, void>* = nullptr
+    >
+    syclTask on(F&& func);
+    
+    /**
+    @brief updates the task to the given command group function object
+
+    Similar to tf::syclFlow::on but operates on an existing task.
+    */
+    template <typename F, std::enable_if_t<
+      std::is_invocable_r_v<void, F, sycl::handler&>, void>* = nullptr
+    >
+    void on(syclTask task, F&& func);
+    
+    /**
+    @brief creates a memcpy task that copies untyped data in bytes
+    
+    @param tgt pointer to the target memory block
+    @param src pointer to the source memory block
+    @param bytes bytes to copy
+
+    @return a tf::syclTask handle
+
+    A memcpy task transfers @c bytes of data from a source location @c src
+    to a target location @c tgt. Both @c src and @c tgt may be either host 
+    or USM pointers.
+    */ 
+    syclTask memcpy(void* tgt, const void* src, size_t bytes);
+    
+    /**
+    @brief creates a memset task that fills untyped data with a byte value
+
+    @param ptr pointer to the destination device memory area
+    @param value value to set for each byte of specified memory
+    @param bytes number of bytes to set
+    
+    @return a tf::syclTask handle
+
+    Fills @c bytes of memory beginning at address @c ptr with @c value. 
+    @c ptr must be a USM allocation. 
+    @c value is interpreted as an unsigned char.
+    */
+    syclTask memset(void* ptr, int value, size_t bytes);
+    
+    /**
+    @brief creates a fill task that fills typed data with the given value
+
+    @tparam T trivially copyable value type
+
+    @param ptr pointer to the memory to fill
+    @param pattern pattern value to fill into the memory
+    @param count number of items to fill the value
+
+    Creates a task that fills the specified memory with the 
+    specified value.
+    */
+    template <typename T>
+    syclTask fill(void* ptr, const T& pattern, size_t count);
+    
+    /**
+    @brief creates a copy task that copies typed data from a source to a target
+           memory block
+
+    @tparam T trivially copyable value type
+    
+    @param target pointer to the target memory block
+    @param source pointer to the source memory block
+    @param count number of items to copy
+    
+    Creates a task that copies @c count items of type @c T from a source memory
+    location to a target memory location.
+    */
+    template <typename T,
+      std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
+    >
+    syclTask copy(T* target, const T* source, size_t count);
+    
+    /**
+    @brief creates a kernel task
+
+    @tparam ArgsT arguments types
+
+    @param args arguments to forward to the parallel_for methods defined 
+                in the handler object
+
+    Creates a kernel task from a parallel_for method through the handler 
+    object associated with a command group.
+    */
+    template <typename...ArgsT>
+    syclTask parallel_for(ArgsT&&... args);
+    
+    // ------------------------------------------------------------------------
+    // algorithms
+    // ------------------------------------------------------------------------
+    
+    /**
+    @brief invokes a SYCL kernel function using only one thread
+
+    @tparam F kernel function type
+    @param func kernel function
+
+    Creates a task that launches the given function object using only one
+    kernel thread. 
+    */
+    template <typename F>
+    syclTask single_task(F&& func);
+    
+    /**
+    @brief applies a callable to each dereferenced element of the data array
+
+    @tparam I iterator type
+    @tparam C callable type
+
+    @param first iterator to the beginning (inclusive)
+    @param last iterator to the end (exclusive)
+    @param callable a callable object to apply to the dereferenced iterator 
+    
+    @return a tf::syclTask handle
+    
+    This method is equivalent to the parallel execution of the following loop on a GPU:
+    
+    @code{.cpp}
+    for(auto itr = first; itr != last; itr++) {
+      callable(*itr);
+    }
+    @endcode
+    */
+    template <typename I, typename C>
+    syclTask for_each(I first, I last, C&& callable);
+    
+    /**
+    @brief applies a callable to each index in the range with the step size
+    
+    @tparam I index type
+    @tparam C callable type
+    
+    @param first beginning index
+    @param last last index
+    @param step step size
+    @param callable the callable to apply to each element in the data array
+    
+    @return a tf::syclTask handle
+    
+    This method is equivalent to the parallel execution of the following loop on a GPU:
+    
+    @code{.cpp}
+    // step is positive [first, last)
+    for(auto i=first; i<last; i+=step) {
+      callable(i);
+    }
+
+    // step is negative [first, last)
+    for(auto i=first; i>last; i+=step) {
+      callable(i);
+    }
+    @endcode
+    */
+    template <typename I, typename C>
+    syclTask for_each_index(I first, I last, I step, C&& callable);
+    
+    /**
+    @brief applies a callable to a source range and stores the result in a target range
+    
+    @tparam I iterator type
+    @tparam C callable type
+    @tparam S source types
+
+    @param first iterator to the beginning (inclusive)
+    @param last iterator to the end (exclusive)
+    @param callable the callable to apply to each element in the range
+    @param srcs iterators to the source ranges
+    
+    @return a tf::syclTask handle
+    
+    This method is equivalent to the parallel execution of the following 
+    loop on a SYCL device:
+    
+    @code{.cpp}
+    while (first != last) {
+      *first++ = callable(*src1++, *src2++, *src3++, ...);
+    }
+    @endcode
+    */
+    template <typename I, typename C, typename... S>
+    syclTask transform(I first, I last, C&& callable, S... srcs);
+    
+    /**
+    @brief performs parallel reduction over a range of items
+    
+    @tparam I input iterator type
+    @tparam T value type
+    @tparam C callable type
+
+    @param first iterator to the beginning (inclusive)
+    @param last iterator to the end (exclusive)
+    @param result pointer to the result with an initialized value
+    @param op binary reduction operator
+    
+    @return a tf::syclTask handle
+    
+    This method is equivalent to the parallel execution of the following loop 
+    on a SYCL device:
+    
+    @code{.cpp}
+    while (first != last) {
+      *result = op(*result, *first++);
+    }
+    @endcode
+    */
+    template <typename I, typename T, typename C>
+    syclTask reduce(I first, I last, T* result, C&& op);
+    
+    /**
+    @brief similar to tf::syclFlow::reduce but does not assume any initial
+           value to reduce
+    
+    This method is equivalent to the parallel execution of the following loop 
+    on a SYCL device:
+    
+    @code{.cpp}
+    *result = *first++;  // no initial value participates in the loop
+    while (first != last) {
+      *result = op(*result, *first++);
+    }
+    @endcode
+    */
+    template <typename I, typename T, typename C>
+    syclTask uninitialized_reduce(I first, I last, T* result, C&& op);
+    
+    // ------------------------------------------------------------------------
+    // offload methods
+    // ------------------------------------------------------------------------
+
+    /**
+    @brief offloads the %syclFlow onto a GPU and repeatedly runs it until 
+    the predicate becomes true
+    
+    @tparam P predicate type (a callable that takes no arguments and
+              returns a boolean)
+
+    @param predicate a predicate (returns @c true to stop the offloading)
+
+    Repeatedly executes the present %syclFlow through the given queue object
+    until the predicate returns @c true.
+
+    By default, if users do not offload the %syclFlow, 
+    the executor will offload it once.
+    */
+    template <typename P>
+    void offload_until(P&& predicate);
+    
+    /**
+    @brief offloads the %syclFlow and executes it by the given times
+
+    @param N number of executions
+    */
+    void offload_n(size_t N);
+
+    /**
+    @brief offloads the %syclFlow and executes it once
+    */
+    void offload();
+    
+    // ------------------------------------------------------------------------
+    // update methods
+    // ------------------------------------------------------------------------
+    
+
+    /**
+    @brief rebinds the task to a memcpy task
+    
+    Similar to tf::syclFlow::memcpy but operates on an existing task.
+    */
+    void memcpy(syclTask task, void* tgt, const void* src, size_t bytes);
+    
+    /**
+    @brief rebinds the task to a memset task
+    
+    Similar to tf::syclFlow::memset but operates on an existing task.
+    */
+    void memset(syclTask task, void* ptr, int value, size_t bytes);
+    
+    /**
+    @brief rebinds the task to a fill task
+
+    Similar to tf::syclFlow::fill but operates on an existing task.
+    */
+    template <typename T>
+    void fill(syclTask task, void* ptr, const T& pattern, size_t count);
+    
+    /**
+    @brief rebinds the task to a copy task
+
+    Similar to tf::syclFlow::copy but operates on an existing task.
+    */
+    template <typename T,
+      std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
+    >
+    void copy(syclTask task, T* target, const T* source, size_t count);
+    
+    /**
+    @brief rebinds the task to a parallel-for kernel task
+    
+    Similar to tf::syclFlow::parallel_for but operates on an existing task.
+    */
+    template <typename...ArgsT>
+    void parallel_for(syclTask task, ArgsT&&... args);
+
+    /**
+    @brief rebinds the task to a single-threaded kernel task
+
+    Similar to tf::syclFlow::single_task but operates on an existing task.
+    */
+    template <typename F>
+    void single_task(syclTask task, F&& func);
+    
+  private:
+
+    syclFlow(Executor&, syclGraph&, sycl::queue&);
+    
+    sycl::queue& _queue;
+    
+    handle_t _handle;
+    
+    syclGraph& _graph;
+  
+    std::vector<syclNode*> _tpg;
+    std::queue<syclNode*> _bfs;
+};
+
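+// A standalone usage sketch (illustrative; tf::syclFlow::for_each is
+// declared above and defined in the SYCL algorithm headers; `N`, `data`,
+// and the lambda are placeholders):
+//
+//   sycl::queue queue;
+//   tf::syclFlow sf(queue);
+//   size_t N = 1024;
+//   float* data = sycl::malloc_shared<float>(N, queue);
+//   auto task_fill = sf.fill(data, 1.0f, N);
+//   auto task_each = sf.for_each(data, data + N, [](float& x){ x += 1.0f; });
+//   task_fill.precede(task_each);
+//   sf.offload();                  // run the graph once on the caller thread
+//   sycl::free(data, queue);
+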
+// constructor
+inline syclFlow::syclFlow(sycl::queue& queue) :
+  _queue  {queue}, 
+  _handle {std::in_place_type_t<External>{}},
+  _graph  {std::get_if<External>(&_handle)->graph} {
+}
+
+// Construct the syclFlow from executor (internal graph)
+inline syclFlow::syclFlow(Executor& e, syclGraph& g, sycl::queue& queue) :
+  _queue  {queue},
+  _handle {std::in_place_type_t<Internal>{}, e},
+  _graph  {g} {
+}
+
+// Function: empty
+inline bool syclFlow::empty() const {
+  return _graph._nodes.empty();
+}
+
+// Function: num_tasks
+inline size_t syclFlow::num_tasks() const {
+  return _graph._nodes.size();
+}
+
+// Procedure: dump
+inline void syclFlow::dump(std::ostream& os) const {
+  _graph.dump(os, nullptr, "");
+}
+
+// Procedure: clear
+inline void syclFlow::clear() {
+  _graph.clear();
+}
+
+// Function: memcpy
+inline syclTask syclFlow::memcpy(void* tgt, const void* src, size_t bytes) {
+  return on([=](sycl::handler& h){ h.memcpy(tgt, src, bytes); });
+}
+
+// Function: memset
+inline syclTask syclFlow::memset(void* ptr, int value, size_t bytes) {
+  return on([=](sycl::handler& h){ h.memset(ptr, value, bytes); });
+}
+
+// Function: fill
+template <typename T>
+syclTask syclFlow::fill(void* ptr, const T& pattern, size_t count) {
+  return on([=](sycl::handler& h){ h.fill(ptr, pattern, count); });
+}
+
+// Function: copy
+template <typename T,
+  std::enable_if_t<!std::is_same_v<T, void>, void>*
+>
+syclTask syclFlow::copy(T* target, const T* source, size_t count) {
+  return on([=](sycl::handler& h){ h.memcpy(target, source, count*sizeof(T)); });
+}
+
+// Function: on
+template <typename F, std::enable_if_t<
+  std::is_invocable_r_v<void, F, sycl::handler&>, void>*
+>
+syclTask syclFlow::on(F&& f) {
+  auto node = _graph.emplace_back(_graph, 
+    std::in_place_type_t<syclNode::CGH>{}, std::forward<F>(f)
+  );
+  return syclTask(node);
+}
+
+// Function: single_task
+template <typename F>
+syclTask syclFlow::single_task(F&& func) {
+  return on([f=std::forward<F>(func)] (sycl::handler& h) {
+    h.single_task(f);
+  });
+}
+
+// Function: parallel_for
+template <typename...ArgsT>
+syclTask syclFlow::parallel_for(ArgsT&&... args) {
+  return on([args...] (sycl::handler& h) { h.parallel_for(args...); });
+}
+
+// Procedure: offload_until
+template <typename P>
+void syclFlow::offload_until(P&& predicate) {
+  
+  if(!(_graph._state & syclGraph::TOPOLOGY_CHANGED)) {
+    goto offload;
+  }
+
+  // levelize the graph
+  _tpg.clear();
+
+  // insert the first level of nodes into the queue
+  for(auto& u : _graph._nodes) {
+    u->_level = u->_dependents.size();
+    if(u->_level == 0) {
+      _bfs.push(u.get());
+    }
+  }
+  
+  while(!_bfs.empty()) {
+    auto u = _bfs.front();
+    _bfs.pop();
+    _tpg.push_back(u);
+    for(auto v : u->_successors) {
+      if(--(v->_level) == 0) {
+        v->_level = u->_level + 1;
+        _bfs.push(v);
+      }
+    }
+  }
+
+  offload:
+  
+  // offload the syclFlow graph
+  bool in_order = _queue.is_in_order();
+  
+  while(!predicate()) {
+
+    // traverse node in a topological order
+    for(auto u : _tpg) {
+      
+      switch(u->_handle.index()) {
+        // task type 1: command group handler 
+        case syclNode::COMMAND_GROUP_HANDLER:
+          u->_event = _queue.submit([u, in_order](sycl::handler& h){
+            // wait on all predecessors
+            if(!in_order) {
+              for(auto p : u->_dependents) {
+                h.depends_on(p->_event);
+              }
+            }
+            std::get_if<syclNode::CGH>(&u->_handle)->work(h);
+          });
+        break;
+      }
+    }
+    
+    // synchronize the execution
+    _queue.wait();
+  }
+
+  _graph._state = syclGraph::OFFLOADED;
+}
+
+// Procedure: offload_n
+inline void syclFlow::offload_n(size_t n) {
+  offload_until([repeat=n] () mutable { return repeat-- == 0; });
+}
+
+// Procedure: offload
+inline void syclFlow::offload() {
+  offload_until([repeat=1] () mutable { return repeat-- == 0; });
+}
+
+// Function: on
+template <typename F, std::enable_if_t<
+  std::is_invocable_r_v<void, F, sycl::handler&>, void>*
+>
+void syclFlow::on(syclTask task, F&& f) {
+  std::get_if<syclNode::CGH>(&task._node->_handle)->work = 
+    std::forward<F>(f);
+}
+
+// Function: memcpy
+inline void syclFlow::memcpy(
+  syclTask task, void* tgt, const void* src, size_t bytes
+) {
+  on(task, [=](sycl::handler& h){ h.memcpy(tgt, src, bytes); });
+}
+
+// Function: memset
+inline void syclFlow::memset(
+  syclTask task, void* ptr, int value, size_t bytes
+) {
+  on(task, [=](sycl::handler& h){ h.memset(ptr, value, bytes); });
+}
+
+// Function: fill
+template <typename T>
+void syclFlow::fill(
+  syclTask task, void* ptr, const T& pattern, size_t count
+) {
+  on(task, [=](sycl::handler& h){ h.fill(ptr, pattern, count); });
+}
+
+// Function: copy
+template <typename T,
+  std::enable_if_t<!std::is_same_v<T, void>, void>*
+>
+void syclFlow::copy(
+  syclTask task, T* target, const T* source, size_t count
+) {
+  on(task, [=](sycl::handler& h){ 
+    h.memcpy(target, source, count*sizeof(T));}
+  );
+}
+
+// Function: parallel_for
+template <typename...ArgsT>
+void syclFlow::parallel_for(syclTask task, ArgsT&&... args) {
+  on(task, [args...] (sycl::handler& h) { h.parallel_for(args...); });
+}
+    
+// Function: single_task
+template <typename F>
+void syclFlow::single_task(syclTask task, F&& func) {
+  on(task, [f=std::forward<F>(func)] (sycl::handler& h) { h.single_task(f); });
+}
+
+// ############################################################################
+// Forward declaration: FlowBuilder
+// ############################################################################
+    
+// FlowBuilder::emplace_on
+template <typename C, typename Q, std::enable_if_t<is_syclflow_task_v<C>, void>*>
+Task FlowBuilder::emplace_on(C&& callable, Q&& q) {
+  auto n = _graph._emplace_back(
+    std::in_place_type_t<Node::syclFlow>{},
+    [c=std::forward<C>(callable), queue=std::forward<Q>(q)] 
+    (Executor& e, Node* p) mutable {
+      e._invoke_syclflow_task_entry(p, c, queue);
+    },
+    std::make_unique<syclGraph>()
+  );
+  return Task(n);
+}
+
+// FlowBuilder::emplace
+template <typename C, std::enable_if_t<is_syclflow_task_v<C>, void>*>
+Task FlowBuilder::emplace(C&& callable) {
+  return emplace_on(std::forward<C>(callable), sycl::queue{});
+}
+
+// ############################################################################
+// Forward declaration: Executor
+// ############################################################################
+
+// Procedure: _invoke_syclflow_task_entry (syclFlow)
+template <typename C, typename Q,
+  std::enable_if_t<is_syclflow_task_v<C>, void>*
+>
+void Executor::_invoke_syclflow_task_entry(Node* node, C&& c, Q& queue) {
+
+  auto h = std::get_if<Node::syclFlow>(&node->_handle);
+
+  syclGraph* g = dynamic_cast<syclGraph*>(h->graph.get());
+  
+  g->clear();
+
+  syclFlow sf(*this, *g, queue);
+
+  c(sf); 
+
+  if(!(g->_state & syclGraph::OFFLOADED)) {
+    sf.offload();
+  }
+}
+
+}  // end of namespace tf -----------------------------------------------------
+    
+
diff --git a/myxpcs/include/taskflow_/taskflow.hpp b/myxpcs/include/taskflow_/taskflow.hpp
new file mode 100644
index 0000000..c2403f8
--- /dev/null
+++ b/myxpcs/include/taskflow_/taskflow.hpp
@@ -0,0 +1,69 @@
+#pragma once
+
+#include "core/executor.hpp"
+#include "core/async.hpp"
+#include "algorithm/critical.hpp"
+
+/**
+@dir taskflow
+@brief root taskflow include dir
+*/
+
+/**
+@dir taskflow/core
+@brief taskflow core include dir
+*/
+
+/**
+@dir taskflow/algorithm
+@brief taskflow algorithms include dir
+*/
+
+/**
+@dir taskflow/cuda
+@brief taskflow CUDA include dir
+*/
+
+/**
+@file taskflow/taskflow.hpp
+@brief main taskflow include file
+*/
+
+// TF_VERSION % 100 is the patch level
+// TF_VERSION / 100 % 1000 is the minor version
+// TF_VERSION / 100000 is the major version
+
+// current version: 3.7.0
+#define TF_VERSION 300700
+
+#define TF_MAJOR_VERSION TF_VERSION/100000
+#define TF_MINOR_VERSION TF_VERSION/100%1000
+#define TF_PATCH_VERSION TF_VERSION%100
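+
+// e.g., with TF_VERSION == 300700:
+//   TF_MAJOR_VERSION == 300700 / 100000      == 3
+//   TF_MINOR_VERSION == 300700 / 100 % 1000  == 7
+//   TF_PATCH_VERSION == 300700 % 100         == 0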
+
+/**
+@brief taskflow namespace
+*/
+namespace tf {
+
+/**
+@private
+*/
+namespace detail { }
+
+
+/**
+@brief queries the version information in a string format @c major.minor.patch
+
+Release notes are available here: https://taskflow.github.io/taskflow/Releases.html
+*/
+constexpr const char* version() {
+  return "3.7.0";
+}
+
+
+}  // end of namespace tf -----------------------------------------------------
+
+
+
+
+
diff --git a/myxpcs/include/taskflow_/utility/iterator.hpp b/myxpcs/include/taskflow_/utility/iterator.hpp
new file mode 100644
index 0000000..8636a3b
--- /dev/null
+++ b/myxpcs/include/taskflow_/utility/iterator.hpp
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <cstddef>
+#include <type_traits>
+
+namespace tf {
+
+template <typename T>
+constexpr std::enable_if_t<std::is_integral<std::decay_t<T>>::value, bool>
+is_range_invalid(T beg, T end, T step) {
+  return ((step == 0 && beg != end) ||
+          (beg < end && step <=  0) ||  // positive range
+          (beg > end && step >=  0));   // negative range
+}
+
+template <typename T>
+constexpr std::enable_if_t<std::is_integral<std::decay_t<T>>::value, size_t>
+distance(T beg, T end, T step) {
+  return (end - beg + step + (step > 0 ? -1 : 1)) / step;
+}
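+
+// Worked examples (illustrative):
+//   is_range_invalid(0, 10,  2) == false;   distance(0, 10,  2) == 5
+//   is_range_invalid(0, 10, -1) == true     // positive range, non-positive step
+//   is_range_invalid(10, 0, -3) == false;   distance(10, 0, -3) == 4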
+
+}  // end of namespace tf -----------------------------------------------------
diff --git a/myxpcs/include/taskflow_/utility/macros.hpp b/myxpcs/include/taskflow_/utility/macros.hpp
new file mode 100644
index 0000000..e7598cf
--- /dev/null
+++ b/myxpcs/include/taskflow_/utility/macros.hpp
@@ -0,0 +1,17 @@
+#pragma once
+
+#if defined(_MSC_VER)
+  #define TF_FORCE_INLINE __forceinline
+#elif defined(__GNUC__) && __GNUC__ > 3
+  #define TF_FORCE_INLINE __attribute__((__always_inline__)) inline
+#else
+  #define TF_FORCE_INLINE inline
+#endif
+
+#if defined(_MSC_VER)
+  #define TF_NO_INLINE __declspec(noinline)
+#elif defined(__GNUC__) && __GNUC__ > 3
+  #define TF_NO_INLINE __attribute__((__noinline__))
+#else
+  #define TF_NO_INLINE
+#endif
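+
+// Usage sketch (illustrative):
+//   TF_FORCE_INLINE int add(int a, int b) { return a + b; }
+//   TF_NO_INLINE    void cold_path();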
diff --git a/myxpcs/include/taskflow_/utility/math.hpp b/myxpcs/include/taskflow_/utility/math.hpp
new file mode 100644
index 0000000..f80053e
--- /dev/null
+++ b/myxpcs/include/taskflow_/utility/math.hpp
@@ -0,0 +1,151 @@
+#pragma once
+
+#include <atomic>
+
+namespace tf {
+
+// rounds the given 64-bit unsigned integer up to the next power of 2
+template <typename T, std::enable_if_t<
+  (std::is_unsigned_v<std::decay_t<T>> && sizeof(T) == 8), void
+>* = nullptr>
+constexpr T next_pow2(T x) {
+  if(x == 0) return 1;
+  x--;
+  x |= x>>1;
+  x |= x>>2;
+  x |= x>>4;
+  x |= x>>8;
+  x |= x>>16;
+  x |= x>>32;
+  x++;
+  return x;
+}
+
+// rounds the given 32-bit unsigned integer up to the next power of 2
+template <typename T, std::enable_if_t<
+  (std::is_unsigned_v<std::decay_t<T>> && sizeof(T) == 4), void
+>* = nullptr>
+constexpr T next_pow2(T x) {
+  if(x == 0) return 1;
+  x--;
+  x |= x>>1;
+  x |= x>>2;
+  x |= x>>4;
+  x |= x>>8;
+  x |= x>>16;
+  x++;
+  return x;
+}
+
+// checks if the given number is a power of 2
+template <typename T, std::enable_if_t<
+  std::is_integral_v<std::decay_t<T>>, void>* = nullptr
+>
+constexpr bool is_pow2(const T& x) {
+  return x && (!(x&(x-1)));
+}
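+
+// e.g., next_pow2(0u) == 1, next_pow2(5u) == 8, next_pow2(8u) == 8,
+//       is_pow2(8) == true,  is_pow2(12) == false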
+
+//// finds the ceil of x divided by b
+//template <typename T, std::enable_if_t<
+//  std::is_integral_v<std::decay_t<T>>, void>* = nullptr
+//>
+//constexpr T ceil(const T& x, const T& y) {
+//  //return (x + y - 1) / y;
+//  return (x-1) / y + 1;
+//}
+
+/**
+@brief returns floor(log2(n)), assumes n > 0
+*/
+template<typename T>
+constexpr int log2(T n) {
+  int log = 0;
+  while (n >>= 1) {
+    ++log;
+  }
+  return log;
+}
+
+/**
+@brief finds the median of the three values referenced by the given
+       iterators using the given comparator
+*/
+template <typename RandItr, typename C>
+RandItr median_of_three(RandItr l, RandItr m, RandItr r, C cmp) {
+  return cmp(*l, *m) ? (cmp(*m, *r) ? m : (cmp(*l, *r) ? r : l ))
+                     : (cmp(*r, *m) ? m : (cmp(*r, *l) ? r : l ));
+}
+
+/**
+@brief finds the pseudo-median of a range of items by sampling nine
+       evenly spread elements
+ */
+template <typename RandItr, typename C>
+RandItr pseudo_median_of_nine(RandItr beg, RandItr end, C cmp) {
+  size_t N = std::distance(beg, end);
+  size_t offset = N >> 3;
+  return median_of_three(
+    median_of_three(beg, beg+offset, beg+(offset*2), cmp),
+    median_of_three(beg+(offset*3), beg+(offset*4), beg+(offset*5), cmp),
+    median_of_three(beg+(offset*6), beg+(offset*7), end-1, cmp),
+    cmp
+  );
+}
+
+/**
+@brief sorts two elements of dereferenced iterators using the given
+       comparison function
+*/
+template<typename Iter, typename Compare>
+void sort2(Iter a, Iter b, Compare comp) {
+  if (comp(*b, *a)) std::iter_swap(a, b);
+}
+
+/**
+@brief sorts three elements of dereferenced iterators using the given
+       comparison function
+*/
+template<typename Iter, typename Compare>
+void sort3(Iter a, Iter b, Iter c, Compare comp) {
+  sort2(a, b, comp);
+  sort2(b, c, comp);
+  sort2(a, b, comp);
+}
+
+/**
+@brief generates a program-wide unique id of the given type (thread-safe)
+*/
+template <typename T, std::enable_if_t<std::is_integral_v<T>, void>* = nullptr>
+T unique_id() {
+  static std::atomic<T> counter{0};
+  return counter.fetch_add(1, std::memory_order_relaxed);
+}
+
+/**
+@brief updates an atomic variable with a maximum value
+*/
+template <typename T>
+inline void atomic_max(std::atomic<T>& v, const T& max_v) noexcept {
+  T prev = v.load(std::memory_order_relaxed);
+  while(prev < max_v && 
+        !v.compare_exchange_weak(prev, max_v, std::memory_order_relaxed,
+                                              std::memory_order_relaxed)) {
+  }
+}
+
+/**
+@brief updates an atomic variable with a minimum value
+*/
+template <typename T>
+inline void atomic_min(std::atomic<T>& v, const T& min_v) noexcept {
+  T prev = v.load(std::memory_order_relaxed);
+  while(prev > min_v && 
+        !v.compare_exchange_weak(prev, min_v, std::memory_order_relaxed,
+                                              std::memory_order_relaxed)) {
+  }
+}
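+
+// Usage sketch (illustrative):
+//   std::atomic<int> hi{0}, lo{100};
+//   tf::atomic_max(hi, 42);   // hi becomes 42
+//   tf::atomic_min(lo, 7);    // lo becomes 7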
+
+}  // end of namespace tf -----------------------------------------------------
+
+
+
diff --git a/myxpcs/include/taskflow_/utility/object_pool.hpp b/myxpcs/include/taskflow_/utility/object_pool.hpp
new file mode 100644
index 0000000..34d60fb
--- /dev/null
+++ b/myxpcs/include/taskflow_/utility/object_pool.hpp
@@ -0,0 +1,778 @@
+// 2020/03/13 - modified by Tsung-Wei Huang
+//  - fixed bug in aligning memory
+//
+// 2020/02/02 - modified by Tsung-Wei Huang
+//  - new implementation motivated by Hoard
+//
+// 2019/07/10 - modified by Tsung-Wei Huang
+//  - replace raw pointer with smart pointer
+//
+// 2019/06/13 - created by Tsung-Wei Huang
+//  - implemented an object pool class
+
+#pragma once
+
+#include <thread>
+#include <atomic>
+#include <mutex>
+#include <vector>
+#include <cassert>
+#include <cstddef>
+
+namespace tf {
+
+#define TF_ENABLE_POOLABLE_ON_THIS                          \
+  template <typename T, size_t S> friend class ObjectPool;  \
+  void* _object_pool_block
+
+// Class: ObjectPool
+//
+// The class implements an efficient thread-safe object pool motivated
+// by the Hoard memory allocator algorithm.
+// Unlike a general-purpose memory allocator, the object pool allocates
+// only one object at a time.
+//
+// Internally, we use the following variables to maintain blocks and heaps:
+// X: size in bytes of an item slot
+// M: number of items per block
+// F: emptiness threshold
+// B: number of bins per local heap (bin[B-1] is the full list)
+// W: number of items per bin
+// K: shrink constant
+//
+// Example scenario 1:
+// M = 30
+// F = 4
+// W = (30+4-1)/4 = 8
+//
+// b0: 0, 1, 2, 3, 4, 5, 6, 7
+// b1: 8, 9, 10, 11, 12, 13, 14, 15
+// b2: 16, 17, 18, 19, 20, 21, 22, 23
+// b3: 24, 25, 26, 27, 28, 29
+// b4: 30 (anything equal to M)
+//
+// Example scenario 2:
+// M = 32
+// F = 4
+// W = (32+4-1)/4 = 8
+// b0: 0, 1, 2, 3, 4, 5, 6, 7
+// b1: 8, 9, 10, 11, 12, 13, 14, 15
+// b2: 16, 17, 18, 19, 20, 21, 22, 23
+// b3: 24, 25, 26, 27, 28, 29, 30, 31
+// b4: 32 (anything equal to M)
+//
+template <typename T, size_t S = 65536>
+class ObjectPool {
+
+  // each data slot must be large enough to hold a free-list pointer
+  constexpr static size_t X = (std::max)(sizeof(T*), sizeof(T));
+  //constexpr static size_t X = sizeof(long double) + std::max(sizeof(T*), sizeof(T));
+  //constexpr static size_t M = (S - offsetof(Block, data)) / X;
+  constexpr static size_t M = S / X;
+  constexpr static size_t F = 4;
+  constexpr static size_t B = F + 1;
+  constexpr static size_t W = (M + F - 1) / F;
+  constexpr static size_t K = 4;
+
+  static_assert(
+    S && (!(S & (S-1))), "block size S must be a power of two"
+  );
+
+  static_assert(
+    M >= 128, "block size S must be large enough to pool at least 128 objects"
+  );
+
+  struct Blocklist {
+    Blocklist* prev;
+    Blocklist* next;
+  };
+
+  struct GlobalHeap {
+    std::mutex mutex;
+    Blocklist list;
+  };
+
+  struct LocalHeap {
+    std::mutex mutex;
+    Blocklist lists[B];
+    size_t u {0};
+    size_t a {0};
+  };
+
+  struct Block {
+    std::atomic<LocalHeap*> heap;
+    Blocklist list_node;
+    size_t i;
+    size_t u;
+    T* top;
+    // long double padding;
+    char data[S];
+  };
+
+  public:
+
+    /**
+    @brief constructs an object pool from a number of anticipated threads
+    */
+    explicit ObjectPool(unsigned = std::thread::hardware_concurrency());
+
+    /**
+    @brief destructs the object pool
+    */
+    ~ObjectPool();
+
+    /**
+    @brief acquires a pointer to an object constructed from the given argument list
+    */
+    template <typename... ArgsT>
+    T* animate(ArgsT&&... args);
+
+    /**
+    @brief recycles the object pointed to by @c ptr and destroys it
+    */
+    void recycle(T* ptr);
+
+    size_t num_bins_per_local_heap() const;
+    size_t num_objects_per_bin() const;
+    size_t num_objects_per_block() const;
+    size_t num_available_objects() const;
+    size_t num_allocated_objects() const;
+    size_t capacity() const;
+    size_t num_local_heaps() const;
+    size_t num_global_heaps() const;
+    size_t num_heaps() const;
+
+    float emptiness_threshold() const;
+
+  private:
+
+    const size_t _lheap_mask;
+
+    GlobalHeap _gheap;
+
+    std::vector<LocalHeap> _lheaps;
+
+    LocalHeap& _this_heap();
+
+    constexpr unsigned _next_pow2(unsigned n) const;
+
+    template <class P, class Q>
+    constexpr size_t _offset_in_class(const Q P::*member) const;
+
+    template <class P, class Q>
+    constexpr P* _parent_class_of(Q*, const Q P::*member);
+
+    template <class P, class Q>
+    constexpr P* _parent_class_of(const Q*, const Q P::*member) const;
+
+    constexpr Block* _block_of(Blocklist*);
+    constexpr Block* _block_of(const Blocklist*) const;
+
+    size_t _bin(size_t) const;
+
+    T* _allocate(Block*);
+
+    void _deallocate(Block*, T*);
+    void _blocklist_init_head(Blocklist*);
+    void _blocklist_add_impl(Blocklist*, Blocklist*, Blocklist*);
+    void _blocklist_push_front(Blocklist*, Blocklist*);
+    void _blocklist_push_back(Blocklist*, Blocklist*);
+    void _blocklist_del_impl(Blocklist*, Blocklist*);
+    void _blocklist_del(Blocklist*);
+    void _blocklist_replace(Blocklist*, Blocklist*);
+    void _blocklist_move_front(Blocklist*, Blocklist*);
+    void _blocklist_move_back(Blocklist*, Blocklist*);
+    bool _blocklist_is_first(const Blocklist*, const Blocklist*);
+    bool _blocklist_is_last(const Blocklist*, const Blocklist*);
+    bool _blocklist_is_empty(const Blocklist*);
+    bool _blocklist_is_singular(const Blocklist*);
+
+    template <typename C>
+    void _for_each_block_safe(Blocklist*, C&&);
+
+    template <typename C>
+    void _for_each_block(Blocklist*, C&&);
+
+};
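+
+// A usage sketch (illustrative; `Foo` is an assumed poolable type that
+// declares TF_ENABLE_POOLABLE_ON_THIS):
+//
+//   struct Foo {
+//     TF_ENABLE_POOLABLE_ON_THIS;
+//     int value;
+//     Foo(int v) : value{v} {}
+//   };
+//
+//   tf::ObjectPool<Foo> pool;
+//   Foo* f = pool.animate(42);   // placement-constructs Foo(42) in a block
+//   pool.recycle(f);             // destroys *f and returns its slot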
+
+// ----------------------------------------------------------------------------
+// ObjectPool definition
+// ----------------------------------------------------------------------------
+
+// Constructor
+template <typename T, size_t S>
+ObjectPool<T, S>::ObjectPool(unsigned t) :
+  //_heap_mask   {(_next_pow2(t) << 1) - 1u},
+  //_heap_mask   { _next_pow2(t<<1) - 1u },
+  //_heap_mask   {(t << 1) - 1},
+  _lheap_mask { _next_pow2((t+1) << 1) - 1 },
+  _lheaps     { _lheap_mask + 1 } {
+
+  _blocklist_init_head(&_gheap.list);
+
+  for(auto& h : _lheaps) {
+    for(size_t i=0; i<B; ++i) {
+      _blocklist_init_head(&h.lists[i]);
+    }
+  }
+}
+
+// Destructor
+template <typename T, size_t S>
+ObjectPool<T, S>::~ObjectPool() {
+
+  // clear local heaps
+  for(auto& h : _lheaps) {
+    for(size_t i=0; i<B; ++i) {
+      _for_each_block_safe(&h.lists[i], [] (Block* b) {
+        //std::free(b);
+        delete b;
+      });
+    }
+  }
+
+  // clear global heap
+  _for_each_block_safe(&_gheap.list, [] (Block* b) {
+    //std::free(b);
+    delete b;
+  });
+}
+
+// Function: num_bins_per_local_heap
+template <typename T, size_t S>
+size_t ObjectPool<T, S>::num_bins_per_local_heap() const {
+  return B;
+}
+
+// Function: num_objects_per_bin
+template <typename T, size_t S>
+size_t ObjectPool<T, S>::num_objects_per_bin() const {
+  return W;
+}
+
+// Function: num_objects_per_block
+template <typename T, size_t S>
+size_t ObjectPool<T, S>::num_objects_per_block() const {
+  return M;
+}
+
+// Function: emptiness_threshold
+template <typename T, size_t S>
+float ObjectPool<T, S>::emptiness_threshold() const {
+  return 1.0f/F;
+}
+
+// Function: num_global_heaps
+template <typename T, size_t S>
+size_t ObjectPool<T, S>::num_global_heaps() const {
+  return 1;
+}
+
+// Function: num_local_heaps
+template <typename T, size_t S>
+size_t ObjectPool<T, S>::num_local_heaps() const {
+  return _lheaps.size();
+}
+
+// Function: num_heaps
+template <typename T, size_t S>
+size_t ObjectPool<T, S>::num_heaps() const {
+  return _lheaps.size() + 1;
+}
+
+// Function: capacity
+template <typename T, size_t S>
+size_t ObjectPool<T, S>::capacity() const {
+
+  size_t n = 0;
+
+  // global heap
+  for(auto p=_gheap.list.next; p!=&_gheap.list; p=p->next) {
+    n += M;
+  };
+
+  // local heap
+  for(auto& h : _lheaps) {
+    n += h.a;
+  }
+
+  return n;
+}
+
+// Function: num_available_objects
+template <typename T, size_t S>
+size_t ObjectPool<T, S>::num_available_objects() const {
+
+  size_t n = 0;
+
+  // global heap
+  for(auto p=_gheap.list.next; p!=&_gheap.list; p=p->next) {
+    n += (M - _block_of(p)->u);
+  };
+
+  // local heap
+  for(auto& h : _lheaps) {
+    n += (h.a - h.u);
+  }
+  return n;
+}
+
+// Function: num_allocated_objects
+template <typename T, size_t S>
+size_t ObjectPool<T, S>::num_allocated_objects() const {
+
+  size_t n = 0;
+
+  // global heap
+  for(auto p=_gheap.list.next; p!=&_gheap.list; p=p->next) {
+    n += _block_of(p)->u;
+  };
+
+  // local heap
+  for(auto& h : _lheaps) {
+    n += h.u;
+  }
+  return n;
+}
+
+// Function: _bin
+template <typename T, size_t S>
+size_t ObjectPool<T, S>::_bin(size_t u) const {
+  return u == M ? F : u/W;
+}
+
+// Function: _offset_in_class
+template <typename T, size_t S>
+template <class P, class Q>
+constexpr size_t ObjectPool<T, S>::_offset_in_class(
+  const Q P::*member) const {
+  return (size_t) &( reinterpret_cast<P*>(0)->*member);
+}
+
+// C macro: parent_class_of(list_pointer, Block, list)
+// C++: parent_class_of(list_pointer,  &Block::list)
+template <typename T, size_t S>
+template <class P, class Q>
+constexpr P* ObjectPool<T, S>::_parent_class_of(
+  Q* ptr, const Q P::*member
+) {
+  return (P*)( (char*)ptr - _offset_in_class(member));
+}
+
+// Function: _parent_class_of
+template <typename T, size_t S>
+template <class P, class Q>
+constexpr P* ObjectPool<T, S>::_parent_class_of(
+  const Q* ptr, const Q P::*member
+) const {
+  return (P*)( (char*)ptr - _offset_in_class(member));
+}
+
+// Function: _block_of
+template <typename T, size_t S>
+constexpr typename ObjectPool<T, S>::Block*
+ObjectPool<T, S>::_block_of(Blocklist* list) {
+  return _parent_class_of(list, &Block::list_node);
+}
+
+// Function: _block_of
+template <typename T, size_t S>
+constexpr typename ObjectPool<T, S>::Block*
+ObjectPool<T, S>::_block_of(const Blocklist* list) const {
+  return _parent_class_of(list, &Block::list_node);
+}
+
+// Procedure: initialize a list head
+template <typename T, size_t S>
+void ObjectPool<T, S>::_blocklist_init_head(Blocklist *list) {
+  list->next = list;
+  list->prev = list;
+}
+
+// Procedure: _blocklist_add_impl
+// Insert a new entry between two known consecutive entries.
+//
+// This is only for internal list manipulation where we know
+// the prev/next entries already!
+template <typename T, size_t S>
+void ObjectPool<T, S>::_blocklist_add_impl(
+  Blocklist *curr, Blocklist *prev, Blocklist *next
+) {
+  next->prev = curr;
+  curr->next = next;
+  curr->prev = prev;
+  prev->next = curr;
+}
+
+// list_push_front - add a new entry
+// @curr: curr entry to be added
+// @head: list head to add it after
+//
+// Insert a new entry after the specified head.
+// This is good for implementing stacks.
+//
+template <typename T, size_t S>
+void ObjectPool<T, S>::_blocklist_push_front(
+  Blocklist *curr, Blocklist *head
+) {
+  _blocklist_add_impl(curr, head, head->next);
+}
+
+// list_add_tail - add a new entry
+// @curr: curr entry to be added
+// @head: list head to add it before
+//
+// Insert a new entry before the specified head.
+// This is useful for implementing queues.
+//
+template <typename T, size_t S>
+void ObjectPool<T, S>::_blocklist_push_back(
+  Blocklist *curr, Blocklist *head
+) {
+  _blocklist_add_impl(curr, head->prev, head);
+}
+
+// Delete a list entry by making the prev/next entries
+// point to each other.
+//
+// This is only for internal list manipulation where we know
+// the prev/next entries already!
+//
+template <typename T, size_t S>
+void ObjectPool<T, S>::_blocklist_del_impl(
+  Blocklist * prev, Blocklist * next
+) {
+  next->prev = prev;
+  prev->next = next;
+}
+
+// _blocklist_del - deletes entry from list.
+// @entry: the element to delete from the list.
+// Note: list_empty() on entry does not return true after this, the entry is
+// in an undefined state.
+template <typename T, size_t S>
+void ObjectPool<T, S>::_blocklist_del(Blocklist *entry) {
+  _blocklist_del_impl(entry->prev, entry->next);
+  entry->next = nullptr;
+  entry->prev = nullptr;
+}
+
+// list_replace - replace old entry by new one
+// @old : the element to be replaced
+// @curr : the new element to insert
+//
+// If @old was empty, it will be overwritten.
+template <typename T, size_t S>
+void ObjectPool<T, S>::_blocklist_replace(
+  Blocklist *old, Blocklist *curr
+) {
+  curr->next = old->next;
+  curr->next->prev = curr;
+  curr->prev = old->prev;
+  curr->prev->next = curr;
+}
+
+// list_move - delete from one list and add as another's head
+// @list: the entry to move
+// @head: the head that will precede our entry
+template <typename T, size_t S>
+void ObjectPool<T, S>::_blocklist_move_front(
+  Blocklist *list, Blocklist *head
+) {
+  _blocklist_del_impl(list->prev, list->next);
+  _blocklist_push_front(list, head);
+}
+
+// list_move_tail - delete from one list and add as another's tail
+// @list: the entry to move
+// @head: the head that will follow our entry
+template <typename T, size_t S>
+void ObjectPool<T, S>::_blocklist_move_back(
+  Blocklist *list, Blocklist *head
+) {
+  _blocklist_del_impl(list->prev, list->next);
+  _blocklist_push_back(list, head);
+}
+
+// list_is_first - tests whether @list is the first entry in list @head
+// @list: the entry to test
+// @head: the head of the list
+template <typename T, size_t S>
+bool ObjectPool<T, S>::_blocklist_is_first(
+  const Blocklist *list, const Blocklist *head
+) {
+  return list->prev == head;
+}
+
+// list_is_last - tests whether @list is the last entry in list @head
+// @list: the entry to test
+// @head: the head of the list
+template <typename T, size_t S>
+bool ObjectPool<T, S>::_blocklist_is_last(
+  const Blocklist *list, const Blocklist *head
+) {
+  return list->next == head;
+}
+
+// list_empty - tests whether a list is empty
+// @head: the list to test.
+template <typename T, size_t S>
+bool ObjectPool<T, S>::_blocklist_is_empty(const Blocklist *head) {
+  return head->next == head;
+}
+
+// list_is_singular - tests whether a list has just one entry.
+// @head: the list to test.
+template <typename T, size_t S>
+bool ObjectPool<T, S>::_blocklist_is_singular(
+  const Blocklist *head
+) {
+  return !_blocklist_is_empty(head) && (head->next == head->prev);
+}
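+
+// Conceptually, the Blocklist helpers implement a circular doubly-linked
+// list with a sentinel head; an empty list points to itself (illustrative):
+//
+//   _blocklist_init_head(&head);        // head <-> head
+//   _blocklist_push_front(&a, &head);   // head <-> a <-> head
+//   _blocklist_push_back(&b, &head);    // head <-> a <-> b <-> head
+//   _blocklist_is_first(&a, &head);     // true
+//   _blocklist_is_last(&b, &head);      // true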
+
+// Procedure: _for_each_block
+template <typename T, size_t S>
+template <typename C>
+void ObjectPool<T, S>::_for_each_block(Blocklist* head, C&& c) {
+  Blocklist* p;
+  for(p=head->next; p!=head; p=p->next) {
+    c(_block_of(p));
+  }
+}
+
+// Procedure: _for_each_block_safe
+// Iterate each item of a list - safe to free
+template <typename T, size_t S>
+template <typename C>
+void ObjectPool<T, S>::_for_each_block_safe(Blocklist* head, C&& c) {
+  Blocklist* p;
+  Blocklist* t;
+  for(p=head->next, t=p->next; p!=head; p=t, t=p->next) {
+    c(_block_of(p));
+  }
+}
+
+// Function: _allocate
+// allocate a spot from the block
+template <typename T, size_t S>
+T* ObjectPool<T, S>::_allocate(Block* s) {
+  if(s->top == nullptr) {
+    return reinterpret_cast<T*>(s->data + s->i++ * X);
+  }
+  else {
+    T* retval = s->top;
+    s->top = *(reinterpret_cast<T**>(s->top));
+    return retval;
+  }
+}
+
+// Procedure: _deallocate
+template <typename T, size_t S>
+void ObjectPool<T, S>::_deallocate(Block* s, T* ptr) {
+  *(reinterpret_cast<T**>(ptr)) = s->top;
+  s->top = ptr;
+}
+
+// Function: animate
+template <typename T, size_t S>
+template <typename... ArgsT>
+T* ObjectPool<T, S>::animate(ArgsT&&... args) {
+
+  //std::cout << "construct a new item\n";
+
+  // my logically mapped heap
+  LocalHeap& h = _this_heap();
+
+  Block* s {nullptr};
+
+  h.mutex.lock();
+
+  // scan the list of superblocks from the most full to the least full
+  int f = static_cast<int>(F-1);
+  for(; f>=0; f--) {
+    if(!_blocklist_is_empty(&h.lists[f])) {
+      s = _block_of(h.lists[f].next);
+      break;
+    }
+  }
+
+  // no superblock found
+  if(f == -1) {
+
+    // check heap 0 for a superblock
+    _gheap.mutex.lock();
+    if(!_blocklist_is_empty(&_gheap.list)) {
+
+      s = _block_of(_gheap.list.next);
+
+      //printf("get a superblock from global heap %lu\n", s->u);
+      assert(s->u < M && s->heap == nullptr);
+      f = static_cast<int>(_bin(s->u + 1));
+
+      _blocklist_move_front(&s->list_node, &h.lists[f]);
+
+      s->heap = &h;  // must be within the global heap lock
+      _gheap.mutex.unlock();
+
+      h.u = h.u + s->u;
+      h.a = h.a + M;
+    }
+    // create a new block
+    else {
+      //printf("create a new superblock\n");
+      _gheap.mutex.unlock();
+      f = 0;
+      //s = static_cast<Block*>(std::malloc(sizeof(Block)));
+      s = new Block();
+
+      if(s == nullptr) {
+        throw std::bad_alloc();
+      }
+
+      s->heap = &h;
+      s->i = 0;
+      s->u = 0;
+      s->top = nullptr;
+
+      _blocklist_push_front(&s->list_node, &h.lists[f]);
+
+      h.a = h.a + M;
+    }
+  }
+
+  // the superblock must have at least one space
+  //assert(s->u < M);
+  //printf("%lu %lu %lu\n", h.u, h.a, s->u);
+  //assert(h.u < h.a);
+
+  h.u = h.u + 1;
+  s->u = s->u + 1;
+
+  // take one item from the superblock
+  T* mem = _allocate(s);
+
+  int b = static_cast<int>(_bin(s->u));
+
+  if(b != f) {
+    //printf("move superblock from list[%d] to list[%d]\n", f, b);
+    _blocklist_move_front(&s->list_node, &h.lists[b]);
+  }
+
+  //std::cout << "s.i " << s->i << '\n'
+  //          << "s.u " << s->u << '\n'
+  //          << "h.u " << h.u  << '\n'
+  //          << "h.a " << h.a  << '\n';
+
+  h.mutex.unlock();
+
+  //printf("allocate %p (s=%p)\n", mem, s);
+
+  new (mem) T(std::forward<ArgsT>(args)...);
+
+  mem->_object_pool_block = s;
+
+  return mem;
+}
+
+// Function: recycle
+template <typename T, size_t S>
+void ObjectPool<T, S>::recycle(T* mem) {
+
+  //Block* s = *reinterpret_cast<Block**>(
+  //  reinterpret_cast<char*>(mem) - sizeof(Block**)
+  //);
+
+  //Block* s= *(reinterpret_cast<Block**>(mem) - O); //  (mem) - 1
+
+  Block* s = static_cast<Block*>(mem->_object_pool_block);
+
+  mem->~T();
+
+  //printf("deallocate %p (s=%p) M=%lu W=%lu X=%lu\n", mem, s, M, W, X);
+
+  // here we need a loop because when we lock the heap,
+  // other threads may have removed the superblock to another heap
+  bool sync = false;
+
+  do {
+    LocalHeap* h = s->heap.load(std::memory_order_relaxed);
+
+    // the block is in global heap
+    if(h == nullptr) {
+      std::lock_guard<std::mutex> glock(_gheap.mutex);
+      if(s->heap == h) {
+        sync = true;
+        _deallocate(s, mem);
+        s->u = s->u - 1;
+      }
+    }
+    else {
+      std::lock_guard<std::mutex> llock(h->mutex);
+      if(s->heap == h) {
+        sync = true;
+        // deallocate the item from the superblock
+        size_t f = _bin(s->u);
+        _deallocate(s, mem);
+        s->u = s->u - 1;
+        h->u = h->u - 1;
+
+        size_t b = _bin(s->u);
+
+        if(b != f) {
+          //printf("move superblock from list[%d] to list[%d]\n", f, b);
+          _blocklist_move_front(&s->list_node, &h->lists[b]);
+        }
+
+        // transfer a mostly-empty superblock to global heap
+        if((h->u + K*M < h->a) && (h->u < ((F-1) * h->a / F))) {
+          for(size_t i=0; i<F; i++) {
+            if(!_blocklist_is_empty(&h->lists[i])) {
+              Block* x = _block_of(h->lists[i].next);
+              //printf("transfer a block (x.u=%lu/x.i=%lu) to the global heap\n", x->u, x->i);
+              assert(h->u > x->u && h->a > M);
+              h->u = h->u - x->u;
+              h->a = h->a - M;
+              x->heap = nullptr;
+              std::lock_guard<std::mutex> glock(_gheap.mutex);
+              _blocklist_move_front(&x->list_node, &_gheap.list);
+              break;
+            }
+          }
+        }
+      }
+    }
+  } while(!sync);
+
+  //std::cout << "s.i " << s->i << '\n'
+  //          << "s.u " << s->u << '\n';
+}
+
+// Function: _this_heap
+template <typename T, size_t S>
+typename ObjectPool<T, S>::LocalHeap&
+ObjectPool<T, S>::_this_heap() {
+  // here we don't use thread local since object pool might be
+  // created and destroyed multiple times
+  //thread_local auto hv = std::hash<std::thread::id>()(std::this_thread::get_id());
+  //return _lheaps[hv & _lheap_mask];
+
+  return _lheaps[
+    std::hash<std::thread::id>()(std::this_thread::get_id()) & _lheap_mask
+  ];
+}
+
+// Function: _next_pow2
+template <typename T, size_t S>
+constexpr unsigned ObjectPool<T, S>::_next_pow2(unsigned n) const {
+  if(n == 0) return 1;
+  n--;
+  n |= n >> 1;
+  n |= n >> 2;
+  n |= n >> 4;
+  n |= n >> 8;
+  n |= n >> 16;
+  n++;
+  return n;
+}
+
+}  // end namespace tf --------------------------------------------------------
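For illustration only (not added by this patch): the _next_pow2 helper above rounds an unsigned value up to the nearest power of two by smearing the highest set bit into every lower position and then adding one. A minimal standalone sketch of the same technique:

    // Round n up to the next power of two (returns 1 for 0), mirroring
    // ObjectPool::_next_pow2 above.
    constexpr unsigned next_pow2(unsigned n) {
      if(n == 0) return 1;
      n--;           // so exact powers of two map to themselves
      n |= n >> 1;   // smear the highest set bit rightwards ...
      n |= n >> 2;
      n |= n >> 4;
      n |= n >> 8;
      n |= n >> 16;  // ... until every bit below it is set
      return n + 1;  // an all-ones pattern plus one is a power of two
    }

    static_assert(next_pow2(0)  == 1);
    static_assert(next_pow2(5)  == 8);
    static_assert(next_pow2(64) == 64);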
diff --git a/myxpcs/include/taskflow_/utility/os.hpp b/myxpcs/include/taskflow_/utility/os.hpp
new file mode 100644
index 0000000..23ac301
--- /dev/null
+++ b/myxpcs/include/taskflow_/utility/os.hpp
@@ -0,0 +1,196 @@
+#pragma once
+
+#include <cstdlib>
+#include <cstdio>
+#include <string>
+
+#define TF_OS_LINUX 0
+#define TF_OS_DRAGONFLY 0
+#define TF_OS_FREEBSD 0
+#define TF_OS_NETBSD 0
+#define TF_OS_OPENBSD 0
+#define TF_OS_DARWIN 0
+#define TF_OS_WINDOWS 0
+#define TF_OS_CNK 0
+#define TF_OS_HURD 0
+#define TF_OS_SOLARIS 0
+#define TF_OS_UNIX 0
+
+#ifdef _WIN32
+#undef TF_OS_WINDOWS
+#define TF_OS_WINDOWS 1
+#endif
+
+#ifdef __CYGWIN__
+#undef TF_OS_WINDOWS
+#define TF_OS_WINDOWS 1
+#endif
+
+#if (defined __APPLE__ && defined __MACH__)
+#undef TF_OS_DARWIN
+#define TF_OS_DARWIN 1
+#endif
+
+// in some ppc64 linux installations, only the second condition is met
+#if (defined __linux)
+#undef TF_OS_LINUX
+#define TF_OS_LINUX 1
+#elif (defined __linux__)
+#undef TF_OS_LINUX
+#define TF_OS_LINUX 1
+#else
+#endif
+
+#if (defined __DragonFly__)
+#undef TF_OS_DRAGONFLY
+#define TF_OS_DRAGONFLY 1
+#endif
+
+#if (defined __FreeBSD__)
+#undef TF_OS_FREEBSD
+#define TF_OS_FREEBSD 1
+#endif
+
+#if (defined __NetBSD__)
+#undef TF_OS_NETBSD
+#define TF_OS_NETBSD 1
+#endif
+
+#if (defined __OpenBSD__)
+#undef TF_OS_OPENBSD
+#define TF_OS_OPENBSD 1
+#endif
+
+#if (defined __bgq__)
+#undef TF_OS_CNK
+#define TF_OS_CNK 1
+#endif
+
+#if (defined __GNU__)
+#undef TF_OS_HURD
+#define TF_OS_HURD 1
+#endif
+
+#if (defined __sun)
+#undef TF_OS_SOLARIS
+#define TF_OS_SOLARIS 1
+#endif
+
+#if (1 !=                                                                  \
+     TF_OS_LINUX + TF_OS_DRAGONFLY + TF_OS_FREEBSD + TF_OS_NETBSD +        \
+     TF_OS_OPENBSD + TF_OS_DARWIN + TF_OS_WINDOWS + TF_OS_HURD +           \
+     TF_OS_SOLARIS)
+#define TF_OS_UNKNOWN 1
+#endif
+
+#if TF_OS_LINUX || TF_OS_DRAGONFLY || TF_OS_FREEBSD || TF_OS_NETBSD ||     \
+    TF_OS_OPENBSD || TF_OS_DARWIN || TF_OS_HURD || TF_OS_SOLARIS
+#undef TF_OS_UNIX
+#define TF_OS_UNIX 1
+#endif
+
+
+//-----------------------------------------------------------------------------
+// Cache line alignment
+//-----------------------------------------------------------------------------
+#if defined(__i386__) || defined(__x86_64__)
+  #define TF_CACHELINE_SIZE 64
+#elif defined(__powerpc64__)
+  // TODO
+  // This is the L1 D-cache line size of our Power7 machines.
+  // Need to check if this is appropriate for other PowerPC64 systems.
+  #define TF_CACHELINE_SIZE 128
+#elif defined(__arm__)
+  // Cache line sizes for ARM: These values are not strictly correct since
+  // cache line sizes depend on implementations, not architectures.
+  // There are even implementations with cache line sizes configurable
+  // at boot time.
+  #if defined(__ARM_ARCH_5T__)
+    #define TF_CACHELINE_SIZE 32
+  #elif defined(__ARM_ARCH_7A__)
+    #define TF_CACHELINE_SIZE 64
+  #endif
+#endif
+
+#ifndef TF_CACHELINE_SIZE
+// A reasonable default guess.  Note that overestimates tend to waste more
+// space, while underestimates tend to waste more time.
+  #define TF_CACHELINE_SIZE 64
+#endif
+
+
+
+//-----------------------------------------------------------------------------
+// pause
+//-----------------------------------------------------------------------------
+//#if __has_include (<immintrin.h>)
+//  #define TF_HAS_MM_PAUSE 1
+//  #include <immintrin.h>
+//#endif
+
+namespace tf {
+
+// Struct: CachelineAligned
+// Due to hardware prefetching, we typically align to twice the cacheline size.
+template <typename T>
+struct CachelineAligned {
+  alignas (2*TF_CACHELINE_SIZE) T data;
+};
+
+// Function: get_env
+inline std::string get_env(const std::string& str) {
+#ifdef _MSC_VER
+  char *ptr = nullptr;
+  size_t len = 0;
+
+  if(_dupenv_s(&ptr, &len, str.c_str()) == 0 && ptr != nullptr) {
+    std::string res(ptr, len);
+    std::free(ptr);
+    return res;
+  }
+  return "";
+
+#else
+  auto ptr = std::getenv(str.c_str());
+  return ptr ? ptr : "";
+#endif
+}
+
+// Function: has_env
+inline bool has_env(const std::string& str) {
+#ifdef _MSC_VER
+  char *ptr = nullptr;
+  size_t len = 0;
+
+  if(_dupenv_s(&ptr, &len, str.c_str()) == 0 && ptr != nullptr) {
+    std::string res(ptr, len);
+    std::free(ptr);
+    return true;
+  }
+  return false;
+
+#else
+  auto ptr = std::getenv(str.c_str());
+  return ptr ? true : false;
+#endif
+}
+
+// Procedure: relax_cpu
+//inline void relax_cpu() {
+//#ifdef TF_HAS_MM_PAUSE
+//  _mm_pause();
+//#endif
+//}
+
+
+
+}  // end of namespace tf -----------------------------------------------------
+
+
+
+
+
+
+
+
+
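For illustration only (not added by this patch): a minimal sketch of how the utilities in os.hpp might be used, assuming the header is included so that tf::get_env, tf::has_env, and tf::CachelineAligned are visible. The TF_NUM_WORKERS variable name below is purely hypothetical:

    #include <cstddef>
    #include <cstdlib>   // std::strtoul
    #include <string>
    #include <vector>

    // Read a hypothetical worker-count override from the environment.
    inline size_t num_workers_from_env(size_t fallback) {
      if(!tf::has_env("TF_NUM_WORKERS")) {
        return fallback;
      }
      const std::string v = tf::get_env("TF_NUM_WORKERS");
      const unsigned long n = std::strtoul(v.c_str(), nullptr, 10);
      return n ? static_cast<size_t>(n) : fallback;
    }

    // One counter per worker, each padded to 2x the cacheline size so that
    // neighbouring workers never share a cache line (no false sharing).
    struct PerWorkerCounters {
      std::vector<tf::CachelineAligned<size_t>> counters;
    };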
diff --git a/myxpcs/include/taskflow_/utility/serializer.hpp b/myxpcs/include/taskflow_/utility/serializer.hpp
new file mode 100644
index 0000000..aab00f2
--- /dev/null
+++ b/myxpcs/include/taskflow_/utility/serializer.hpp
@@ -0,0 +1,1135 @@
+#pragma once
+
+#include <type_traits>
+#include <iterator>
+#include <iostream>
+#include <fstream>
+#include <stack>
+#include <queue>
+#include <vector>
+#include <algorithm>
+#include <memory>
+#include <functional>
+#include <map>
+#include <set>
+#include <unordered_map>
+#include <unordered_set>
+#include <sstream>
+#include <list>
+#include <forward_list>
+#include <numeric>
+#include <iomanip>
+#include <cassert>
+#include <cmath>
+#include <array>
+#include <string>
+#include <variant>
+#include <optional>
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// Supported C++ STL type
+// ----------------------------------------------------------------------------
+
+// std::basic_string
+template <typename T>
+struct is_std_basic_string : std::false_type {};
+
+template <typename... ArgsT>
+struct is_std_basic_string <std::basic_string<ArgsT...>> : std::true_type {};
+
+template <typename T>
+constexpr bool is_std_basic_string_v = is_std_basic_string<T>::value;
+
+// std::array
+template <typename T>
+struct is_std_array : std::false_type {};
+
+template <typename T, size_t N>
+struct is_std_array <std::array<T, N>> : std::true_type {};
+
+template <typename T>
+constexpr bool is_std_array_v = is_std_array<T>::value;
+
+// std::vector
+template <typename T>
+struct is_std_vector : std::false_type {};
+
+template <typename... ArgsT>
+struct is_std_vector <std::vector<ArgsT...>> : std::true_type {};
+
+template <typename T>
+constexpr bool is_std_vector_v = is_std_vector<T>::value;
+
+// std::deque
+template <typename T>
+struct is_std_deque : std::false_type {};
+
+template <typename... ArgsT>
+struct is_std_deque <std::deque<ArgsT...>> : std::true_type {};
+
+template <typename T>
+constexpr bool is_std_deque_v = is_std_deque<T>::value;
+
+// std::list
+template <typename T>
+struct is_std_list : std::false_type {};
+
+template <typename... ArgsT>
+struct is_std_list <std::list<ArgsT...>> : std::true_type {};
+
+template <typename T>
+constexpr bool is_std_list_v = is_std_list<T>::value;
+
+// std::forward_list
+template <typename T>
+struct is_std_forward_list : std::false_type {};
+
+template <typename... ArgsT>
+struct is_std_forward_list <std::forward_list<ArgsT...>> : std::true_type {};
+
+template <typename T>
+constexpr bool is_std_forward_list_v = is_std_forward_list<T>::value;
+
+// std::map
+template <typename T>
+struct is_std_map : std::false_type {};
+
+template <typename... ArgsT>
+struct is_std_map <std::map<ArgsT...>> : std::true_type {};
+
+template <typename T>
+constexpr bool is_std_map_v = is_std_map<T>::value;
+
+// std::unordered_map
+template <typename T>
+struct is_std_unordered_map : std::false_type {};
+
+template <typename... ArgsT>
+struct is_std_unordered_map <std::unordered_map<ArgsT...>> : std::true_type {};
+
+template <typename T>
+constexpr bool is_std_unordered_map_v = is_std_unordered_map<T>::value;
+
+// std::set
+template <typename T>
+struct is_std_set : std::false_type {};
+
+template <typename... ArgsT>
+struct is_std_set <std::set<ArgsT...>> : std::true_type {};
+
+template <typename T>
+constexpr bool is_std_set_v = is_std_set<T>::value;
+
+// std::unordered_set
+template <typename T>
+struct is_std_unordered_set : std::false_type {};
+
+template <typename... ArgsT>
+struct is_std_unordered_set <std::unordered_set<ArgsT...>> : std::true_type {};
+
+template <typename T>
+constexpr bool is_std_unordered_set_v = is_std_unordered_set<T>::value;
+
+// std::variant
+template <typename T>
+struct is_std_variant : std::false_type {};
+
+template <typename... ArgsT>
+struct is_std_variant <std::variant<ArgsT...>> : std::true_type {};
+
+template <typename T>
+constexpr bool is_std_variant_v = is_std_variant<T>::value;
+
+// std::optional
+template <typename T>
+struct is_std_optional : std::false_type {};
+
+template <typename... ArgsT>
+struct is_std_optional <std::optional<ArgsT...>> : std::true_type {};
+
+template <typename T>
+constexpr bool is_std_optional_v = is_std_optional<T>::value;
+
+// std::unique_ptr
+template <typename T>
+struct is_std_unique_ptr : std::false_type {};
+
+template <typename... ArgsT>
+struct is_std_unique_ptr <std::unique_ptr<ArgsT...>> : std::true_type {};
+
+template <typename T>
+constexpr bool is_std_unique_ptr_v = is_std_unique_ptr<T>::value;
+
+// std::shared_ptr
+template <typename T>
+struct is_std_shared_ptr : std::false_type {};
+
+template <typename... ArgsT>
+struct is_std_shared_ptr <std::shared_ptr<ArgsT...>> : std::true_type {};
+
+template <typename T>
+constexpr bool is_std_shared_ptr_v = is_std_shared_ptr<T>::value;
+
+// std::duration
+template <typename T> struct is_std_duration : std::false_type {};
+
+template <typename... ArgsT>
+struct is_std_duration<std::chrono::duration<ArgsT...>> : std::true_type {};
+
+template <typename T>
+constexpr bool is_std_duration_v = is_std_duration<T>::value;
+
+// std::time_point
+template <typename T>
+struct is_std_time_point : std::false_type {};
+
+template <typename... ArgsT>
+struct is_std_time_point<std::chrono::time_point<ArgsT...>> : std::true_type {};
+
+template <typename T>
+constexpr bool is_std_time_point_v = is_std_time_point<T>::value;
+
+// std::tuple
+template <typename T>
+struct is_std_tuple : std::false_type {};
+
+template <typename... ArgsT>
+struct is_std_tuple<std::tuple<ArgsT...>> : std::true_type {};
+
+template <typename T>
+constexpr bool is_std_tuple_v = is_std_tuple<T>::value;
+
+//-----------------------------------------------------------------------------
+// Type extraction.
+//-----------------------------------------------------------------------------
+
+// ExtractType: forward declaration
+template <size_t, typename>
+struct ExtractType;
+
+// ExtractType_t: alias interface
+template <size_t idx, typename C>
+using ExtractType_t = typename ExtractType<idx, C>::type;
+
+// ExtractType: base
+template <template <typename...> typename C, typename T, typename... RestT>
+struct ExtractType <0, C<T, RestT...>> {
+  using type = T;
+};
+
+// ExtractType: base
+template <typename T>
+struct ExtractType <0, T> {
+  using type = T;
+};
+
+// ExtractType: recursive definition.
+template <size_t idx, template <typename...> typename C, typename T, typename... RestT>
+struct ExtractType <idx, C<T, RestT...>> : ExtractType<idx-1, C<RestT...>> {
+};
+
+// ----------------------------------------------------------------------------
+// Size Wrapper
+// ----------------------------------------------------------------------------
+
+// Struct: SizeTag
+// Class that wraps a given size item which can be customized.
+template <typename T>
+class SizeTag {
+
+  public:
+
+    using type = std::conditional_t<std::is_lvalue_reference_v<T>, T, std::decay_t<T>>;
+
+    SizeTag(T&& item) : _item(std::forward<T>(item)) {}
+
+    SizeTag& operator = (const SizeTag&) = delete;
+
+    inline const T& get() const {return _item;}
+
+    template <typename ArchiverT>
+    auto save(ArchiverT & ar) const { return ar(_item); }
+
+    template <typename ArchiverT>
+    auto load(ArchiverT & ar) { return ar(_item); }
+
+  private:
+
+    type _item;
+};
+
+// Function: make_size_tag
+template <typename T>
+SizeTag<T> make_size_tag(T&& t) {
+  return { std::forward<T>(t) };
+}
+
+// ----------------------------------------------------------------------------
+// Map Item Wrapper
+// ----------------------------------------------------------------------------
+
+// Class: MapItem
+template <typename KeyT, typename ValueT>
+class MapItem {
+
+  public:
+
+    using KeyType = std::conditional_t <std::is_lvalue_reference_v<KeyT>, KeyT, std::decay_t<KeyT>>;
+    using ValueType = std::conditional_t <std::is_lvalue_reference_v<ValueT>, ValueT, std::decay_t<ValueT>>;
+
+    MapItem(KeyT&& k, ValueT&& v) : _key(std::forward<KeyT>(k)), _value(std::forward<ValueT>(v)) {}
+    MapItem& operator = (const MapItem&) = delete;
+
+    inline const KeyT& key() const { return _key; }
+    inline const ValueT& value() const { return _value; }
+
+    template <typename ArchiverT>
+    auto save(ArchiverT & ar) const { return ar(_key, _value); }
+
+    template <typename ArchiverT>
+    auto load(ArchiverT & ar) { return ar(_key, _value); }
+
+  private:
+
+    KeyType _key;
+    ValueType _value;
+};
+
+// Function: make_kv_pair
+template <typename KeyT, typename ValueT>
+MapItem<KeyT, ValueT> make_kv_pair(KeyT&& k, ValueT&& v) {
+  return { std::forward<KeyT>(k), std::forward<ValueT>(v) };
+}
+
+// ----------------------------------------------------------------------------
+// Serializer Definition
+// ----------------------------------------------------------------------------
+
+template <typename T>
+constexpr auto is_default_serializable_v = (
+  std::is_arithmetic_v<T>    ||
+  std::is_enum_v<T>          ||
+  is_std_basic_string_v<T>   ||
+  is_std_vector_v<T>         ||
+  is_std_deque_v<T>          ||
+  is_std_list_v<T>           ||
+  is_std_forward_list_v<T>   ||
+  is_std_map_v<T>            ||
+  is_std_unordered_map_v<T>  ||
+  is_std_set_v<T>            ||
+  is_std_unordered_set_v<T>  ||
+  is_std_duration_v<T>       ||
+  is_std_time_point_v<T>     ||
+  is_std_variant_v<T>        ||
+  is_std_optional_v<T>       ||
+  is_std_tuple_v<T>          ||
+  is_std_array_v<T>
+);
+
+
+// Class: Serializer
+template <typename Stream, typename SizeType = std::streamsize>
+class Serializer {
+
+  public:
+
+    Serializer(Stream& stream);
+
+    template <typename... T>
+    SizeType operator()(T&&... items);
+
+  private:
+
+    Stream& _stream;
+
+    template <typename T,
+      std::enable_if_t<!is_default_serializable_v<std::decay_t<T>>, void>* = nullptr
+    >
+    SizeType _save(T&&);
+
+    template <typename T,
+      std::enable_if_t<std::is_arithmetic_v<std::decay_t<T>>, void>* = nullptr
+    >
+    SizeType _save(T&&);
+
+    template <typename T,
+      std::enable_if_t<is_std_basic_string_v<std::decay_t<T>>, void>* = nullptr
+    >
+    SizeType _save(T&&);
+
+    template <typename T,
+      std::enable_if_t<is_std_vector_v<std::decay_t<T>>, void>* = nullptr
+    >
+    SizeType _save(T&&);
+
+    template <typename T,
+      std::enable_if_t<
+        is_std_deque_v<std::decay_t<T>> ||
+        is_std_list_v<std::decay_t<T>>,
+        void
+      >* = nullptr
+    >
+    SizeType _save(T&&);
+
+    template <typename T,
+      std::enable_if_t<
+        is_std_forward_list_v<std::decay_t<T>>,
+        void
+      >* = nullptr
+    >
+    SizeType _save(T&&);
+
+    template <typename T,
+      std::enable_if_t<
+        is_std_map_v<std::decay_t<T>> ||
+        is_std_unordered_map_v<std::decay_t<T>>,
+        void
+      >* = nullptr
+    >
+    SizeType _save(T&&);
+
+    template <typename T,
+      std::enable_if_t<
+        is_std_set_v<std::decay_t<T>> ||
+        is_std_unordered_set_v<std::decay_t<T>>,
+        void
+      >* = nullptr
+    >
+    SizeType _save(T&&);
+
+    template <typename T,
+      std::enable_if_t<std::is_enum_v<std::decay_t<T>>, void>* = nullptr
+    >
+    SizeType _save(T&&);
+
+    template <typename T,
+      std::enable_if_t<is_std_duration_v<std::decay_t<T>>, void>* = nullptr
+    >
+    SizeType _save(T&&);
+
+    template <typename T,
+      std::enable_if_t<is_std_time_point_v<std::decay_t<T>>, void>* = nullptr
+    >
+    SizeType _save(T&&);
+
+    template <typename T,
+      std::enable_if_t<is_std_optional_v<std::decay_t<T>>, void>* = nullptr
+    >
+    SizeType _save(T&&);
+
+    template <typename T,
+      std::enable_if_t<is_std_variant_v<std::decay_t<T>>, void>* = nullptr
+    >
+    SizeType _save(T&&);
+
+    template <typename T,
+      std::enable_if_t<is_std_tuple_v<std::decay_t<T>>, void>* = nullptr
+    >
+    SizeType _save(T&&);
+
+    template <typename T,
+      std::enable_if_t<is_std_array_v<std::decay_t<T>>, void>* = nullptr
+    >
+    SizeType _save(T&&);
+
+
+};
+
+// Constructor
+template <typename Stream, typename SizeType>
+Serializer<Stream, SizeType>::Serializer(Stream& stream) : _stream(stream) {
+}
+
+// Operator ()
+template <typename Stream, typename SizeType>
+template <typename... T>
+SizeType Serializer<Stream, SizeType>::operator() (T&&... items) {
+  return (_save(std::forward<T>(items)) + ...);
+}
+
+// arithmetic data type
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<std::is_arithmetic_v<std::decay_t<T>>, void>*
+>
+SizeType Serializer<Stream, SizeType>::_save(T&& t) {
+  _stream.write(reinterpret_cast<const char*>(std::addressof(t)), sizeof(t));
+  return sizeof(t);
+}
+
+// std::basic_string
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<is_std_basic_string_v<std::decay_t<T>>, void>*
+>
+SizeType Serializer<Stream, SizeType>::_save(T&& t) {
+  using U = std::decay_t<T>;
+  auto sz = _save(make_size_tag(t.size()));
+  _stream.write(
+    reinterpret_cast<const char*>(t.data()),
+    t.size()*sizeof(typename U::value_type)
+  );
+  return sz + t.size()*sizeof(typename U::value_type);
+}
+
+// std::vector
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<is_std_vector_v<std::decay_t<T>>, void>*
+>
+SizeType Serializer<Stream, SizeType>::_save(T&& t) {
+
+  using U = std::decay_t<T>;
+
+  auto sz = _save(make_size_tag(t.size()));
+
+  if constexpr (std::is_arithmetic_v<typename U::value_type>) {
+    _stream.write(
+      reinterpret_cast<const char*>(t.data()),
+      t.size() * sizeof(typename U::value_type)
+    );
+    sz += t.size() * sizeof(typename U::value_type);
+  } else {
+    for(auto&& item : t) {
+      sz += _save(item);
+    }
+  }
+
+  return sz;
+}
+
+// std::list and std::deque
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<is_std_deque_v<std::decay_t<T>> ||
+                   is_std_list_v<std::decay_t<T>>, void>*
+>
+SizeType Serializer<Stream, SizeType>::_save(T&& t) {
+  auto sz = _save(make_size_tag(t.size()));
+  for(auto&& item : t) {
+    sz += _save(item);
+  }
+  return sz;
+}
+
+// std::forward_list
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<is_std_forward_list_v<std::decay_t<T>>, void>*
+>
+SizeType Serializer<Stream, SizeType>::_save(T&& t) {
+  auto sz = _save(make_size_tag(std::distance(t.begin(), t.end())));
+  for(auto&& item : t) {
+    sz += _save(item);
+  }
+  return sz;
+}
+
+// std::map and std::unordered_map
+template <typename Stream, typename SizeType>
+template <typename T, std::enable_if_t<
+  is_std_map_v<std::decay_t<T>> ||
+  is_std_unordered_map_v<std::decay_t<T>>,
+  void
+>*>
+SizeType Serializer<Stream, SizeType>::_save(T&& t) {
+  auto sz = _save(make_size_tag(t.size()));
+  for(auto&& [k, v] : t) {
+    sz += _save(make_kv_pair(k, v));
+  }
+  return sz;
+}
+
+// std::set and std::unordered_set
+template <typename Stream, typename SizeType>
+template <typename T, std::enable_if_t<
+  is_std_set_v<std::decay_t<T>> ||
+  is_std_unordered_set_v<std::decay_t<T>>,
+  void
+>*>
+SizeType Serializer<Stream, SizeType>::_save(T&& t) {
+  auto sz = _save(make_size_tag(t.size()));
+  for(auto&& item : t) {
+    sz += _save(item);
+  }
+  return sz;
+}
+
+// enum data type
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<std::is_enum_v<std::decay_t<T>>, void>*
+>
+SizeType Serializer<Stream, SizeType>::_save(T&& t) {
+  using U = std::decay_t<T>;
+  return _save(static_cast<std::underlying_type_t<U>>(t));
+}
+
+// duration data type
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<is_std_duration_v<std::decay_t<T>>, void>*
+>
+SizeType Serializer<Stream, SizeType>::_save(T&& t) {
+  return _save(t.count());
+}
+
+// time point data type
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<is_std_time_point_v<std::decay_t<T>>, void>*
+>
+SizeType Serializer<Stream, SizeType>::_save(T&& t) {
+  return _save(t.time_since_epoch());
+}
+
+// optional data type
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<is_std_optional_v<std::decay_t<T>>, void>*
+>
+SizeType Serializer<Stream, SizeType>::_save(T&& t) {
+  if(bool flag = t.has_value(); flag) {
+    return _save(flag) + _save(*t);
+  }
+  else {
+    return _save(flag);
+  }
+}
+
+// variant type
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<is_std_variant_v<std::decay_t<T>>, void>*
+>
+SizeType Serializer<Stream, SizeType>::_save(T&& t) {
+  return _save(t.index()) +
+         std::visit([&] (auto&& arg){ return _save(arg);}, t);
+}
+
+// tuple type
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<is_std_tuple_v<std::decay_t<T>>, void>*
+>
+SizeType Serializer<Stream, SizeType>::_save(T&& t) {
+  return std::apply(
+    [&] (auto&&... args) {
+      return (_save(std::forward<decltype(args)>(args)) + ... + 0);
+    },
+    std::forward<T>(t)
+  );
+}
+
+// array
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<is_std_array_v<std::decay_t<T>>, void>*
+>
+SizeType Serializer<Stream, SizeType>::_save(T&& t) {
+
+  using U = std::decay_t<T>;
+
+  static_assert(std::tuple_size<U>::value > 0, "Array size can't be zero");
+
+  SizeType sz;
+
+  if constexpr(std::is_arithmetic_v<typename U::value_type>) {
+    _stream.write(reinterpret_cast<const char*>(t.data()), sizeof(t));
+    sz = sizeof(t);
+  }
+  else {
+    sz = 0;
+    for(auto&& item : t) {
+      sz += _save(item);
+    }
+  }
+
+  return sz;
+}
+
+// custom save method
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<!is_default_serializable_v<std::decay_t<T>>, void>*
+>
+SizeType Serializer<Stream, SizeType>::_save(T&& t) {
+  return t.save(*this);
+}
+
+// ----------------------------------------------------------------------------
+// DeSerializer Definition
+// ----------------------------------------------------------------------------
+
+template <typename T>
+constexpr auto is_default_deserializable_v =
+  std::is_arithmetic_v<T>    ||
+  std::is_enum_v<T>          ||
+  is_std_basic_string_v<T>   ||
+  is_std_vector_v<T>         ||
+  is_std_deque_v<T>          ||
+  is_std_list_v<T>           ||
+  is_std_forward_list_v<T>   ||
+  is_std_map_v<T>            ||
+  is_std_unordered_map_v<T>  ||
+  is_std_set_v<T>            ||
+  is_std_unordered_set_v<T>  ||
+  is_std_duration_v<T>       ||
+  is_std_time_point_v<T>     ||
+  is_std_variant_v<T>        ||
+  is_std_optional_v<T>       ||
+  is_std_tuple_v<T>          ||
+  is_std_array_v<T>;
+
+// Class: Deserializer
+template <typename Stream, typename SizeType = std::streamsize>
+class Deserializer {
+
+  public:
+
+    Deserializer(Stream& stream);
+
+    template <typename... T>
+    SizeType operator()(T&&... items);
+
+  private:
+
+    Stream& _stream;
+
+    // Function: _variant_helper
+    template <
+      size_t I = 0, typename... ArgsT,
+      std::enable_if_t<I==sizeof...(ArgsT)>* = nullptr
+    >
+    SizeType _variant_helper(size_t, std::variant<ArgsT...>&);
+
+    // Function: _variant_helper
+    template <
+      size_t I = 0, typename... ArgsT,
+      std::enable_if_t<I<sizeof...(ArgsT)>* = nullptr
+    >
+    SizeType _variant_helper(size_t, std::variant<ArgsT...>&);
+
+    template <typename T,
+      std::enable_if_t<std::is_arithmetic_v<std::decay_t<T>>, void>* = nullptr
+    >
+    SizeType _load(T&&);
+
+    template <typename T,
+      std::enable_if_t<is_std_basic_string_v<std::decay_t<T>>, void>* = nullptr
+    >
+    SizeType _load(T&&);
+
+    template <typename T,
+      std::enable_if_t<is_std_vector_v<std::decay_t<T>>, void>* = nullptr
+    >
+    SizeType _load(T&&);
+
+    template <typename T,
+      std::enable_if_t<
+        is_std_deque_v<std::decay_t<T>> ||
+        is_std_list_v<std::decay_t<T>>  ||
+        is_std_forward_list_v<std::decay_t<T>>,
+        void
+      >* = nullptr
+    >
+    SizeType _load(T&&);
+
+    template <typename T,
+      std::enable_if_t<is_std_map_v<std::decay_t<T>>, void>* = nullptr
+    >
+    SizeType _load(T&&);
+
+    template <typename T,
+      std::enable_if_t<is_std_unordered_map_v<std::decay_t<T>>, void>* = nullptr
+    >
+    SizeType _load(T&&);
+
+    template <typename T,
+      std::enable_if_t<is_std_set_v<std::decay_t<T>>, void>* = nullptr
+    >
+    SizeType _load(T&&);
+
+    template <typename T,
+      std::enable_if_t<is_std_unordered_set_v<std::decay_t<T>>, void>* = nullptr
+    >
+    SizeType _load(T&&);
+
+    template <typename T,
+      std::enable_if_t<std::is_enum_v<std::decay_t<T>>, void>* = nullptr
+    >
+    SizeType _load(T&&);
+
+    template <typename T,
+      std::enable_if_t<is_std_duration_v<std::decay_t<T>>, void>* = nullptr
+    >
+    SizeType _load(T&&);
+
+    template <typename T,
+      std::enable_if_t<is_std_time_point_v<std::decay_t<T>>, void>* = nullptr
+    >
+    SizeType _load(T&&);
+
+    template <typename T,
+      std::enable_if_t<is_std_optional_v<std::decay_t<T>>, void>* = nullptr
+    >
+    SizeType _load(T&&);
+
+    template <typename T,
+      std::enable_if_t<is_std_variant_v<std::decay_t<T>>, void>* = nullptr
+    >
+    SizeType _load(T&&);
+
+    template <typename T,
+      std::enable_if_t<is_std_tuple_v<std::decay_t<T>>, void>* = nullptr
+    >
+    SizeType _load(T&&);
+
+    template <typename T,
+      std::enable_if_t<is_std_array_v<std::decay_t<T>>, void>* = nullptr
+    >
+    SizeType _load(T&&);
+
+    template <typename T,
+      std::enable_if_t<!is_default_deserializable_v<std::decay_t<T>>, void>* = nullptr
+    >
+    SizeType _load(T&&);
+};
+
+// Constructor
+template <typename Stream, typename SizeType>
+Deserializer<Stream, SizeType>::Deserializer(Stream& stream) : _stream(stream) {
+}
+
+// Operator ()
+template <typename Stream, typename SizeType>
+template <typename... T>
+SizeType Deserializer<Stream, SizeType>::operator() (T&&... items) {
+  return (_load(std::forward<T>(items)) + ...);
+}
+
+// Function: _variant_helper
+template <typename Stream, typename SizeType>
+template <size_t I, typename... ArgsT, std::enable_if_t<I==sizeof...(ArgsT)>*>
+SizeType Deserializer<Stream, SizeType>::_variant_helper(size_t, std::variant<ArgsT...>&) {
+  return 0;
+}
+
+// Function: _variant_helper
+template <typename Stream, typename SizeType>
+template <size_t I, typename... ArgsT, std::enable_if_t<I<sizeof...(ArgsT)>*>
+SizeType Deserializer<Stream, SizeType>::_variant_helper(size_t i, std::variant<ArgsT...>& v) {
+  if(i == 0) {
+    using type = ExtractType_t<I, std::variant<ArgsT...>>;
+    if(v.index() != I) {
+      static_assert(
+        std::is_default_constructible<type>::value,
+        "Failed to archive variant (type should be default constructible T())"
+      );
+      v = type();
+    }
+    return _load(*std::get_if<type>(&v));
+  }
+  return _variant_helper<I+1, ArgsT...>(i-1, v);
+}
+
+// arithmetic data type
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<std::is_arithmetic_v<std::decay_t<T>>, void>*
+>
+SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
+  _stream.read(reinterpret_cast<char*>(std::addressof(t)), sizeof(t));
+  return sizeof(t);
+}
+
+// std::basic_string
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<is_std_basic_string_v<std::decay_t<T>>, void>*
+>
+SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
+  using U = std::decay_t<T>;
+  typename U::size_type num_chars;
+  auto sz = _load(make_size_tag(num_chars));
+  t.resize(num_chars);
+  _stream.read(reinterpret_cast<char*>(t.data()), num_chars*sizeof(typename U::value_type));
+  return sz + num_chars*sizeof(typename U::value_type);
+}
+
+// std::vector
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<is_std_vector_v<std::decay_t<T>>, void>*
+>
+SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
+
+  using U = std::decay_t<T>;
+
+  typename U::size_type num_data;
+
+  auto sz = _load(make_size_tag(num_data));
+
+  if constexpr(std::is_arithmetic_v<typename U::value_type>) {
+    t.resize(num_data);
+    _stream.read(reinterpret_cast<char*>(t.data()), num_data * sizeof(typename U::value_type));
+    sz += num_data * sizeof(typename U::value_type);
+  }
+  else {
+    t.resize(num_data);
+    for(auto && v : t) {
+      sz += _load(v);
+    }
+  }
+  return sz;
+}
+
+// std::list and std::deque
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<is_std_deque_v<std::decay_t<T>> ||
+                   is_std_list_v<std::decay_t<T>>  ||
+                   is_std_forward_list_v<std::decay_t<T>>, void>*
+>
+SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
+  using U = std::decay_t<T>;
+
+  typename U::size_type num_data;
+  auto sz = _load(make_size_tag(num_data));
+
+  t.resize(num_data);
+  for(auto && v : t) {
+    sz += _load(v);
+  }
+  return sz;
+}
+
+// std::map
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<is_std_map_v<std::decay_t<T>>, void>*
+>
+SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
+
+  using U = std::decay_t<T>;
+
+  typename U::size_type num_data;
+  auto sz = _load(make_size_tag(num_data));
+
+  t.clear();
+  auto hint = t.begin();
+
+  typename U::key_type k;
+  typename U::mapped_type v;
+
+  for(size_t i=0; i<num_data; ++i) {
+    sz += _load(make_kv_pair(k, v));
+    hint = t.emplace_hint(hint, std::move(k), std::move(v));
+  }
+  return sz;
+}
+
+// std::unordered_map
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<is_std_unordered_map_v<std::decay_t<T>>, void>*
+>
+SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
+  using U = std::decay_t<T>;
+  typename U::size_type num_data;
+  auto sz = _load(make_size_tag(num_data));
+
+  t.clear();
+  t.reserve(num_data);
+
+  typename U::key_type k;
+  typename U::mapped_type v;
+
+  for(size_t i=0; i<num_data; ++i) {
+    sz += _load(make_kv_pair(k, v));
+    t.emplace(std::move(k), std::move(v));
+  }
+
+  return sz;
+}
+
+// std::set
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<is_std_set_v<std::decay_t<T>>, void>*
+>
+SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
+
+  using U = std::decay_t<T>;
+
+  typename U::size_type num_data;
+  auto sz = _load(make_size_tag(num_data));
+
+  t.clear();
+  auto hint = t.begin();
+
+  typename U::key_type k;
+
+  for(size_t i=0; i<num_data; ++i) {
+    sz += _load(k);
+    hint = t.emplace_hint(hint, std::move(k));
+  }
+  return sz;
+}
+
+// std::unordered_set
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<is_std_unordered_set_v<std::decay_t<T>>, void>*
+>
+SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
+
+  using U = std::decay_t<T>;
+
+  typename U::size_type num_data;
+  auto sz = _load(make_size_tag(num_data));
+
+  t.clear();
+  t.reserve(num_data);
+
+  typename U::key_type k;
+
+  for(size_t i=0; i<num_data; ++i) {
+    sz += _load(k);
+    t.emplace(std::move(k));
+  }
+  return sz;
+}
+
+// enum data type
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<std::is_enum_v<std::decay_t<T>>, void>*
+>
+SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
+  using U = std::decay_t<T>;
+  std::underlying_type_t<U> k;
+  auto sz = _load(k);
+  t = static_cast<U>(k);
+  return sz;
+}
+
+// duration data type
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<is_std_duration_v<std::decay_t<T>>, void>*
+>
+SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
+  using U = std::decay_t<T>;
+  typename U::rep count;
+  auto s = _load(count);
+  t = U{count};
+  return s;
+}
+
+// time point data type
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<is_std_time_point_v<std::decay_t<T>>, void>*
+>
+SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
+  using U = std::decay_t<T>;
+  typename U::duration elapsed;
+  auto s = _load(elapsed);
+  t = U{elapsed};
+  return s;
+}
+
+// optional data type
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<is_std_optional_v<std::decay_t<T>>, void>*
+>
+SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
+
+  using U = std::decay_t<T>;
+
+  bool has_value;
+  auto s = _load(has_value);
+  if(has_value) {
+    if(!t) {
+      t = typename U::value_type();
+    }
+    s += _load(*t);
+  }
+  else {
+    t.reset();
+  }
+  return s;
+}
+
+// variant type
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<is_std_variant_v<std::decay_t<T>>, void>*
+>
+SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
+  std::decay_t<decltype(t.index())> idx;
+  auto s = _load(idx);
+  return s + _variant_helper(idx, t);
+}
+
+// tuple type
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<is_std_tuple_v<std::decay_t<T>>, void>*
+>
+SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
+  return std::apply(
+    [&] (auto&&... args) {
+      return (_load(std::forward<decltype(args)>(args)) + ... + 0);
+    },
+    std::forward<T>(t)
+  );
+}
+
+// array
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<is_std_array_v<std::decay_t<T>>, void>*
+>
+SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
+
+  using U = std::decay_t<T>;
+
+  static_assert(std::tuple_size<U>::value > 0, "Array size can't be zero");
+
+  SizeType sz;
+
+  if constexpr(std::is_arithmetic_v<typename U::value_type>) {
+    _stream.read(reinterpret_cast<char*>(t.data()), sizeof(t));
+    sz = sizeof(t);
+  }
+  else {
+    sz = 0;
+    for(auto && v : t) {
+      sz += _load(v);
+    }
+  }
+
+  return sz;
+}
+
+// custom load method
+template <typename Stream, typename SizeType>
+template <typename T,
+  std::enable_if_t<!is_default_deserializable_v<std::decay_t<T>>, void>*
+>
+SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
+  return t.load(*this);
+}
+
+}  // end of namespace tf -----------------------------------------------------
+
+
+
+
+
+
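For illustration only (not added by this patch): a minimal round-trip sketch of the Serializer/Deserializer pair over a std::stringstream. The Point type is a hypothetical user type; it becomes archivable by exposing save()/load() hooks that forward its members to the archiver, which is exactly what the custom save/load overloads above expect:

    #include <cassert>
    #include <sstream>
    #include <string>
    #include <vector>

    struct Point {
      int x{0}, y{0};
      template <typename Ar> auto save(Ar& ar) const { return ar(x, y); }
      template <typename Ar> auto load(Ar& ar)       { return ar(x, y); }
    };

    inline void round_trip_example() {
      std::stringstream ss;

      std::vector<int> v{1, 2, 3};
      std::string      s{"xpcs"};
      Point            p{4, 5};

      tf::Serializer<std::stringstream> out(ss);
      out(v, s, p);                       // size tags + raw bytes into ss

      std::vector<int> v2;
      std::string      s2;
      Point            p2;

      tf::Deserializer<std::stringstream> in(ss);
      in(v2, s2, p2);                     // read back in the same order

      assert(v2 == v && s2 == s && p2.x == 4 && p2.y == 5);
    }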
diff --git a/myxpcs/include/taskflow_/utility/singleton.hpp b/myxpcs/include/taskflow_/utility/singleton.hpp
new file mode 100644
index 0000000..aab50bc
--- /dev/null
+++ b/myxpcs/include/taskflow_/utility/singleton.hpp
@@ -0,0 +1,33 @@
+#pragma once
+
+namespace tf {
+
+/** @class Singleton
+
+@brief class template to create a thread-safe singleton object
+
+*/
+template <typename T>
+class Singleton {
+
+  public:
+
+  /**
+  @brief get a reference to the singleton object
+  */
+  inline static T& get() {
+    static T instance;
+    return instance;
+  }
+
+  private:
+
+    Singleton() = default;
+    ~Singleton() = default;
+    Singleton(const Singleton&)= delete;
+    Singleton& operator=(const Singleton&)= delete;
+};
+
+
+
+}  // end of namespace tf -----------------------------------------------------
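For illustration only (not added by this patch): a minimal usage sketch of the Singleton template above; the Config type is purely hypothetical. Any default-constructible T works, since get() constructs a function-local static T (a Meyers singleton) that the language guarantees is initialized exactly once, even under concurrent first use:

    // Hypothetical application-wide settings object.
    struct Config {
      int  num_workers = 4;
      bool verbose     = false;
    };

    inline void singleton_example() {
      auto& cfg = tf::Singleton<Config>::get();   // same instance on every call
      cfg.num_workers = 8;

      auto& again = tf::Singleton<Config>::get();
      // &again == &cfg, so again.num_workers is now 8 as well
      (void)again;
    }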
diff --git a/myxpcs/include/taskflow_/utility/small_vector.hpp b/myxpcs/include/taskflow_/utility/small_vector.hpp
new file mode 100644
index 0000000..a42c264
--- /dev/null
+++ b/myxpcs/include/taskflow_/utility/small_vector.hpp
@@ -0,0 +1,1048 @@
+// SmallVector modified from LLVM
+
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+#include <initializer_list>
+#include <iterator>
+#include <memory>
+
+#if defined(__GNUC__)
+  #define TF_LIKELY(x) (__builtin_expect((x), 1))
+  #define TF_UNLIKELY(x) (__builtin_expect((x), 0))
+#else
+  #define TF_LIKELY(x) (x)
+  #define TF_UNLIKELY(x) (x)
+#endif
+
+/**
+@file small_vector.hpp
+@brief small vector include file
+*/
+
+namespace tf { namespace detail {
+
+/**
+@private
+@brief NextCapacity - Returns the next power of two (in 64-bits)
+       that is strictly greater than A.  Returns zero on overflow.
+       This function assumes A to be positive.
+*/
+inline uint64_t NextCapacity(uint64_t A) {
+  A |= (A >> 1);
+  A |= (A >> 2);
+  A |= (A >> 4);
+  A |= (A >> 8);
+  A |= (A >> 16);
+  A |= (A >> 32);
+  return A + 1;
+}
+
+}}  // end of namespace tf::detail --------------------------------------------
+
+
+namespace tf {
+
+/**
+@private
+*/
+template <typename T>
+struct IsPod : std::integral_constant<bool, std::is_standard_layout<T>::value &&
+                                            std::is_trivial<T>::value> {};
+
+/**
+@private
+*/
+class SmallVectorBase {
+protected:
+  void *BeginX, *EndX, *CapacityX;
+
+protected:
+  SmallVectorBase(void *FirstEl, size_t Size)
+    : BeginX(FirstEl), EndX(FirstEl), CapacityX((char*)FirstEl+Size) {}
+
+  /// This is an implementation of the grow() method which only works
+  /// on POD-like data types and is out of line to reduce code duplication.
+  void grow_pod(void *FirstEl, size_t MinSizeInBytes, size_t TSize){
+    size_t CurSizeBytes = size_in_bytes();
+    size_t NewCapacityInBytes = 2 * capacity_in_bytes() + TSize; // Always grow.
+    if (NewCapacityInBytes < MinSizeInBytes) {
+      NewCapacityInBytes = MinSizeInBytes;
+    }
+
+    void *NewElts;
+    if (BeginX == FirstEl) {
+      NewElts = std::malloc(NewCapacityInBytes);
+
+      // Copy the elements over.  No need to run dtors on PODs.
+      memcpy(NewElts, this->BeginX, CurSizeBytes);
+    } else {
+      // If this wasn't grown from the inline copy, grow the allocated space.
+      NewElts = realloc(this->BeginX, NewCapacityInBytes);
+    }
+    //assert(NewElts && "Out of memory");
+
+    this->EndX = (char*)NewElts+CurSizeBytes;
+    this->BeginX = NewElts;
+    this->CapacityX = (char*)this->BeginX + NewCapacityInBytes;
+  }
+
+public:
+  /// This returns size()*sizeof(T).
+  size_t size_in_bytes() const {
+    return size_t((char*)EndX - (char*)BeginX);
+  }
+
+  /// capacity_in_bytes - This returns capacity()*sizeof(T).
+  size_t capacity_in_bytes() const {
+    return size_t((char*)CapacityX - (char*)BeginX);
+  }
+
+  bool empty() const { return BeginX == EndX; }
+};
+
+/**
+@private
+*/
+template <typename T, unsigned N> struct SmallVectorStorage;
+
+/**
+@private
+*/
+template <typename T, typename = void>
+class SmallVectorTemplateCommon : public SmallVectorBase {
+
+  private:
+  template <typename, unsigned> friend struct SmallVectorStorage;
+
+  template <typename X>
+  struct AlignedUnionType {
+    alignas(X) std::byte buff[std::max(sizeof(std::byte), sizeof(X))];
+  };
+
+  // Allocate raw space for N elements of type T.  If T has a ctor or dtor, we
+  // don't want it to be automatically run, so we need to represent the space as
+  // something else.  Use an array of char of sufficient alignment.
+  
+  // deprecated in c++23
+  //typedef typename std::aligned_union<1, T>::type U;
+  typedef AlignedUnionType<T> U;
+
+  U FirstEl;
+  // Space after 'FirstEl' is clobbered, do not add any instance vars after it.
+
+  protected:
+  SmallVectorTemplateCommon(size_t Size) : SmallVectorBase(&FirstEl, Size) {}
+
+  void grow_pod(size_t MinSizeInBytes, size_t TSize) {
+    SmallVectorBase::grow_pod(&FirstEl, MinSizeInBytes, TSize);
+  }
+
+  /// Return true if this is a smallvector which has not had dynamic
+  /// memory allocated for it.
+  bool isSmall() const {
+    return BeginX == static_cast<const void*>(&FirstEl);
+  }
+
+  /// Put this vector in a state of being small.
+  void resetToSmall() {
+    BeginX = EndX = CapacityX = &FirstEl;
+  }
+
+  void setEnd(T *P) { this->EndX = P; }
+
+  public:
+  typedef size_t size_type;
+  typedef ptrdiff_t difference_type;
+  typedef T value_type;
+  typedef T *iterator;
+  typedef const T *const_iterator;
+
+  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+  typedef std::reverse_iterator<iterator> reverse_iterator;
+
+  typedef T &reference;
+  typedef const T &const_reference;
+  typedef T *pointer;
+  typedef const T *const_pointer;
+
+  // forward iterator creation methods.
+  inline iterator begin() { return (iterator)this->BeginX; }
+  inline const_iterator begin() const { return (const_iterator)this->BeginX; }
+  inline iterator end() { return (iterator)this->EndX; }
+  inline const_iterator end() const { return (const_iterator)this->EndX; }
+
+  protected:
+
+  iterator capacity_ptr() { return (iterator)this->CapacityX; }
+  const_iterator capacity_ptr() const { return (const_iterator)this->CapacityX;}
+
+  public:
+
+  // reverse iterator creation methods.
+  reverse_iterator rbegin()            { return reverse_iterator(end()); }
+  const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); }
+  reverse_iterator rend()              { return reverse_iterator(begin()); }
+  const_reverse_iterator rend() const { return const_reverse_iterator(begin());}
+
+  inline size_type size() const { return end()-begin(); }
+  inline size_type max_size() const { return size_type(-1) / sizeof(T); }
+
+  /// Return the total number of elements in the currently allocated buffer.
+  size_t capacity() const { return capacity_ptr() - begin(); }
+
+  /// Return a pointer to the vector's buffer, even if empty().
+  pointer data() { return pointer(begin()); }
+  /// Return a pointer to the vector's buffer, even if empty().
+  const_pointer data() const { return const_pointer(begin()); }
+
+  inline reference operator[](size_type idx) {
+    //assert(idx < size());
+    return begin()[idx];
+  }
+
+  inline const_reference operator[](size_type idx) const {
+    //assert(idx < size());
+    return begin()[idx];
+  }
+
+  reference front() {
+    //assert(!empty());
+    return begin()[0];
+  }
+
+  const_reference front() const {
+    //assert(!empty());
+    return begin()[0];
+  }
+
+  reference back() {
+    //assert(!empty());
+    return end()[-1];
+  }
+
+  const_reference back() const {
+    //assert(!empty());
+    return end()[-1];
+  }
+};
+
+/**
+@private
+*/
+template <typename T, bool isPodLike>
+class SmallVectorTemplateBase : public SmallVectorTemplateCommon<T> {
+
+protected:
+  SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon<T>(Size) {}
+
+  static void destroy_range(T *S, T *E) {
+    while (S != E) {
+      --E;
+      E->~T();
+    }
+  }
+
+  /// Move the range [I, E) into the uninitialized memory starting with "Dest",
+  /// constructing elements as needed.
+  template<typename It1, typename It2>
+  static void uninitialized_move(It1 I, It1 E, It2 Dest) {
+    std::uninitialized_copy(std::make_move_iterator(I),
+                            std::make_move_iterator(E), Dest);
+  }
+
+  /// Copy the range [I, E) onto the uninitialized memory starting with "Dest",
+  /// constructing elements as needed.
+  template<typename It1, typename It2>
+  static void uninitialized_copy(It1 I, It1 E, It2 Dest) {
+    std::uninitialized_copy(I, E, Dest);
+  }
+
+  /// Grow the allocated memory (without initializing new elements), doubling
+  /// the size of the allocated memory. Guarantees space for at least one more
+  /// element, or MinSize more elements if specified.
+  void grow(size_t MinSize = 0);
+
+public:
+  void push_back(const T &Elt) {
+    if (TF_UNLIKELY(this->EndX >= this->CapacityX))
+      this->grow();
+    ::new ((void*) this->end()) T(Elt);
+    this->setEnd(this->end()+1);
+  }
+
+  void push_back(T &&Elt) {
+    if (TF_UNLIKELY(this->EndX >= this->CapacityX))
+      this->grow();
+    ::new ((void*) this->end()) T(::std::move(Elt));
+    this->setEnd(this->end()+1);
+  }
+
+  void pop_back() {
+    this->setEnd(this->end()-1);
+    this->end()->~T();
+  }
+};
+
+/**
+@private
+*/
+template <typename T, bool isPodLike>
+void SmallVectorTemplateBase<T, isPodLike>::grow(size_t MinSize) {
+  size_t CurCapacity = this->capacity();
+  size_t CurSize = this->size();
+  // Always grow, even from zero.
+  size_t NewCapacity = size_t(tf::detail::NextCapacity(CurCapacity+2));
+  if (NewCapacity < MinSize)
+    NewCapacity = MinSize;
+  T *NewElts = static_cast<T*>(std::malloc(NewCapacity*sizeof(T)));
+
+  // Move the elements over.
+  this->uninitialized_move(this->begin(), this->end(), NewElts);
+
+  // Destroy the original elements.
+  destroy_range(this->begin(), this->end());
+
+  // If this wasn't grown from the inline copy, deallocate the old space.
+  if (!this->isSmall())
+    std::free(this->begin());
+
+  this->setEnd(NewElts+CurSize);
+  this->BeginX = NewElts;
+  this->CapacityX = this->begin()+NewCapacity;
+}
+
+/**
+@private
+*/
+template <typename T>
+class SmallVectorTemplateBase<T, true> : public SmallVectorTemplateCommon<T> {
+protected:
+  SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon<T>(Size) {}
+
+  // No need to do a destroy loop for POD's.
+  static void destroy_range(T *, T *) {}
+
+  /// Move the range [I, E) onto the uninitialized memory
+  /// starting with "Dest", constructing elements into it as needed.
+  template<typename It1, typename It2>
+  static void uninitialized_move(It1 I, It1 E, It2 Dest) {
+    // Just do a copy.
+    uninitialized_copy(I, E, Dest);
+  }
+
+  /// Copy the range [I, E) onto the uninitialized memory
+  /// starting with "Dest", constructing elements into it as needed.
+  template<typename It1, typename It2>
+  static void uninitialized_copy(It1 I, It1 E, It2 Dest) {
+    // Arbitrary iterator types; just use the basic implementation.
+    std::uninitialized_copy(I, E, Dest);
+  }
+
+  /// Copy the range [I, E) onto the uninitialized memory
+  /// starting with "Dest", constructing elements into it as needed.
+  template <typename T1, typename T2>
+  static void uninitialized_copy(
+      T1 *I, T1 *E, T2 *Dest,
+      typename std::enable_if<std::is_same<typename std::remove_const<T1>::type,
+                                           T2>::value>::type * = nullptr) {
+    // Use memcpy for PODs iterated by pointers (which includes SmallVector
+    // iterators): std::uninitialized_copy optimizes to memmove, but we can
+    // use memcpy here. Note that I and E are iterators and thus might be
+    // invalid for memcpy if they are equal.
+    if (I != E)
+      memcpy(Dest, I, (E - I) * sizeof(T));
+  }
+
+  /// Double the size of the allocated memory, guaranteeing space for at
+  /// least one more element or MinSize if specified.
+  void grow(size_t MinSize = 0) {
+    this->grow_pod(MinSize*sizeof(T), sizeof(T));
+  }
+public:
+  void push_back(const T &Elt) {
+    if (TF_UNLIKELY(this->EndX >= this->CapacityX))
+      this->grow();
+    memcpy(this->end(), &Elt, sizeof(T));
+    this->setEnd(this->end()+1);
+  }
+
+  void pop_back() {
+    this->setEnd(this->end()-1);
+  }
+};
+
+/**
+@private
+*/
+template <typename T>
+class SmallVectorImpl : public SmallVectorTemplateBase<T, IsPod<T>::value> {
+  typedef SmallVectorTemplateBase<T, IsPod<T>::value> SuperClass;
+
+  SmallVectorImpl(const SmallVectorImpl&) = delete;
+
+public:
+  typedef typename SuperClass::iterator iterator;
+  typedef typename SuperClass::const_iterator const_iterator;
+  typedef typename SuperClass::size_type size_type;
+
+protected:
+  // Default ctor - Initialize to empty.
+  explicit SmallVectorImpl(unsigned N)
+    : SmallVectorTemplateBase<T, IsPod<T>::value>(N*sizeof(T)) {
+  }
+
+public:
+  ~SmallVectorImpl() {
+    // Destroy the constructed elements in the vector.
+    this->destroy_range(this->begin(), this->end());
+
+    // If this wasn't grown from the inline copy, deallocate the old space.
+    if (!this->isSmall())
+      std::free(this->begin());
+  }
+
+
+  void clear() {
+    this->destroy_range(this->begin(), this->end());
+    this->EndX = this->BeginX;
+  }
+
+  void resize(size_type N) {
+    if (N < this->size()) {
+      this->destroy_range(this->begin()+N, this->end());
+      this->setEnd(this->begin()+N);
+    } else if (N > this->size()) {
+      if (this->capacity() < N)
+        this->grow(N);
+      for (auto I = this->end(), E = this->begin() + N; I != E; ++I)
+        new (&*I) T();
+      this->setEnd(this->begin()+N);
+    }
+  }
+
+  void resize(size_type N, const T &NV) {
+    if (N < this->size()) {
+      this->destroy_range(this->begin()+N, this->end());
+      this->setEnd(this->begin()+N);
+    } else if (N > this->size()) {
+      if (this->capacity() < N)
+        this->grow(N);
+      std::uninitialized_fill(this->end(), this->begin()+N, NV);
+      this->setEnd(this->begin()+N);
+    }
+  }
+
+  void reserve(size_type N) {
+    if (this->capacity() < N)
+      this->grow(N);
+  }
+
+  T pop_back_val() {
+    T Result = ::std::move(this->back());
+    this->pop_back();
+    return Result;
+  }
+
+  void swap(SmallVectorImpl &RHS);
+
+  /// Add the specified range to the end of the SmallVector.
+  template<typename in_iter>
+  void append(in_iter in_start, in_iter in_end) {
+    size_type NumInputs = std::distance(in_start, in_end);
+    // Grow allocated space if needed.
+    if (NumInputs > size_type(this->capacity_ptr()-this->end()))
+      this->grow(this->size()+NumInputs);
+
+    // Copy the new elements over.
+    this->uninitialized_copy(in_start, in_end, this->end());
+    this->setEnd(this->end() + NumInputs);
+  }
+
+  /// Add the specified range to the end of the SmallVector.
+  void append(size_type NumInputs, const T &Elt) {
+    // Grow allocated space if needed.
+    if (NumInputs > size_type(this->capacity_ptr()-this->end()))
+      this->grow(this->size()+NumInputs);
+
+    // Copy the new elements over.
+    std::uninitialized_fill_n(this->end(), NumInputs, Elt);
+    this->setEnd(this->end() + NumInputs);
+  }
+
+  void append(std::initializer_list<T> IL) {
+    append(IL.begin(), IL.end());
+  }
+
+  void assign(size_type NumElts, const T &Elt) {
+    clear();
+    if (this->capacity() < NumElts)
+      this->grow(NumElts);
+    this->setEnd(this->begin()+NumElts);
+    std::uninitialized_fill(this->begin(), this->end(), Elt);
+  }
+
+  void assign(std::initializer_list<T> IL) {
+    clear();
+    append(IL);
+  }
+
+  iterator erase(const_iterator CI) {
+    // Just cast away constness because this is a non-const member function.
+    iterator I = const_cast<iterator>(CI);
+
+    //assert(I >= this->begin() && "Iterator to erase is out of bounds.");
+    //assert(I < this->end() && "Erasing at past-the-end iterator.");
+
+    iterator N = I;
+    // Shift all elts down one.
+    std::move(I+1, this->end(), I);
+    // Drop the last elt.
+    this->pop_back();
+    return(N);
+  }
+
+  iterator erase(const_iterator CS, const_iterator CE) {
+    // Just cast away constness because this is a non-const member function.
+    iterator S = const_cast<iterator>(CS);
+    iterator E = const_cast<iterator>(CE);
+
+    //assert(S >= this->begin() && "Range to erase is out of bounds.");
+    //assert(S <= E && "Trying to erase invalid range.");
+    //assert(E <= this->end() && "Trying to erase past the end.");
+
+    iterator N = S;
+    // Shift all elts down.
+    iterator I = std::move(E, this->end(), S);
+    // Drop the last elts.
+    this->destroy_range(I, this->end());
+    this->setEnd(I);
+    return(N);
+  }
+
+  iterator insert(iterator I, T &&Elt) {
+    if (I == this->end()) {  // Important special case for empty vector.
+      this->push_back(::std::move(Elt));
+      return this->end()-1;
+    }
+
+    //assert(I >= this->begin() && "Insertion iterator is out of bounds.");
+    //assert(I <= this->end() && "Inserting past the end of the vector.");
+
+    if (this->EndX >= this->CapacityX) {
+      size_t EltNo = I-this->begin();
+      this->grow();
+      I = this->begin()+EltNo;
+    }
+
+    ::new ((void*) this->end()) T(::std::move(this->back()));
+    // Push everything else over.
+    std::move_backward(I, this->end()-1, this->end());
+    this->setEnd(this->end()+1);
+
+    // If we just moved the element we're inserting, be sure to update
+    // the reference.
+    T *EltPtr = &Elt;
+    if (I <= EltPtr && EltPtr < this->EndX)
+      ++EltPtr;
+
+    *I = ::std::move(*EltPtr);
+    return I;
+  }
+
+  iterator insert(iterator I, const T &Elt) {
+    if (I == this->end()) {  // Important special case for empty vector.
+      this->push_back(Elt);
+      return this->end()-1;
+    }
+
+    //assert(I >= this->begin() && "Insertion iterator is out of bounds.");
+    //assert(I <= this->end() && "Inserting past the end of the vector.");
+
+    if (this->EndX >= this->CapacityX) {
+      size_t EltNo = I-this->begin();
+      this->grow();
+      I = this->begin()+EltNo;
+    }
+    ::new ((void*) this->end()) T(std::move(this->back()));
+    // Push everything else over.
+    std::move_backward(I, this->end()-1, this->end());
+    this->setEnd(this->end()+1);
+
+    // If we just moved the element we're inserting, be sure to update
+    // the reference.
+    const T *EltPtr = &Elt;
+    if (I <= EltPtr && EltPtr < this->EndX)
+      ++EltPtr;
+
+    *I = *EltPtr;
+    return I;
+  }
+
+  iterator insert(iterator I, size_type NumToInsert, const T &Elt) {
+    // Convert iterator to elt# to avoid invalidating iterator when we reserve()
+    size_t InsertElt = I - this->begin();
+
+    if (I == this->end()) {  // Important special case for empty vector.
+      append(NumToInsert, Elt);
+      return this->begin()+InsertElt;
+    }
+
+    //assert(I >= this->begin() && "Insertion iterator is out of bounds.");
+    //assert(I <= this->end() && "Inserting past the end of the vector.");
+
+    // Ensure there is enough space.
+    reserve(this->size() + NumToInsert);
+
+    // Uninvalidate the iterator.
+    I = this->begin()+InsertElt;
+
+    // If there are more elements between the insertion point and the end of the
+    // range than there are being inserted, we can use a simple approach to
+    // insertion.  Since we already reserved space, we know that this won't
+    // reallocate the vector.
+    if (size_t(this->end()-I) >= NumToInsert) {
+      T *OldEnd = this->end();
+      append(std::move_iterator<iterator>(this->end() - NumToInsert),
+             std::move_iterator<iterator>(this->end()));
+
+      // Copy the existing elements that get replaced.
+      std::move_backward(I, OldEnd-NumToInsert, OldEnd);
+
+      std::fill_n(I, NumToInsert, Elt);
+      return I;
+    }
+
+    // Otherwise, we're inserting more elements than exist already, and we're
+    // not inserting at the end.
+
+    // Move over the elements that we're about to overwrite.
+    T *OldEnd = this->end();
+    this->setEnd(this->end() + NumToInsert);
+    size_t NumOverwritten = OldEnd-I;
+    this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten);
+
+    // Replace the overwritten part.
+    std::fill_n(I, NumOverwritten, Elt);
+
+    // Insert the non-overwritten middle part.
+    std::uninitialized_fill_n(OldEnd, NumToInsert-NumOverwritten, Elt);
+    return I;
+  }
+
+  template<typename ItTy>
+  iterator insert(iterator I, ItTy From, ItTy To) {
+    // Convert iterator to elt# to avoid invalidating iterator when we reserve()
+    size_t InsertElt = I - this->begin();
+
+    if (I == this->end()) {  // Important special case for empty vector.
+      append(From, To);
+      return this->begin()+InsertElt;
+    }
+
+    //assert(I >= this->begin() && "Insertion iterator is out of bounds.");
+    //assert(I <= this->end() && "Inserting past the end of the vector.");
+
+    size_t NumToInsert = std::distance(From, To);
+
+    // Ensure there is enough space.
+    reserve(this->size() + NumToInsert);
+
+    // Uninvalidate the iterator.
+    I = this->begin()+InsertElt;
+
+    // If there are more elements between the insertion point and the end of the
+    // range than there are being inserted, we can use a simple approach to
+    // insertion.  Since we already reserved space, we know that this won't
+    // reallocate the vector.
+    if (size_t(this->end()-I) >= NumToInsert) {
+      T *OldEnd = this->end();
+      append(std::move_iterator<iterator>(this->end() - NumToInsert),
+             std::move_iterator<iterator>(this->end()));
+
+      // Copy the existing elements that get replaced.
+      std::move_backward(I, OldEnd-NumToInsert, OldEnd);
+
+      std::copy(From, To, I);
+      return I;
+    }
+
+    // Otherwise, we're inserting more elements than exist already, and we're
+    // not inserting at the end.
+
+    // Move over the elements that we're about to overwrite.
+    T *OldEnd = this->end();
+    this->setEnd(this->end() + NumToInsert);
+    size_t NumOverwritten = OldEnd-I;
+    this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten);
+
+    // Replace the overwritten part.
+    for (T *J = I; NumOverwritten > 0; --NumOverwritten) {
+      *J = *From;
+      ++J; ++From;
+    }
+
+    // Insert the non-overwritten middle part.
+    this->uninitialized_copy(From, To, OldEnd);
+    return I;
+  }
+
+  void insert(iterator I, std::initializer_list<T> IL) {
+    insert(I, IL.begin(), IL.end());
+  }
+
+  template <typename... ArgTypes> void emplace_back(ArgTypes &&... Args) {
+    if (TF_UNLIKELY(this->EndX >= this->CapacityX))
+      this->grow();
+    ::new ((void *)this->end()) T(std::forward<ArgTypes>(Args)...);
+    this->setEnd(this->end() + 1);
+  }
+
+  SmallVectorImpl &operator=(const SmallVectorImpl &RHS);
+
+  SmallVectorImpl &operator=(SmallVectorImpl &&RHS);
+
+  bool operator==(const SmallVectorImpl &RHS) const {
+    if (this->size() != RHS.size()) return false;
+    return std::equal(this->begin(), this->end(), RHS.begin());
+  }
+  bool operator!=(const SmallVectorImpl &RHS) const {
+    return !(*this == RHS);
+  }
+
+  bool operator<(const SmallVectorImpl &RHS) const {
+    return std::lexicographical_compare(this->begin(), this->end(),
+                                        RHS.begin(), RHS.end());
+  }
+
+  /// Set the array size to \p N, which the current array must have enough
+  /// capacity for.
+  ///
+  /// This does not construct or destroy any elements in the vector.
+  ///
+  /// Clients can use this in conjunction with capacity() to write past the end
+  /// of the buffer when they know that more elements are available, and only
+  /// update the size later. This avoids the cost of value initializing elements
+  /// which will only be overwritten.
+  void set_size(size_type N) {
+    //assert(N <= this->capacity());
+    this->setEnd(this->begin() + N);
+  }
+};
+
+
+template <typename T>
+void SmallVectorImpl<T>::swap(SmallVectorImpl<T> &RHS) {
+  if (this == &RHS) return;
+
+  // We can only avoid copying elements if neither vector is small.
+  if (!this->isSmall() && !RHS.isSmall()) {
+    std::swap(this->BeginX, RHS.BeginX);
+    std::swap(this->EndX, RHS.EndX);
+    std::swap(this->CapacityX, RHS.CapacityX);
+    return;
+  }
+  if (RHS.size() > this->capacity())
+    this->grow(RHS.size());
+  if (this->size() > RHS.capacity())
+    RHS.grow(this->size());
+
+  // Swap the shared elements.
+  size_t NumShared = this->size();
+  if (NumShared > RHS.size()) NumShared = RHS.size();
+  for (size_type i = 0; i != NumShared; ++i)
+    std::swap((*this)[i], RHS[i]);
+
+  // Copy over the extra elts.
+  if (this->size() > RHS.size()) {
+    size_t EltDiff = this->size() - RHS.size();
+    this->uninitialized_copy(this->begin()+NumShared, this->end(), RHS.end());
+    RHS.setEnd(RHS.end()+EltDiff);
+    this->destroy_range(this->begin()+NumShared, this->end());
+    this->setEnd(this->begin()+NumShared);
+  } else if (RHS.size() > this->size()) {
+    size_t EltDiff = RHS.size() - this->size();
+    this->uninitialized_copy(RHS.begin()+NumShared, RHS.end(), this->end());
+    this->setEnd(this->end() + EltDiff);
+    this->destroy_range(RHS.begin()+NumShared, RHS.end());
+    RHS.setEnd(RHS.begin()+NumShared);
+  }
+}
+
+template <typename T>
+SmallVectorImpl<T> &SmallVectorImpl<T>::
+  operator=(const SmallVectorImpl<T> &RHS) {
+  // Avoid self-assignment.
+  if (this == &RHS) return *this;
+
+  // If we already have sufficient space, assign the common elements, then
+  // destroy any excess.
+  size_t RHSSize = RHS.size();
+  size_t CurSize = this->size();
+  if (CurSize >= RHSSize) {
+    // Assign common elements.
+    iterator NewEnd;
+    if (RHSSize)
+      NewEnd = std::copy(RHS.begin(), RHS.begin()+RHSSize, this->begin());
+    else
+      NewEnd = this->begin();
+
+    // Destroy excess elements.
+    this->destroy_range(NewEnd, this->end());
+
+    // Trim.
+    this->setEnd(NewEnd);
+    return *this;
+  }
+
+  // If we have to grow to have enough elements, destroy the current elements.
+  // This allows us to avoid copying them during the grow.
+  // FIXME: don't do this if they're efficiently moveable.
+  if (this->capacity() < RHSSize) {
+    // Destroy current elements.
+    this->destroy_range(this->begin(), this->end());
+    this->setEnd(this->begin());
+    CurSize = 0;
+    this->grow(RHSSize);
+  } else if (CurSize) {
+    // Otherwise, use assignment for the already-constructed elements.
+    std::copy(RHS.begin(), RHS.begin()+CurSize, this->begin());
+  }
+
+  // Copy construct the new elements in place.
+  this->uninitialized_copy(RHS.begin()+CurSize, RHS.end(),
+                           this->begin()+CurSize);
+
+  // Set end.
+  this->setEnd(this->begin()+RHSSize);
+  return *this;
+}
+
+template <typename T>
+SmallVectorImpl<T> &SmallVectorImpl<T>::operator=(SmallVectorImpl<T> &&RHS) {
+  // Avoid self-assignment.
+  if (this == &RHS) return *this;
+
+  // If the RHS isn't small, clear this vector and then steal its buffer.
+  if (!RHS.isSmall()) {
+    this->destroy_range(this->begin(), this->end());
+    if (!this->isSmall()) std::free(this->begin());
+    this->BeginX = RHS.BeginX;
+    this->EndX = RHS.EndX;
+    this->CapacityX = RHS.CapacityX;
+    RHS.resetToSmall();
+    return *this;
+  }
+
+  // If we already have sufficient space, assign the common elements, then
+  // destroy any excess.
+  size_t RHSSize = RHS.size();
+  size_t CurSize = this->size();
+  if (CurSize >= RHSSize) {
+    // Assign common elements.
+    iterator NewEnd = this->begin();
+    if (RHSSize)
+      NewEnd = std::move(RHS.begin(), RHS.end(), NewEnd);
+
+    // Destroy excess elements and trim the bounds.
+    this->destroy_range(NewEnd, this->end());
+    this->setEnd(NewEnd);
+
+    // Clear the RHS.
+    RHS.clear();
+
+    return *this;
+  }
+
+  // If we have to grow to have enough elements, destroy the current elements.
+  // This allows us to avoid copying them during the grow.
+  // FIXME: this may not actually make any sense if we can efficiently move
+  // elements.
+  if (this->capacity() < RHSSize) {
+    // Destroy current elements.
+    this->destroy_range(this->begin(), this->end());
+    this->setEnd(this->begin());
+    CurSize = 0;
+    this->grow(RHSSize);
+  } else if (CurSize) {
+    // Otherwise, use assignment for the already-constructed elements.
+    std::move(RHS.begin(), RHS.begin()+CurSize, this->begin());
+  }
+
+  // Move-construct the new elements in place.
+  this->uninitialized_move(RHS.begin()+CurSize, RHS.end(),
+                           this->begin()+CurSize);
+
+  // Set end.
+  this->setEnd(this->begin()+RHSSize);
+
+  RHS.clear();
+  return *this;
+}
+
+/**
+@private
+*/
+template <typename T, unsigned N>
+struct SmallVectorStorage {
+  /**
+  @private
+  */
+  typename SmallVectorTemplateCommon<T>::U InlineElts[N - 1];
+};
+
+/**
+@private
+*/
+template <typename T> struct SmallVectorStorage<T, 1> {};
+
+/**
+@private
+*/
+template <typename T> struct SmallVectorStorage<T, 0> {};
+
+/**
+@brief class to define a vector optimized for small arrays
+
+@tparam T data type
+@tparam N maximum number of elements stored in-place (the small-size threshold)
+
+The class defines a C++ STL-styled vector (a variable-sized array)
+optimized for the case when the array is small.
+It contains some number of elements in-place,
+which allows it to avoid heap allocation when the actual number of
+elements is below that threshold. This allows normal @em small cases to be
+fast without losing generality for large inputs.
+All the methods of [std::vector](https://en.cppreference.com/w/cpp/container/vector)
+apply to this class as well.
+
+The implementation is adapted from the LLVM codebase.
+*/
+template <typename T, unsigned N = 2>
+class SmallVector : public SmallVectorImpl<T> {
+  /// Inline space for elements which aren't stored in the base class.
+  SmallVectorStorage<T, N> Storage;
+
+public:
+
+  /**
+  @brief constructs an empty vector
+  */
+  SmallVector() : SmallVectorImpl<T>(N) {
+  }
+
+  /**
+  @brief constructs a vector with @c Size copies of elements of value @c Value
+  */
+  explicit SmallVector(size_t Size, const T &Value = T())
+    : SmallVectorImpl<T>(N) {
+    this->assign(Size, Value);
+  }
+
+  /**
+  @brief constructs a vector with the contents of the range
+         <tt>[S, E)</tt>
+   */
+  template<typename ItTy>
+  SmallVector(ItTy S, ItTy E) : SmallVectorImpl<T>(N) {
+    this->append(S, E);
+  }
+
+  //template <typename RangeTy>
+  //explicit SmallVector(const tf::iterator_range<RangeTy> &R)
+  //    : SmallVectorImpl<T>(N) {
+  //  this->append(R.begin(), R.end());
+  //}
+
+  /**
+  @brief constructs a vector with the contents of the initializer list @c IL
+  */
+  SmallVector(std::initializer_list<T> IL) : SmallVectorImpl<T>(N) {
+    this->assign(IL);
+  }
+
+  /**
+  @brief constructs the vector with a copy of the contents of @c RHS
+  */
+  SmallVector(const SmallVector &RHS) : SmallVectorImpl<T>(N) {
+    if (!RHS.empty())
+      SmallVectorImpl<T>::operator=(RHS);
+  }
+
+  /**
+  @brief constructs the vector with the contents of @c RHS using move semantics
+  */
+  SmallVector(SmallVector &&RHS) : SmallVectorImpl<T>(N) {
+    if (!RHS.empty())
+      SmallVectorImpl<T>::operator=(::std::move(RHS));
+  }
+
+  /**
+  @brief replaces the contents with a copy of the contents of @c RHS
+  */
+  const SmallVector &operator=(const SmallVector &RHS) {
+    SmallVectorImpl<T>::operator=(RHS);
+    return *this;
+  }
+
+  /**
+  @brief replaces the contents with the contents of @c RHS using move semantics
+  */
+  const SmallVector &operator=(SmallVector &&RHS) {
+    SmallVectorImpl<T>::operator=(::std::move(RHS));
+    return *this;
+  }
+
+  /**
+  @brief constructs a vector with the contents of @c RHS using move semantics
+  */
+  SmallVector(SmallVectorImpl<T> &&RHS) : SmallVectorImpl<T>(N) {
+    if (!RHS.empty())
+      SmallVectorImpl<T>::operator=(::std::move(RHS));
+  }
+
+  /**
+  @brief replaces the contents with the contents of @c RHS using move semantics
+   */
+  const SmallVector &operator=(SmallVectorImpl<T> &&RHS) {
+    SmallVectorImpl<T>::operator=(::std::move(RHS));
+    return *this;
+  }
+
+  /**
+  @brief replaces the contents with a copy of the contents of the initializer list @c IL
+   */
+  const SmallVector &operator=(std::initializer_list<T> IL) {
+    this->assign(IL);
+    return *this;
+  }
+};
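+
+// A brief usage sketch: SmallVector behaves like std::vector but keeps up to
+// N elements in-place before falling back to heap allocation.
+//
+//   tf::SmallVector<int, 4> v {1, 2, 3};   // stored in-place, no heap allocation
+//   v.push_back(4);                        // still in-place (N == 4)
+//   v.push_back(5);                        // exceeds N, spills to the heap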
+
+template<typename T, unsigned N>
+static inline size_t capacity_in_bytes(const SmallVector<T, N> &X) {
+  return X.capacity_in_bytes();
+}
+
+} // end tf namespace ---------------------------------------------------------
+
+namespace std {
+  /// Implement std::swap in terms of SmallVector swap.
+  template<typename T>
+  inline void
+  swap(tf::SmallVectorImpl<T> &LHS, tf::SmallVectorImpl<T> &RHS) {
+    LHS.swap(RHS);
+  }
+
+  /// Implement std::swap in terms of SmallVector swap.
+  template<typename T, unsigned N>
+  inline void
+  swap(tf::SmallVector<T, N> &LHS, tf::SmallVector<T, N> &RHS) {
+    LHS.swap(RHS);
+  }
+}  // end of namespace std ----------------------------------------------------
+
+
diff --git a/myxpcs/include/taskflow_/utility/stream.hpp b/myxpcs/include/taskflow_/utility/stream.hpp
new file mode 100644
index 0000000..34a86ff
--- /dev/null
+++ b/myxpcs/include/taskflow_/utility/stream.hpp
@@ -0,0 +1,32 @@
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include <string>
+
+namespace tf {
+
+// Procedure: ostreamize
+template <typename T>
+void ostreamize(std::ostream& os, T&& token) {
+  os << std::forward<T>(token);
+}
+
+// Procedure: ostreamize
+template <typename T, typename... Rest>
+void ostreamize(std::ostream& os, T&& token, Rest&&... rest) {
+  os << std::forward<T>(token);
+  ostreamize(os, std::forward<Rest>(rest)...);
+}
+
+// Function: stringify
+template <typename... ArgsT>
+std::string stringify(ArgsT&&... args) {
+  std::ostringstream oss;
+  ostreamize(oss, std::forward<ArgsT>(args)...);
+  return oss.str();
+}
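+
+// A brief usage sketch: stringify streams all arguments into one
+// std::ostringstream, so mixed types concatenate without manual conversion.
+//
+//   std::string s = tf::stringify("worker ", 3, " finished in ", 1.5, "s");
+//   // s == "worker 3 finished in 1.5s"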
+
+
+}  // end of namespace tf -----------------------------------------------------
+
diff --git a/myxpcs/include/taskflow_/utility/traits.hpp b/myxpcs/include/taskflow_/utility/traits.hpp
new file mode 100644
index 0000000..dd3953b
--- /dev/null
+++ b/myxpcs/include/taskflow_/utility/traits.hpp
@@ -0,0 +1,303 @@
+#pragma once
+
+#if __has_include(<version>)
+#  include <version>
+#endif
+
+#include <type_traits>
+#include <iterator>
+#include <iostream>
+#include <fstream>
+#include <mutex>
+#include <stack>
+#include <queue>
+#include <vector>
+#include <algorithm>
+#include <memory>
+#include <atomic>
+#include <thread>
+#include <future>
+#include <functional>
+#include <unordered_map>
+#include <unordered_set>
+#include <sstream>
+#include <list>
+#include <numeric>
+#include <random>
+#include <iomanip>
+#include <cassert>
+#include <cmath>
+#include <array>
+#include <string>
+#include <variant>
+#include <optional>
+#include "os.hpp"
+
+namespace tf {
+
+//-----------------------------------------------------------------------------
+// Traits
+//-----------------------------------------------------------------------------
+
+//// Struct: dependent_false
+//template <typename... T>
+//struct dependent_false {
+//  static constexpr bool value = false;
+//};
+//
+//template <typename... T>
+//constexpr auto dependent_false_v = dependent_false<T...>::value;
+
+template<typename> inline constexpr bool dependent_false_v = false;
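+
+// A brief usage sketch: dependent_false_v is the usual idiom for a
+// static_assert that should only fire when the template is instantiated
+// (the function name below is just a placeholder).
+//
+//   template <typename T>
+//   void serialize(T&&) {
+//     static_assert(dependent_false_v<T>, "unsupported type for serialize");
+//   }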
+
+// ----------------------------------------------------------------------------
+// is_pod
+//-----------------------------------------------------------------------------
+template <typename T>
+struct is_pod {
+  static const bool value = std::is_trivial_v<T> && 
+                            std::is_standard_layout_v<T>;
+};
+
+template <typename T>
+constexpr bool is_pod_v = is_pod<T>::value;
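+
+// A brief usage sketch: is_pod_v re-expresses the deprecated std::is_pod
+// check as "trivial and standard-layout".
+//
+//   static_assert(is_pod_v<int>);
+//   static_assert(!is_pod_v<std::string>);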
+
+//-----------------------------------------------------------------------------
+// NoInit
+//-----------------------------------------------------------------------------
+
+template <typename T>
+struct NoInit {
+
+  //static_assert(is_pod_v<T>, "NoInit only supports POD type");
+
+  // constructor without initialization
+  NoInit() noexcept {}
+
+  // implicit conversion T -> NoInit<T>
+  constexpr NoInit(T value) noexcept : v{value} {}
+
+  // implicit conversion NoInit<T> -> T
+  constexpr operator T() const noexcept { return v; }
+
+  T v;
+};
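+
+// A brief usage sketch: NoInit<T> skips value-initialization so a large
+// buffer can be created cheaply and filled later.
+//
+//   std::vector<NoInit<int>> buf(1 << 20);  // elements are left uninitialized
+//   buf[0] = 42;                            // implicit int <-> NoInit<int>
+//   int first = buf[0];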
+
+//-----------------------------------------------------------------------------
+// Move-On-Copy
+//-----------------------------------------------------------------------------
+
+// Struct: MoveOnCopyWrapper
+template <typename T>
+struct MoC {
+
+  MoC(T&& rhs) : object(std::move(rhs)) {}
+  MoC(const MoC& other) : object(std::move(other.object)) {}
+
+  T& get() { return object; }
+
+  mutable T object;
+};
+
+template <typename T>
+auto make_moc(T&& m) {
+  return MoC<T>(std::forward<T>(m));
+}
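+
+// A brief usage sketch: MoC ("move-on-copy") lets a move-only object be held
+// by a copyable callable, e.g. when an API insists on std::function.
+//
+//   auto ptr = std::make_unique<int>(7);
+//   auto moc = make_moc(std::move(ptr));
+//   std::function<void()> fn = [moc]() mutable { *moc.get() += 1; };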
+
+//-----------------------------------------------------------------------------
+// Visitors.
+//-----------------------------------------------------------------------------
+
+//// Overloaded.
+//template <typename... Ts>
+//struct Visitors : Ts... {
+//  using Ts::operator()... ;
+//};
+//
+//template <typename... Ts>
+//Visitors(Ts...) -> Visitors<Ts...>;
+
+// ----------------------------------------------------------------------------
+// std::variant
+// ----------------------------------------------------------------------------
+template <typename T, typename>
+struct get_index;
+
+template <size_t I, typename... Ts>
+struct get_index_impl {};
+
+template <size_t I, typename T, typename... Ts>
+struct get_index_impl<I, T, T, Ts...> : std::integral_constant<size_t, I>{};
+
+template <size_t I, typename T, typename U, typename... Ts>
+struct get_index_impl<I, T, U, Ts...> : get_index_impl<I+1, T, Ts...>{};
+
+template <typename T, typename... Ts>
+struct get_index<T, std::variant<Ts...>> : get_index_impl<0, T, Ts...>{};
+
+template <typename T, typename... Ts>
+constexpr auto get_index_v = get_index<T, Ts...>::value;
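+
+// A brief usage sketch: get_index_v yields the zero-based position of a type
+// among a std::variant's alternatives.
+//
+//   using V = std::variant<int, float, std::string>;
+//   static_assert(get_index_v<float, V> == 1);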
+
+// ----------------------------------------------------------------------------
+// unwrap_reference
+// ----------------------------------------------------------------------------
+
+template <class T>
+struct unwrap_reference { using type = T; };
+
+template <class U>
+struct unwrap_reference<std::reference_wrapper<U>> { using type = U&; };
+
+template<class T>
+using unwrap_reference_t = typename unwrap_reference<T>::type;
+
+template< class T >
+struct unwrap_ref_decay : unwrap_reference<std::decay_t<T>> {};
+
+template<class T>
+using unwrap_ref_decay_t = typename unwrap_ref_decay<T>::type;
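+
+// A brief usage sketch: unwrap_ref_decay_t mirrors C++20 std::unwrap_ref_decay_t;
+// a std::reference_wrapper<U> becomes U&, anything else simply decays.
+//
+//   static_assert(std::is_same_v<unwrap_ref_decay_t<std::reference_wrapper<int>>, int&>);
+//   static_assert(std::is_same_v<unwrap_ref_decay_t<const int&>, int>);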
+
+// ----------------------------------------------------------------------------
+// stateful iterators
+// ----------------------------------------------------------------------------
+
+// STL-styled iterator
+template <typename B, typename E>
+struct stateful_iterator {
+
+  using TB = std::decay_t<unwrap_ref_decay_t<B>>;
+  using TE = std::decay_t<unwrap_ref_decay_t<E>>;
+
+  static_assert(std::is_same_v<TB, TE>, "decayed iterator types must match");
+
+  using type = TB;
+};
+
+template <typename B, typename E>
+using stateful_iterator_t = typename stateful_iterator<B, E>::type;
+
+// raw integral index
+template <typename B, typename E, typename S>
+struct stateful_index {
+
+  using TB = std::decay_t<unwrap_ref_decay_t<B>>;
+  using TE = std::decay_t<unwrap_ref_decay_t<E>>;
+  using TS = std::decay_t<unwrap_ref_decay_t<S>>;
+
+  static_assert(
+    std::is_integral_v<TB>, "decayed beg index must be an integral type"
+  );
+
+  static_assert(
+    std::is_integral_v<TE>, "decayed end index must be an integral type"
+  );
+
+  static_assert(
+    std::is_integral_v<TS>, "decayed step must be an integral type"
+  );
+
+  static_assert(
+    std::is_same_v<TB, TE> && std::is_same_v<TE, TS>,
+    "decayed index and step types must match"
+  );
+
+  using type = TB;
+};
+
+template <typename B, typename E, typename S>
+using stateful_index_t = typename stateful_index<B, E, S>::type;
+
+// ----------------------------------------------------------------------------
+// visit a tuple with a functor at runtime
+// ----------------------------------------------------------------------------
+
+template <typename Func, typename Tuple, size_t N = 0>
+void visit_tuple(Func func, Tuple& tup, size_t idx) {
+  if (N == idx) {
+    std::invoke(func, std::get<N>(tup));
+    return;
+  }
+  if constexpr (N + 1 < std::tuple_size_v<Tuple>) {
+    return visit_tuple<Func, Tuple, N + 1>(func, tup, idx);
+  }
+}
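+
+// A brief usage sketch: visit_tuple applies a functor to the idx-th tuple
+// element, where idx is only known at runtime.
+//
+//   std::tuple<int, std::string> tup {1, "two"};
+//   visit_tuple([](const auto& v){ std::cout << v << '\n'; }, tup, 1);  // prints "two"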
+
+// ----------------------------------------------------------------------------
+// unroll loop
+// ----------------------------------------------------------------------------
+
+// Template unrolled looping construct.
+template<auto beg, auto end, auto step, bool valid = (beg < end)>
+struct Unroll {
+  template<typename F>
+  static void eval(F f) {
+    f(beg);
+    Unroll<beg + step, end, step>::eval(f);
+  }
+};
+
+template<auto beg, auto end, auto step>
+struct Unroll<beg, end, step, false> {
+  template<typename F>
+  static void eval(F) { }
+};
+
+template<auto beg, auto end, auto step, typename F>
+void unroll(F f) {
+  Unroll<beg, end, step>::eval(f);
+}
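+
+// A brief usage sketch: unroll expands the loop body at compile time over the
+// half-open range [beg, end) with the given step.
+//
+//   int sum = 0;
+//   unroll<0, 4, 1>([&](auto i){ sum += i; });  // sum == 0 + 1 + 2 + 3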
+
+// ----------------------------------------------------------------------------
+// make types of variant unique
+// ----------------------------------------------------------------------------
+
+template <typename T, typename... Ts>
+struct filter_duplicates { using type = T; };
+
+template <template <typename...> class C, typename... Ts, typename U, typename... Us>
+struct filter_duplicates<C<Ts...>, U, Us...>
+    : std::conditional_t<(std::is_same_v<U, Ts> || ...)
+                       , filter_duplicates<C<Ts...>, Us...>
+                       , filter_duplicates<C<Ts..., U>, Us...>> {};
+
+template <typename T>
+struct unique_variant;
+
+template <typename... Ts>
+struct unique_variant<std::variant<Ts...>> : filter_duplicates<std::variant<>, Ts...> {};
+
+template <typename T>
+using unique_variant_t = typename unique_variant<T>::type;
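+
+// A brief usage sketch: unique_variant_t removes duplicate alternatives from
+// a std::variant type list.
+//
+//   static_assert(std::is_same_v<
+//     unique_variant_t<std::variant<int, int, float, int>>,
+//     std::variant<int, float>
+//   >);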
+
+
+// ----------------------------------------------------------------------------
+// check if it is default compare
+// ----------------------------------------------------------------------------
+template <typename T> struct is_std_compare : std::false_type { };
+template <typename T> struct is_std_compare<std::less<T>> : std::true_type { };
+template <typename T> struct is_std_compare<std::greater<T>> : std::true_type { };
+
+template <typename T>
+constexpr static bool is_std_compare_v = is_std_compare<T>::value;
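+
+// A brief usage sketch: is_std_compare_v detects the default std::less /
+// std::greater comparators so callers can special-case them.
+//
+//   static_assert(is_std_compare_v<std::less<int>>);
+//   static_assert(!is_std_compare_v<std::greater_equal<int>>);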
+
+// ----------------------------------------------------------------------------
+// check if all types are the same
+// ----------------------------------------------------------------------------
+
+template<bool...> 
+struct bool_pack;
+
+template<bool... bs>
+using all_true = std::is_same<bool_pack<bs..., true>, bool_pack<true, bs...>>;
+
+template <typename T, typename... Ts>
+using all_same = all_true<std::is_same_v<T, Ts>...>;
+
+template <typename T, typename... Ts>
+constexpr bool all_same_v = all_same<T, Ts...>::value;
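+
+// A brief usage sketch: all_same_v is true only when every listed type equals
+// the first one.
+//
+//   static_assert(all_same_v<int, int, int>);
+//   static_assert(!all_same_v<int, int, float>);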
+
+
+}  // end of namespace tf. ----------------------------------------------------
+
+
+
diff --git a/myxpcs/include/taskflow_/utility/uuid.hpp b/myxpcs/include/taskflow_/utility/uuid.hpp
new file mode 100644
index 0000000..11d7f3b
--- /dev/null
+++ b/myxpcs/include/taskflow_/utility/uuid.hpp
@@ -0,0 +1,235 @@
+#pragma once
+
+#include <iostream>
+#include <string>
+#include <cstring>
+#include <limits>
+#include <random>
+#include <chrono>
+
+namespace tf {
+
+// Class: UUID
+//
+// A universally unique identifier (UUID) is an identifier standard used in software
+// construction. A UUID is simply a 128-bit value. The meaning of each bit is defined
+// by any of several variants.
+// For human-readable display, many systems use a canonical format using hexadecimal
+// text with inserted hyphen characters.
+//
+// For example: 123e4567-e89b-12d3-a456-426655440000
+//
+// The intent of UUIDs is to enable distributed systems to uniquely identify information
+// without significant central coordination.
+//
+//   Copyright 2006 Andy Tompkins.
+//   Distributed under the Boost Software License, Version 1.0. (See
+//   accompanying file LICENSE_1_0.txt or copy at
+//   http://www.boost.org/LICENSE_1_0.txt)
+//
+struct UUID {
+
+  using value_type      = uint8_t;
+  using reference       = uint8_t&;
+  using const_reference = const uint8_t&;
+  using iterator        = uint8_t*;
+  using const_iterator  = const uint8_t*;
+  using size_type       = size_t;
+  using difference_type = ptrdiff_t;
+
+  inline UUID();
+
+  UUID(const UUID&) = default;
+  UUID(UUID&&) = default;
+
+  UUID& operator = (const UUID&) = default;
+  UUID& operator = (UUID&&) = default;
+
+  inline static size_type size();
+  inline iterator begin();
+  inline const_iterator begin() const;
+  inline iterator end();
+  inline const_iterator end() const;
+
+  inline bool is_nil() const;
+  inline void swap(UUID& rhs);
+  inline size_t hash_value() const;
+
+  inline bool operator == (const UUID&) const;
+  inline bool operator <  (const UUID&) const;
+  inline bool operator >  (const UUID&) const;
+  inline bool operator != (const UUID&) const;
+  inline bool operator >= (const UUID&) const;
+  inline bool operator <= (const UUID&) const;
+
+  uint8_t data[16] {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+  inline std::string to_string() const;
+};
+
+// Constructor
+inline UUID::UUID() {
+
+  static thread_local std::default_random_engine engine {
+    std::random_device{}()
+  };
+
+  std::uniform_int_distribution<unsigned long> distribution(
+    (std::numeric_limits<unsigned long>::min)(),
+    (std::numeric_limits<unsigned long>::max)()
+  );
+
+  int i = 0;
+  auto random_value = distribution(engine);
+  for (auto it=begin(); it!=end(); ++it, ++i) {
+    if (i == sizeof(unsigned long)) {
+      random_value = distribution(engine);
+      i = 0;
+    }
+    *it = static_cast<UUID::value_type>((random_value >> (i*8)) & 0xFF);
+  }
+
+  // set variant: must be 0b10xxxxxx
+  *(begin()+8) &= 0xBF;
+  *(begin()+8) |= 0x80;
+
+  // set version: must be 0b0100xxxx
+  *(begin()+6) &= 0x4F; //0b01001111
+  *(begin()+6) |= 0x40; //0b01000000
+}
+
+// Function: size
+inline typename UUID::size_type UUID::size() {
+  return 16;
+}
+
+// Function: begin
+inline typename UUID::iterator UUID::begin() {
+  return data;
+}
+
+// Function: begin
+inline typename UUID::const_iterator UUID::begin() const {
+  return data;
+}
+
+// Function: end
+inline typename UUID::iterator UUID::end() {
+  return data+size();
+}
+
+// Function: end
+inline typename UUID::const_iterator UUID::end() const {
+  return data+size();
+}
+
+// Function: is_nil
+inline bool UUID::is_nil() const {
+  for (std::size_t i = 0; i < sizeof(this->data); ++i) {
+    if (this->data[i] != 0U) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Procedure: swap
+inline void UUID::swap(UUID& rhs) {
+  UUID tmp = *this;
+  *this = rhs;
+  rhs = tmp;
+}
+
+// Function: hash_value
+inline size_t UUID::hash_value() const {
+  size_t seed = 0;
+  for(auto i=begin(); i != end(); ++i) {
+    seed ^= static_cast<size_t>(*i) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+  }
+  return seed;
+}
+
+// Operator: ==
+inline bool UUID::operator == (const UUID& rhs) const {
+  return std::memcmp(data, rhs.data, sizeof(data)) == 0;
+}
+
+// Operator: !=
+inline bool UUID::operator != (const UUID& rhs) const {
+  return std::memcmp(data, rhs.data, sizeof(data)) != 0;
+}
+
+// Operator: <
+inline bool UUID::operator < (const UUID& rhs) const {
+  return std::memcmp(data, rhs.data, sizeof(data)) < 0;
+}
+
+// Operator: >
+inline bool UUID::operator > (const UUID& rhs) const {
+  return std::memcmp(data, rhs.data, sizeof(data)) > 0;
+}
+
+// Operator: <=
+inline bool UUID::operator <= (const UUID& rhs) const {
+  return std::memcmp(data, rhs.data, sizeof(data)) <= 0;
+}
+
+// Operator: >=
+inline bool UUID::operator >= (const UUID& rhs) const {
+  return std::memcmp(data, rhs.data, sizeof(data)) >= 0;
+}
+
+// Function: to_string
+inline std::string UUID::to_string() const {
+
+  auto to_char = [](size_t i) {
+    if (i <= 9) return static_cast<char>('0' + i);
+    return static_cast<char>('a' + (i-10));
+  };
+
+  std::string result;
+  result.reserve(36);
+
+  std::size_t i=0;
+  for (auto it = begin(); it!=end(); ++it, ++i) {
+
+    const size_t hi = ((*it) >> 4) & 0x0F;
+    result += to_char(hi);
+
+    const size_t lo = (*it) & 0x0F;
+    result += to_char(lo);
+
+    if (i == 3 || i == 5 || i == 7 || i == 9) {
+      result += '-';
+    }
+  }
+  return result;
+}
+
+// Procedure: swap
+inline void swap(UUID& lhs, UUID& rhs) {
+  lhs.swap(rhs);
+}
+
+// ostream
+inline std::ostream& operator << (std::ostream& os, const UUID& rhs) {
+  os << rhs.to_string();
+  return os;
+}
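+
+// A brief usage sketch: a default-constructed UUID is a random version-4
+// UUID; it prints via operator<< and hashes via std::hash<tf::UUID> (below).
+//
+//   tf::UUID id;
+//   std::cout << id << '\n';                  // e.g. xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx
+//   std::unordered_map<tf::UUID, int> table;
+//   table[id] = 1;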
+
+}  // End of namespace tf. ----------------------------------------------------
+
+//-----------------------------------------------------------------------------
+
+namespace std {
+
+// Partial specialization: hash<tf::UUID>
+template <>
+struct hash<tf::UUID> {
+  size_t operator()(const tf::UUID& rhs) const { return rhs.hash_value(); }
+};
+
+
+}  // End of namespace std. ---------------------------------------------------
+
+
diff --git a/myxpcs/source/function_call.pyx b/myxpcs/source/function_call.pyx
new file mode 100644
index 0000000..93900e7
--- /dev/null
+++ b/myxpcs/source/function_call.pyx
@@ -0,0 +1,69 @@
+import numpy as np
+cimport numpy as np  # for np.ndarray
+
+# NumPy must be initialized before use. When using NumPy from C or Cython you
+# must _always_ call np.import_array() first, or you will get segfaults.
+np.import_array()
+
+
+cdef extern from "numpy/arrayobject.h":
+    void PyArray_ENABLEFLAGS(np.ndarray arr, int flags)
+
+cdef extern from "set_integer.h":
+    void computeXPCS(float*&, float*&)
+
+
+#    cdef int x
+#    set_integer_ref(x)
+#    return x
+#
+#
+#    cdef int[1] x
+#    set_integer_ptr(x)
+#    return x[0]
+#
+#
+#    cdef np.ndarray[int, ndim=1, mode='c'] x
+#
+#    x = np.zeros((1,), dtype=np.int32)
+#    set_integer_ptr(&x[0])
+#    return x[0]
+#
+#
+#    cdef int* x
+#    set_integer_ref_ptr(x)
+#    return x[0]
+#
+#
+#    cdef int* x
+#    set_integer_ptr_ptr(&x)
+#    return x[0]
+#
+#
+#    cdef np.ndarray[int, ndim=1, mode='c'] a
+#
+#    a = np.zeros((4,), dtype=np.int32)
+#    set_integer_arr_ptr(&a[0])
+#    return a
+
+
+cpdef doXPCS(np.ndarray[np.float32_t, ndim=3] in_ptr, np.ndarray[np.float32_t, ndim=2] out_ptr):
+    #cdef:
+    #    float* in_ptr
+    #    float* out_ptr
+    #    np.npy_intp shape[2]
+
+    computeXPCS(&in_ptr[0,0,0], &out_ptr[0,0])
+
+    # 1. Make sure that you have called np.import_array()
+    # http://gael-varoquaux.info/programming/
+    # cython-example-of-exposing-c-computed-arrays-in-python-without-data-copies.html
+    # 2. The OWNDATA flag is important: it tells NumPy to free the data when the Python object is deleted.
+    # https://stackoverflow.com/questions/23872946/force-numpy-ndarray-to-take-ownership-of-its-memory-in-cython/
+    # You can verify that the memory gets freed when the Python object is deleted by using tools such as pmap.
+    #shape[0] = <np.npy_intp>(2)
+    #shape[1] = <np.npy_intp>(2)
+    
+    #cdef np.ndarray[float, ndim=2] a = np.PyArray_SimpleNewFromData(2, shape, np.NPY_FLOAT, out_ptr)
+    #PyArray_ENABLEFLAGS(a, np.NPY_OWNDATA)
+    return 1
\ No newline at end of file
diff --git a/myxpcs/source/set_integer.cpp b/myxpcs/source/set_integer.cpp
new file mode 100644
index 0000000..948bd28
--- /dev/null
+++ b/myxpcs/source/set_integer.cpp
@@ -0,0 +1,30 @@
+#include <iostream>
+#include <memory>
+
+#include <data.h>
+#include <set_integer.h>
+
+// taskflow
+#include <taskflow_/taskflow.hpp>
+
+
+void computeXPCS(float* in, float* out)
+{
+    tf::Executor executor;
+
+
+    //const auto dims = (*in).shape;
+    const std::size_t fs = 10;//dims[0];
+    const std::size_t ss = 10;// dims[1];
+    const std::size_t memoryCells = 55;//dims[2];
+
+    auto mem = std::make_shared<Storage<float>>(std::vector<std::size_t>{fs, ss, memoryCells});
+    std::cout << "blub";
+    std::shared_ptr<Storage<float>> data = TranposeFromImageToTime_v3_block_tf_no_struct_one_taskflow<float>(mem, 3, 3, executor);
+    std::cout << "blib";
+    std::cout << in[0] << "\n";
+    out[0] = 1;
+    out[1] = 2;
+    out[2] = 3;
+    out[3] = 4;
+}
\ No newline at end of file
-- 
GitLab