From ca7b2df90031b804ae73325c2440fdb8a645e5fa Mon Sep 17 00:00:00 2001 From: Mads Jakobsen <mads.jakobsen@xfel.eu> Date: Thu, 25 Apr 2024 14:34:46 +0200 Subject: [PATCH] added myxpcs test package --- myxpcs/include/data.h | 185 ++ myxpcs/include/set_integer.h | 1 + .../include/taskflow_/algorithm/critical.hpp | 78 + .../taskflow_/algorithm/data_pipeline.hpp | 637 +++++ myxpcs/include/taskflow_/algorithm/find.hpp | 551 ++++ .../include/taskflow_/algorithm/for_each.hpp | 171 ++ myxpcs/include/taskflow_/algorithm/launch.hpp | 58 + .../taskflow_/algorithm/partitioner.hpp | 543 ++++ .../include/taskflow_/algorithm/pipeline.hpp | 1663 ++++++++++++ myxpcs/include/taskflow_/algorithm/reduce.hpp | 443 +++ myxpcs/include/taskflow_/algorithm/scan.hpp | 617 +++++ myxpcs/include/taskflow_/algorithm/sort.hpp | 661 +++++ .../include/taskflow_/algorithm/transform.hpp | 199 ++ myxpcs/include/taskflow_/core/async.hpp | 330 +++ myxpcs/include/taskflow_/core/async_task.hpp | 209 ++ .../include/taskflow_/core/declarations.hpp | 60 + myxpcs/include/taskflow_/core/environment.hpp | 8 + myxpcs/include/taskflow_/core/error.hpp | 26 + .../taskflow_/core/executor-module-opt.hpp | 2025 ++++++++++++++ myxpcs/include/taskflow_/core/executor.hpp | 2385 +++++++++++++++++ .../include/taskflow_/core/flow_builder.hpp | 1399 ++++++++++ myxpcs/include/taskflow_/core/graph.hpp | 1017 +++++++ myxpcs/include/taskflow_/core/notifier.hpp | 295 ++ myxpcs/include/taskflow_/core/observer.hpp | 1046 ++++++++ myxpcs/include/taskflow_/core/semaphore.hpp | 132 + myxpcs/include/taskflow_/core/task.hpp | 776 ++++++ myxpcs/include/taskflow_/core/taskflow.hpp | 643 +++++ myxpcs/include/taskflow_/core/topology.hpp | 62 + myxpcs/include/taskflow_/core/tsq.hpp | 441 +++ myxpcs/include/taskflow_/core/worker.hpp | 172 ++ .../include/taskflow_/cuda/algorithm/find.hpp | 294 ++ .../taskflow_/cuda/algorithm/for_each.hpp | 315 +++ .../taskflow_/cuda/algorithm/matmul.hpp | 57 + .../taskflow_/cuda/algorithm/merge.hpp | 585 ++++ .../taskflow_/cuda/algorithm/reduce.hpp | 460 ++++ .../include/taskflow_/cuda/algorithm/scan.hpp | 488 ++++ .../include/taskflow_/cuda/algorithm/sort.hpp | 506 ++++ .../taskflow_/cuda/algorithm/transform.hpp | 282 ++ .../taskflow_/cuda/algorithm/transpose.hpp | 41 + .../include/taskflow_/cuda/cuda_capturer.hpp | 724 +++++ myxpcs/include/taskflow_/cuda/cuda_device.hpp | 342 +++ myxpcs/include/taskflow_/cuda/cuda_error.hpp | 26 + .../taskflow_/cuda/cuda_execution_policy.hpp | 155 ++ myxpcs/include/taskflow_/cuda/cuda_graph.hpp | 805 ++++++ myxpcs/include/taskflow_/cuda/cuda_memory.hpp | 855 ++++++ myxpcs/include/taskflow_/cuda/cuda_meta.hpp | 452 ++++ myxpcs/include/taskflow_/cuda/cuda_object.hpp | 287 ++ .../include/taskflow_/cuda/cuda_optimizer.hpp | 404 +++ myxpcs/include/taskflow_/cuda/cuda_stream.hpp | 226 ++ myxpcs/include/taskflow_/cuda/cuda_task.hpp | 274 ++ myxpcs/include/taskflow_/cuda/cudaflow.hpp | 1024 +++++++ myxpcs/include/taskflow_/dsl/connection.hpp | 53 + myxpcs/include/taskflow_/dsl/dsl.hpp | 13 + myxpcs/include/taskflow_/dsl/meta_macro.hpp | 72 + .../include/taskflow_/dsl/task_analyzer.hpp | 40 + myxpcs/include/taskflow_/dsl/task_dsl.hpp | 104 + myxpcs/include/taskflow_/dsl/task_trait.hpp | 46 + myxpcs/include/taskflow_/dsl/tuple_utils.hpp | 43 + myxpcs/include/taskflow_/dsl/type_list.hpp | 136 + .../taskflow_/sycl/algorithm/reduce.hpp | 487 ++++ .../sycl/algorithm/sycl_for_each.hpp | 88 + .../sycl/algorithm/sycl_transform.hpp | 46 + .../taskflow_/sycl/sycl_execution_policy.hpp | 70 + 
myxpcs/include/taskflow_/sycl/sycl_graph.hpp | 255 ++ myxpcs/include/taskflow_/sycl/sycl_meta.hpp | 517 ++++ myxpcs/include/taskflow_/sycl/sycl_task.hpp | 209 ++ myxpcs/include/taskflow_/sycl/syclflow.hpp | 684 +++++ myxpcs/include/taskflow_/taskflow.hpp | 69 + myxpcs/include/taskflow_/utility/iterator.hpp | 22 + myxpcs/include/taskflow_/utility/macros.hpp | 17 + myxpcs/include/taskflow_/utility/math.hpp | 151 ++ .../include/taskflow_/utility/object_pool.hpp | 778 ++++++ myxpcs/include/taskflow_/utility/os.hpp | 196 ++ .../include/taskflow_/utility/serializer.hpp | 1135 ++++++++ .../include/taskflow_/utility/singleton.hpp | 33 + .../taskflow_/utility/small_vector.hpp | 1048 ++++++++ myxpcs/include/taskflow_/utility/stream.hpp | 32 + myxpcs/include/taskflow_/utility/traits.hpp | 303 +++ myxpcs/include/taskflow_/utility/uuid.hpp | 235 ++ myxpcs/source/function_call.pyx | 69 + myxpcs/source/set_integer.cpp | 30 + 81 files changed, 32416 insertions(+) create mode 100644 myxpcs/include/data.h create mode 100644 myxpcs/include/set_integer.h create mode 100644 myxpcs/include/taskflow_/algorithm/critical.hpp create mode 100644 myxpcs/include/taskflow_/algorithm/data_pipeline.hpp create mode 100644 myxpcs/include/taskflow_/algorithm/find.hpp create mode 100644 myxpcs/include/taskflow_/algorithm/for_each.hpp create mode 100644 myxpcs/include/taskflow_/algorithm/launch.hpp create mode 100644 myxpcs/include/taskflow_/algorithm/partitioner.hpp create mode 100644 myxpcs/include/taskflow_/algorithm/pipeline.hpp create mode 100644 myxpcs/include/taskflow_/algorithm/reduce.hpp create mode 100644 myxpcs/include/taskflow_/algorithm/scan.hpp create mode 100644 myxpcs/include/taskflow_/algorithm/sort.hpp create mode 100644 myxpcs/include/taskflow_/algorithm/transform.hpp create mode 100644 myxpcs/include/taskflow_/core/async.hpp create mode 100644 myxpcs/include/taskflow_/core/async_task.hpp create mode 100644 myxpcs/include/taskflow_/core/declarations.hpp create mode 100644 myxpcs/include/taskflow_/core/environment.hpp create mode 100644 myxpcs/include/taskflow_/core/error.hpp create mode 100644 myxpcs/include/taskflow_/core/executor-module-opt.hpp create mode 100644 myxpcs/include/taskflow_/core/executor.hpp create mode 100644 myxpcs/include/taskflow_/core/flow_builder.hpp create mode 100644 myxpcs/include/taskflow_/core/graph.hpp create mode 100644 myxpcs/include/taskflow_/core/notifier.hpp create mode 100644 myxpcs/include/taskflow_/core/observer.hpp create mode 100644 myxpcs/include/taskflow_/core/semaphore.hpp create mode 100644 myxpcs/include/taskflow_/core/task.hpp create mode 100644 myxpcs/include/taskflow_/core/taskflow.hpp create mode 100644 myxpcs/include/taskflow_/core/topology.hpp create mode 100644 myxpcs/include/taskflow_/core/tsq.hpp create mode 100644 myxpcs/include/taskflow_/core/worker.hpp create mode 100644 myxpcs/include/taskflow_/cuda/algorithm/find.hpp create mode 100644 myxpcs/include/taskflow_/cuda/algorithm/for_each.hpp create mode 100644 myxpcs/include/taskflow_/cuda/algorithm/matmul.hpp create mode 100644 myxpcs/include/taskflow_/cuda/algorithm/merge.hpp create mode 100644 myxpcs/include/taskflow_/cuda/algorithm/reduce.hpp create mode 100644 myxpcs/include/taskflow_/cuda/algorithm/scan.hpp create mode 100644 myxpcs/include/taskflow_/cuda/algorithm/sort.hpp create mode 100644 myxpcs/include/taskflow_/cuda/algorithm/transform.hpp create mode 100644 myxpcs/include/taskflow_/cuda/algorithm/transpose.hpp create mode 100644 myxpcs/include/taskflow_/cuda/cuda_capturer.hpp create mode 
100644 myxpcs/include/taskflow_/cuda/cuda_device.hpp create mode 100644 myxpcs/include/taskflow_/cuda/cuda_error.hpp create mode 100644 myxpcs/include/taskflow_/cuda/cuda_execution_policy.hpp create mode 100644 myxpcs/include/taskflow_/cuda/cuda_graph.hpp create mode 100644 myxpcs/include/taskflow_/cuda/cuda_memory.hpp create mode 100644 myxpcs/include/taskflow_/cuda/cuda_meta.hpp create mode 100644 myxpcs/include/taskflow_/cuda/cuda_object.hpp create mode 100644 myxpcs/include/taskflow_/cuda/cuda_optimizer.hpp create mode 100644 myxpcs/include/taskflow_/cuda/cuda_stream.hpp create mode 100644 myxpcs/include/taskflow_/cuda/cuda_task.hpp create mode 100644 myxpcs/include/taskflow_/cuda/cudaflow.hpp create mode 100644 myxpcs/include/taskflow_/dsl/connection.hpp create mode 100644 myxpcs/include/taskflow_/dsl/dsl.hpp create mode 100644 myxpcs/include/taskflow_/dsl/meta_macro.hpp create mode 100644 myxpcs/include/taskflow_/dsl/task_analyzer.hpp create mode 100644 myxpcs/include/taskflow_/dsl/task_dsl.hpp create mode 100644 myxpcs/include/taskflow_/dsl/task_trait.hpp create mode 100644 myxpcs/include/taskflow_/dsl/tuple_utils.hpp create mode 100644 myxpcs/include/taskflow_/dsl/type_list.hpp create mode 100644 myxpcs/include/taskflow_/sycl/algorithm/reduce.hpp create mode 100644 myxpcs/include/taskflow_/sycl/algorithm/sycl_for_each.hpp create mode 100644 myxpcs/include/taskflow_/sycl/algorithm/sycl_transform.hpp create mode 100644 myxpcs/include/taskflow_/sycl/sycl_execution_policy.hpp create mode 100644 myxpcs/include/taskflow_/sycl/sycl_graph.hpp create mode 100644 myxpcs/include/taskflow_/sycl/sycl_meta.hpp create mode 100644 myxpcs/include/taskflow_/sycl/sycl_task.hpp create mode 100644 myxpcs/include/taskflow_/sycl/syclflow.hpp create mode 100644 myxpcs/include/taskflow_/taskflow.hpp create mode 100644 myxpcs/include/taskflow_/utility/iterator.hpp create mode 100644 myxpcs/include/taskflow_/utility/macros.hpp create mode 100644 myxpcs/include/taskflow_/utility/math.hpp create mode 100644 myxpcs/include/taskflow_/utility/object_pool.hpp create mode 100644 myxpcs/include/taskflow_/utility/os.hpp create mode 100644 myxpcs/include/taskflow_/utility/serializer.hpp create mode 100644 myxpcs/include/taskflow_/utility/singleton.hpp create mode 100644 myxpcs/include/taskflow_/utility/small_vector.hpp create mode 100644 myxpcs/include/taskflow_/utility/stream.hpp create mode 100644 myxpcs/include/taskflow_/utility/traits.hpp create mode 100644 myxpcs/include/taskflow_/utility/uuid.hpp create mode 100644 myxpcs/source/function_call.pyx create mode 100644 myxpcs/source/set_integer.cpp diff --git a/myxpcs/include/data.h b/myxpcs/include/data.h new file mode 100644 index 0000000..d85392b --- /dev/null +++ b/myxpcs/include/data.h @@ -0,0 +1,185 @@ +#include <vector> +#include <cstdint> +#include <cstdlib> +#include <memory> +#include <iostream> + +#include <taskflow_/taskflow.hpp> +#include <taskflow_/algorithm/for_each.hpp> + + +template <typename T> +struct Storage +{ + std::vector<std::size_t> shape{}; + T *ptr{nullptr}; + + Storage(const std::vector<std::size_t> &shape) + : shape{shape} + { + std::size_t numElements = 1; + for (auto element : shape) + { + numElements *= element; + } + + ptr = static_cast<T *>(std::aligned_alloc(sysconf(_SC_PAGESIZE), sizeof(T) * numElements)); + } + + ~Storage() + { + // std::cout << "storage freed"; + free(ptr); + } + + void printStats() + { + std::cout << "Storage Stats: " << getSize() << "\n"; + std::cout << "dim: " << shape.size() << " : "; + for (auto len : 
shape) + { + std::cout << len << ", "; + } + std::cout << std::endl; + } + + std::int32_t getSize() const + { + std::int32_t size = 1; + for (auto len : shape) + { + size *= len; + } + return size; + } +}; + +template <typename T> +using Memory = std::shared_ptr<Storage<T>>; + +template <typename T> +Memory<T> TranposeFromImageToTime_v3_block_tf_no_struct_one_taskflow(const Memory<T> in, std::size_t fastBlockSizeDim, std::size_t slowBlockSizeDim, tf::Executor &executor) +{ + std::cout << "bluib"; + // CDCS::Utility::ScopedConsoleMicrosecondTimer timer("transposing data block fast_write tf one taskflow< " + std::to_string(fastBlockSizeDim) + " , " + std::to_string(slowBlockSizeDim) + " >"); + + const auto dims = (*in).shape; + const std::size_t X = dims[0]; + const std::size_t Y = dims[1]; + const std::size_t Z = dims[2]; + + auto out = std::make_shared<Storage<T>>(std::vector<std::size_t>{Z, X, Y}); + + const std::size_t imagesize = X * Y; + + const std::size_t fastDim = imagesize; + const std::size_t slowDim = Z; + + T *in_ptr = in->ptr; + T *out_ptr = out->ptr; + + // add regular patches + std::size_t fastBlockPos = 0; + std::size_t slowBlockPos = 0; + + tf::Taskflow taskflow; + + if (slowDim >= slowBlockSizeDim && fastDim >= fastBlockSizeDim) + { + while (slowBlockPos + slowBlockSizeDim <= slowDim) + { + + fastBlockPos = 0; + while (fastBlockPos + fastBlockSizeDim <= fastDim) + { + + taskflow.emplace( + [in_ptr, out_ptr, fastBlockSizeDim, slowBlockSizeDim, fastBlockPos, slowBlockPos, fastDim, slowDim]() + { + for (std::size_t fast = 0; fast < fastBlockSizeDim; fast++) + { + for (std::size_t slow = 0; slow < slowBlockSizeDim; slow++) + { + out_ptr[slowBlockPos + fastBlockPos * slowDim + slow + fast * slowDim] = in_ptr[fastBlockPos + slowBlockPos * fastDim + fast + slow * fastDim]; + } + } + }); + fastBlockPos += fastBlockSizeDim; + } + + slowBlockPos += slowBlockSizeDim; + } + } + + std::size_t fastEnd = fastBlockPos; + std::size_t slowEnd = slowBlockPos; + + std::size_t fastLeftover = fastDim - fastEnd; + std::size_t slowLeftover = slowDim - slowEnd; + + // check for leftovers + if (fastLeftover != 0 && slowDim >= slowBlockSizeDim) + { + slowBlockPos = 0; + + while (slowBlockPos + slowBlockSizeDim <= slowDim) + { + taskflow.emplace( + [in_ptr, out_ptr, fastLeftover, slowBlockSizeDim, fastEnd, slowBlockPos, fastDim, slowDim]() + { + for (std::size_t fast = 0; fast < fastLeftover; fast++) + { + for (std::size_t slow = 0; slow < slowBlockSizeDim; slow++) + { + out_ptr[slowBlockPos + fastEnd * slowDim + slow + fast * slowDim] = in_ptr[fastEnd + slowBlockPos * fastDim + fast + slow * fastDim]; + } + } + }); + slowBlockPos += slowBlockSizeDim; + } + + slowBlockPos += slowBlockSizeDim; + } + + // check for leftovers + if (slowLeftover != 0 && fastDim >= fastBlockSizeDim) + { + fastBlockPos = 0; + + while (fastBlockPos + fastBlockSizeDim < fastDim) + { + taskflow.emplace( + [in_ptr, out_ptr, fastBlockSizeDim, slowLeftover, fastBlockPos, slowEnd, fastDim, slowDim]() + { + for (std::size_t fast = 0; fast < fastBlockSizeDim; fast++) + { + for (std::size_t slow = 0; slow < slowLeftover; slow++) + { + out_ptr[slowEnd + fastBlockPos * slowDim + slow + fast * slowDim] = in_ptr[fastBlockPos + slowEnd * fastDim + fast + slow * fastDim]; + } + } + }); + fastBlockPos += fastBlockSizeDim; + } + + fastBlockPos += fastBlockSizeDim; + } + + if (slowLeftover != 0 && fastLeftover != 0) + { + + taskflow.emplace( + [in_ptr, out_ptr, fastLeftover, slowLeftover, fastEnd, slowEnd, fastDim, slowDim]() + { + for 
(std::size_t fast = 0; fast < fastLeftover; fast++) + { + for (std::size_t slow = 0; slow < slowLeftover; slow++) + { + out_ptr[slowEnd + fastEnd * slowDim + slow + fast * slowDim] = in_ptr[fastEnd + slowEnd * fastDim + fast + slow * fastDim]; + } + } + }); + } + executor.run(taskflow).wait(); + return out; +} diff --git a/myxpcs/include/set_integer.h b/myxpcs/include/set_integer.h new file mode 100644 index 0000000..f73f460 --- /dev/null +++ b/myxpcs/include/set_integer.h @@ -0,0 +1 @@ +void computeXPCS(float* in, float* out); \ No newline at end of file diff --git a/myxpcs/include/taskflow_/algorithm/critical.hpp b/myxpcs/include/taskflow_/algorithm/critical.hpp new file mode 100644 index 0000000..c781d28 --- /dev/null +++ b/myxpcs/include/taskflow_/algorithm/critical.hpp @@ -0,0 +1,78 @@ +#pragma once + +#include "../core/task.hpp" + +/** +@file critical.hpp +@brief critical include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// CriticalSection +// ---------------------------------------------------------------------------- + +/** +@class CriticalSection + +@brief class to create a critical region of limited workers to run tasks + +tf::CriticalSection is a warpper over tf::Semaphore and is specialized for +limiting the maximum concurrency over a set of tasks. +A critical section starts with an initial count representing that limit. +When a task is added to the critical section, +the task acquires and releases the semaphore internal to the critical section. +This design avoids explicit call of tf::Task::acquire and tf::Task::release. +The following example creates a critical section of one worker and adds +the five tasks to the critical section. + +@code{.cpp} +tf::Executor executor(8); // create an executor of 8 workers +tf::Taskflow taskflow; + +// create a critical section of 1 worker +tf::CriticalSection critical_section(1); + +tf::Task A = taskflow.emplace([](){ std::cout << "A" << std::endl; }); +tf::Task B = taskflow.emplace([](){ std::cout << "B" << std::endl; }); +tf::Task C = taskflow.emplace([](){ std::cout << "C" << std::endl; }); +tf::Task D = taskflow.emplace([](){ std::cout << "D" << std::endl; }); +tf::Task E = taskflow.emplace([](){ std::cout << "E" << std::endl; }); + +critical_section.add(A, B, C, D, E); + +executor.run(taskflow).wait(); +@endcode + +*/ +class CriticalSection : public Semaphore { + + public: + + /** + @brief constructs a critical region of a limited number of workers + */ + explicit CriticalSection(size_t max_workers = 1); + + /** + @brief adds a task into the critical region + */ + template <typename... Tasks> + void add(Tasks...tasks); +}; + +inline CriticalSection::CriticalSection(size_t max_workers) : + Semaphore {max_workers} { +} + +template <typename... Tasks> +void CriticalSection::add(Tasks... tasks) { + (tasks.acquire(*this), ...); + (tasks.release(*this), ...); +} + + +} // end of namespace tf. 
--------------------------------------------------- + + diff --git a/myxpcs/include/taskflow_/algorithm/data_pipeline.hpp b/myxpcs/include/taskflow_/algorithm/data_pipeline.hpp new file mode 100644 index 0000000..0393548 --- /dev/null +++ b/myxpcs/include/taskflow_/algorithm/data_pipeline.hpp @@ -0,0 +1,637 @@ +#pragma once + +#include "pipeline.hpp" + + +namespace tf { + +// ---------------------------------------------------------------------------- +// Class Definition: DataPipe +// ---------------------------------------------------------------------------- + +/** +@class DataPipe + +@brief class to create a stage in a data-parallel pipeline + +A data pipe represents a stage of a data-parallel pipeline. +A data pipe can be either @em parallel direction or @em serial direction +(specified by tf::PipeType) and is associated with a callable to invoke +by the pipeline scheduler. + +You need to use the template function, tf::make_data_pipe, to create +a data pipe. The input and output types of a tf::DataPipe should be decayed types +(though the library will always decay them for you using `std::decay`) +to allow internal storage to work. +The data will be passed by reference to your callable, at which you can take +it by copy or reference. + +@code{.cpp} +tf::make_data_pipe<int, std::string>( + tf::PipeType::SERIAL, + [](int& input) {return std::to_string(input + 100);} +); +@endcode + +In addition to the data, you callable can take an additional reference +of tf::Pipeflow in the second argument to probe the runtime information +for a stage task, such as its line number and token number: + +@code{.cpp} +tf::make_data_pipe<int, std::string>( + tf::PipeType::SERIAL, + [](int& input, tf::Pipeflow& pf) { + printf("token=%lu, line=%lu\n", pf.token(), pf.line()); + return std::to_string(input + 100); + } +); +@endcode + +*/ +template <typename Input, typename Output, typename C> +class DataPipe { + + template <typename... Ps> + friend class DataPipeline; + + public: + + /** + @brief callable type of the data pipe + */ + using callable_t = C; + + /** + @brief input type of the data pipe + */ + using input_t = Input; + + /** + @brief output type of the data pipe + */ + using output_t = Output; + + /** + @brief default constructor + */ + DataPipe() = default; + + /** + @brief constructs a data pipe + + You should use the helper function, tf::make_data_pipe, + to create a DataPipe object, especially when you need tf::DataPipe + to automatically deduct the lambda type. + */ + DataPipe(PipeType d, callable_t&& callable) : + _type{d}, _callable{std::forward<callable_t>(callable)} { + } + + /** + @brief queries the type of the data pipe + + A data pipe can be either parallel (tf::PipeType::PARALLEL) or serial + (tf::PipeType::SERIAL). + */ + PipeType type() const { + return _type; + } + + /** + @brief assigns a new type to the data pipe + */ + void type(PipeType type) { + _type = type; + } + + /** + @brief assigns a new callable to the data pipe + + @tparam U callable type + @param callable a callable object constructible from the callable type + of this data pipe + + Assigns a new callable to the pipe using universal forwarding. 
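+
+  A minimal sketch of reassigning the callable (an illustration, not part of
+  the library examples): it assumes the pipe is declared with a copy-assignable
+  callable type such as std::function so that the assignment is well-formed.
+
+  @code{.cpp}
+  tf::DataPipe<int, std::string, std::function<std::string(int&)>> pipe(
+    tf::PipeType::SERIAL,
+    [](int& input) { return std::to_string(input); }
+  );
+  // swap in another stage body before composing the pipeline
+  pipe.callable([](int& input) { return std::to_string(input + 100); });
+  @endcode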
+ */ + template <typename U> + void callable(U&& callable) { + _callable = std::forward<U>(callable); + } + + private: + + PipeType _type; + + callable_t _callable; +}; + +/** +@brief function to construct a data pipe (tf::DataPipe) + +@tparam Input input data type +@tparam Output output data type +@tparam C callable type + +tf::make_data_pipe is a helper function to create a data pipe (tf::DataPipe) +in a data-parallel pipeline (tf::DataPipeline). +The first argument specifies the direction of the data pipe, +either tf::PipeType::SERIAL or tf::PipeType::PARALLEL, +and the second argument is a callable to invoke by the pipeline scheduler. +Input and output data types are specified via template parameters, +which will always be decayed by the library to its original form +for storage purpose. +The callable must take the input data type in its first argument +and returns a value of the output data type. + +@code{.cpp} +tf::make_data_pipe<int, std::string>( + tf::PipeType::SERIAL, + [](int& input) { + return std::to_string(input + 100); + } +); +@endcode + +The callable can additionally take a reference of tf::Pipeflow, +which allows you to query the runtime information of a stage task, +such as its line number and token number. + +@code{.cpp} +tf::make_data_pipe<int, std::string>( + tf::PipeType::SERIAL, + [](int& input, tf::Pipeflow& pf) { + printf("token=%lu, line=%lu\n", pf.token(), pf.line()); + return std::to_string(input + 100); + } +); +@endcode + +*/ +template <typename Input, typename Output, typename C> +auto make_data_pipe(PipeType d, C&& callable) { + return DataPipe<Input, Output, C>(d, std::forward<C>(callable)); +} + +// ---------------------------------------------------------------------------- +// Class Definition: DataPipeline +// ---------------------------------------------------------------------------- + +/** +@class DataPipeline + +@brief class to create a data-parallel pipeline scheduling framework + +@tparam Ps data pipe types + +Similar to tf::Pipeline, a tf::DataPipeline is a composable graph object +for users to create a <i>data-parallel pipeline scheduling framework</i> +using a module task in a taskflow. +The only difference is that tf::DataPipeline provides a data abstraction +for users to quickly express dataflow in a pipeline. +The following example creates a data-parallel pipeline of three stages +that generate dataflow from `void` to `int`, `std::string`, `float`, and `void`. 
+ +@code{.cpp} +#include <taskflow/taskflow.hpp> +#include <taskflow/algorithm/data_pipeline.hpp> + +int main() { + + // data flow => void -> int -> std::string -> float -> void + tf::Taskflow taskflow("pipeline"); + tf::Executor executor; + + const size_t num_lines = 4; + + tf::DataPipeline pl(num_lines, + tf::make_data_pipe<void, int>(tf::PipeType::SERIAL, [&](tf::Pipeflow& pf) -> int{ + if(pf.token() == 5) { + pf.stop(); + return 0; + } + else { + return pf.token(); + } + }), + tf::make_data_pipe<int, std::string>(tf::PipeType::SERIAL, [](int& input) { + return std::to_string(input + 100); + }), + tf::make_data_pipe<std::string, void>(tf::PipeType::SERIAL, [](std::string& input) { + std::cout << input << std::endl; + }) + ); + + // build the pipeline graph using composition + taskflow.composed_of(pl).name("pipeline"); + + // dump the pipeline graph structure (with composition) + taskflow.dump(std::cout); + + // run the pipeline + executor.run(taskflow).wait(); + + return 0; +} +@endcode + +The pipeline schedules five tokens over four parallel lines in a circular fashion, +as depicted below: + +@code{.shell-session} +o -> o -> o +| | | +v v v +o -> o -> o +| | | +v v v +o -> o -> o +| | | +v v v +o -> o -> o +@endcode +*/ +template <typename... Ps> +class DataPipeline { + + static_assert(sizeof...(Ps)>0, "must have at least one pipe"); + + /** + @private + */ + struct Line { + std::atomic<size_t> join_counter; + }; + + /** + @private + */ + struct PipeMeta { + PipeType type; + }; + + + public: + + /** + @brief internal storage type for each data token (default std::variant) + */ + using data_t = unique_variant_t<std::variant<std::conditional_t< + std::is_void_v<typename Ps::output_t>, + std::monostate, + std::decay_t<typename Ps::output_t>>... + >>; + + /** + @brief constructs a data-parallel pipeline object + + @param num_lines the number of parallel lines + @param ps a list of pipes + + Constructs a data-parallel pipeline of up to @c num_lines parallel lines to schedule + tokens through the given linear chain of pipes. + The first pipe must define a serial direction (tf::PipeType::SERIAL) + or an exception will be thrown. + */ + DataPipeline(size_t num_lines, Ps&&... ps); + + /** + @brief constructs a data-parallel pipeline object + + @param num_lines the number of parallel lines + @param ps a tuple of pipes + + Constructs a data-parallel pipeline of up to @c num_lines parallel lines to schedule + tokens through the given linear chain of pipes stored in a std::tuple. + The first pipe must define a serial direction (tf::PipeType::SERIAL) + or an exception will be thrown. + */ + DataPipeline(size_t num_lines, std::tuple<Ps...>&& ps); + + /** + @brief queries the number of parallel lines + + The function returns the number of parallel lines given by the user + upon the construction of the pipeline. + The number of lines represents the maximum parallelism this pipeline + can achieve. + */ + size_t num_lines() const noexcept; + + /** + @brief queries the number of pipes + + The Function returns the number of pipes given by the user + upon the construction of the pipeline. + */ + constexpr size_t num_pipes() const noexcept; + + /** + @brief resets the pipeline + + Resetting the pipeline to the initial state. After resetting a pipeline, + its token identifier will start from zero as if the pipeline was just + constructed. 
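+
+  For example (a sketch that assumes the pipeline @c pl and the taskflow from
+  the class-level example above):
+
+  @code{.cpp}
+  executor.run(taskflow).wait();  // first run processes tokens 0 through 4
+  pl.reset();                     // token identifier restarts from zero
+  executor.run(taskflow).wait();  // second run behaves like the first one
+  @endcode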
+ */ + void reset(); + + /** + @brief queries the number of generated tokens in the pipeline + + The number represents the total scheduling tokens that has been + generated by the pipeline so far. + */ + size_t num_tokens() const noexcept; + + /** + @brief obtains the graph object associated with the pipeline construct + + This method is primarily used as an opaque data structure for creating + a module task of this pipeline. + */ + Graph& graph(); + + private: + + Graph _graph; + + size_t _num_tokens; + + std::tuple<Ps...> _pipes; + std::array<PipeMeta, sizeof...(Ps)> _meta; + std::vector<std::array<Line, sizeof...(Ps)>> _lines; + std::vector<Task> _tasks; + std::vector<Pipeflow> _pipeflows; + std::vector<CachelineAligned<data_t>> _buffer; + + template <size_t... I> + auto _gen_meta(std::tuple<Ps...>&&, std::index_sequence<I...>); + + void _on_pipe(Pipeflow&, Runtime&); + void _build(); +}; + +// constructor +template <typename... Ps> +DataPipeline<Ps...>::DataPipeline(size_t num_lines, Ps&&... ps) : + _pipes {std::make_tuple(std::forward<Ps>(ps)...)}, + _meta {PipeMeta{ps.type()}...}, + _lines (num_lines), + _tasks (num_lines + 1), + _pipeflows (num_lines), + _buffer (num_lines) { + + if(num_lines == 0) { + TF_THROW("must have at least one line"); + } + + if(std::get<0>(_pipes).type() != PipeType::SERIAL) { + TF_THROW("first pipe must be serial"); + } + + reset(); + _build(); +} + +// constructor +template <typename... Ps> +DataPipeline<Ps...>::DataPipeline(size_t num_lines, std::tuple<Ps...>&& ps) : + _pipes {std::forward<std::tuple<Ps...>>(ps)}, + _meta {_gen_meta( + std::forward<std::tuple<Ps...>>(ps), std::make_index_sequence<sizeof...(Ps)>{} + )}, + _lines (num_lines), + _tasks (num_lines + 1), + _pipeflows (num_lines), + _buffer (num_lines) { + + if(num_lines == 0) { + TF_THROW("must have at least one line"); + } + + if(std::get<0>(_pipes).type() != PipeType::SERIAL) { + TF_THROW("first pipe must be serial"); + } + + reset(); + _build(); +} + +// Function: _get_meta +template <typename... Ps> +template <size_t... I> +auto DataPipeline<Ps...>::_gen_meta(std::tuple<Ps...>&& ps, std::index_sequence<I...>) { + return std::array{PipeMeta{std::get<I>(ps).type()}...}; +} + +// Function: num_lines +template <typename... Ps> +size_t DataPipeline<Ps...>::num_lines() const noexcept { + return _pipeflows.size(); +} + +// Function: num_pipes +template <typename... Ps> +constexpr size_t DataPipeline<Ps...>::num_pipes() const noexcept { + return sizeof...(Ps); +} + +// Function: num_tokens +template <typename... Ps> +size_t DataPipeline<Ps...>::num_tokens() const noexcept { + return _num_tokens; +} + +// Function: graph +template <typename... Ps> +Graph& DataPipeline<Ps...>::graph() { + return _graph; +} + +// Function: reset +template <typename... Ps> +void DataPipeline<Ps...>::reset() { + + _num_tokens = 0; + + for(size_t l = 0; l<num_lines(); l++) { + _pipeflows[l]._pipe = 0; + _pipeflows[l]._line = l; + } + + _lines[0][0].join_counter.store(0, std::memory_order_relaxed); + + for(size_t l=1; l<num_lines(); l++) { + for(size_t f=1; f<num_pipes(); f++) { + _lines[l][f].join_counter.store( + static_cast<size_t>(_meta[f].type), std::memory_order_relaxed + ); + } + } + + for(size_t f=1; f<num_pipes(); f++) { + _lines[0][f].join_counter.store(1, std::memory_order_relaxed); + } + + for(size_t l=1; l<num_lines(); l++) { + _lines[l][0].join_counter.store( + static_cast<size_t>(_meta[0].type) - 1, std::memory_order_relaxed + ); + } +} + +// Procedure: _on_pipe +template <typename... 
Ps> +void DataPipeline<Ps...>::_on_pipe(Pipeflow& pf, Runtime&) { + + visit_tuple([&](auto&& pipe){ + + using data_pipe_t = std::decay_t<decltype(pipe)>; + using callable_t = typename data_pipe_t::callable_t; + using input_t = std::decay_t<typename data_pipe_t::input_t>; + using output_t = std::decay_t<typename data_pipe_t::output_t>; + + // first pipe + if constexpr (std::is_invocable_v<callable_t, Pipeflow&>) { + // [](tf::Pipeflow&) -> void {}, i.e., we only have one pipe + if constexpr (std::is_void_v<output_t>) { + pipe._callable(pf); + // [](tf::Pipeflow&) -> output_t {} + } else { + _buffer[pf._line].data = pipe._callable(pf); + } + } + // other pipes without pipeflow in the second argument + else if constexpr (std::is_invocable_v<callable_t, std::add_lvalue_reference_t<input_t> >) { + // [](input_t&) -> void {}, i.e., the last pipe + if constexpr (std::is_void_v<output_t>) { + pipe._callable(std::get<input_t>(_buffer[pf._line].data)); + // [](input_t&) -> output_t {} + } else { + _buffer[pf._line].data = pipe._callable( + std::get<input_t>(_buffer[pf._line].data) + ); + } + } + // other pipes with pipeflow in the second argument + else if constexpr (std::is_invocable_v<callable_t, input_t&, Pipeflow&>) { + // [](input_t&, tf::Pipeflow&) -> void {} + if constexpr (std::is_void_v<output_t>) { + pipe._callable(std::get<input_t>(_buffer[pf._line].data), pf); + // [](input_t&, tf::Pipeflow&) -> output_t {} + } else { + _buffer[pf._line].data = pipe._callable( + std::get<input_t>(_buffer[pf._line].data), pf + ); + } + } + //else if constexpr(std::is_invocable_v<callable_t, Pipeflow&, Runtime&>) { + // pipe._callable(pf, rt); + //} + else { + static_assert(dependent_false_v<callable_t>, "un-supported pipe callable type"); + } + }, _pipes, pf._pipe); +} + +// Procedure: _build +template <typename... 
Ps> +void DataPipeline<Ps...>::_build() { + + using namespace std::literals::string_literals; + + FlowBuilder fb(_graph); + + // init task + _tasks[0] = fb.emplace([this]() { + return static_cast<int>(_num_tokens % num_lines()); + }).name("cond"); + + // line task + for(size_t l = 0; l < num_lines(); l++) { + + _tasks[l + 1] = fb.emplace([this, l] (tf::Runtime& rt) mutable { + + auto pf = &_pipeflows[l]; + + pipeline: + + _lines[pf->_line][pf->_pipe].join_counter.store( + static_cast<size_t>(_meta[pf->_pipe].type), std::memory_order_relaxed + ); + + if (pf->_pipe == 0) { + pf->_token = _num_tokens; + if (pf->_stop = false, _on_pipe(*pf, rt); pf->_stop == true) { + // here, the pipeline is not stopped yet because other + // lines of tasks may still be running their last stages + return; + } + ++_num_tokens; + } + else { + _on_pipe(*pf, rt); + } + + size_t c_f = pf->_pipe; + size_t n_f = (pf->_pipe + 1) % num_pipes(); + size_t n_l = (pf->_line + 1) % num_lines(); + + pf->_pipe = n_f; + + // ---- scheduling starts here ---- + // Notice that the shared variable f must not be changed after this + // point because it can result in data race due to the following + // condition: + // + // a -> b + // | | + // v v + // c -> d + // + // d will be spawned by either c or b, so if c changes f but b spawns d + // then data race on f will happen + + std::array<int, 2> retval; + size_t n = 0; + + // downward dependency + if(_meta[c_f].type == PipeType::SERIAL && + _lines[n_l][c_f].join_counter.fetch_sub( + 1, std::memory_order_acq_rel) == 1 + ) { + retval[n++] = 1; + } + + // forward dependency + if(_lines[pf->_line][n_f].join_counter.fetch_sub( + 1, std::memory_order_acq_rel) == 1 + ) { + retval[n++] = 0; + } + + // notice that the task index starts from 1 + switch(n) { + case 2: { + rt.schedule(_tasks[n_l+1]); + goto pipeline; + } + case 1: { + if (retval[0] == 1) { + pf = &_pipeflows[n_l]; + } + goto pipeline; + } + } + }).name("rt-"s + std::to_string(l)); + + _tasks[0].precede(_tasks[l+1]); + } +} + + +} // end of namespace tf ----------------------------------------------------- + + + + + diff --git a/myxpcs/include/taskflow_/algorithm/find.hpp b/myxpcs/include/taskflow_/algorithm/find.hpp new file mode 100644 index 0000000..5a52876 --- /dev/null +++ b/myxpcs/include/taskflow_/algorithm/find.hpp @@ -0,0 +1,551 @@ +#pragma once + +#include "launch.hpp" + +namespace tf { + +namespace detail { + +// Function: find_if_loop +template <typename Iterator, typename Predicate> +TF_FORCE_INLINE bool find_if_loop( + std::atomic<size_t>& offset, + Iterator& beg, + size_t& prev_e, + size_t curr_b, + size_t curr_e, + Predicate&& predicate +) { + // early prune + if(offset.load(std::memory_order_relaxed) < curr_b) { + return true; + } + std::advance(beg, curr_b - prev_e); + for(size_t x = curr_b; x<curr_e; x++) { + if(predicate(*beg++)) { + atomic_min(offset, x); + return true; + } + } + prev_e = curr_e; + return false; +} + +// Function: find_if_not_loop +template <typename Iterator, typename Predicate> +TF_FORCE_INLINE bool find_if_not_loop( + std::atomic<size_t>& offset, + Iterator& beg, + size_t& prev_e, + size_t curr_b, + size_t curr_e, + Predicate&& predicate +) { + + // early prune + if(offset.load(std::memory_order_relaxed) < curr_b) { + return true; + } + std::advance(beg, curr_b - prev_e); + for(size_t x = curr_b; x<curr_e; x++) { + if(!predicate(*beg++)) { + atomic_min(offset, x); + return true; + } + } + prev_e = curr_e; + return false; +} + +} // namespace detail 
-------------------------------------------------------- + +// Function: make_find_if_task +template <typename B, typename E, typename T, typename UOP, typename P = GuidedPartitioner> +TF_FORCE_INLINE auto make_find_if_task( + B first, E last, T& result, UOP predicate, P&& part = P() +) { + + using B_t = std::decay_t<unwrap_ref_decay_t<B>>; + using E_t = std::decay_t<unwrap_ref_decay_t<E>>; + using namespace std::string_literals; + + return + [b=first, e=last, predicate, &result, part=std::forward<P>(part)] + (Runtime& rt) mutable { + + // fetch the stateful values + B_t beg = b; + E_t end = e; + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + result = std::find_if(beg, end, predicate); + return; + } + + if(N < W) { + W = N; + } + + std::atomic<size_t> offset(N); + + // static partitioner + if constexpr(std::is_same_v<std::decay_t<P>, StaticPartitioner>) { + + size_t chunk_size; + + for(size_t w=0, curr_b=0; w<W && curr_b < N; ++w, curr_b += chunk_size) { + + chunk_size = part.adjusted_chunk_size(N, W, w); + + launch_loop(W, w, rt, + [N, W, curr_b, chunk_size, beg, &predicate, &offset, &part] + () mutable { + part.loop_until(N, W, curr_b, chunk_size, + [&, prev_e=size_t{0}](size_t part_b, size_t part_e) mutable { + return detail::find_if_loop( + offset, beg, prev_e, part_b, part_e, predicate + ); + } + ); + } + ); + } + + rt.corun_all(); + } + // dynamic partitioner + else { + std::atomic<size_t> next(0); + launch_loop(N, W, rt, next, part, + [N, W, beg, &predicate, &offset, &next, &part] () mutable { + part.loop_until(N, W, next, + [&, prev_e=size_t{0}](size_t curr_b, size_t curr_e) mutable { + return detail::find_if_loop( + offset, beg, prev_e, curr_b, curr_e, predicate + ); + } + ); + } + ); + } + + // update the result iterator by the offset + result = std::next(beg, offset.load(std::memory_order_relaxed)); + }; +} + +// Function: make_find_if_not_task +template <typename B, typename E, typename T, typename UOP, typename P = GuidedPartitioner> +TF_FORCE_INLINE auto make_find_if_not_task( + B first, E last, T& result, UOP predicate, P&& part = P() +) { + + using B_t = std::decay_t<unwrap_ref_decay_t<B>>; + using E_t = std::decay_t<unwrap_ref_decay_t<E>>; + using namespace std::string_literals; + + return + [b=first, e=last, predicate, &result, part=std::forward<P>(part)] + (Runtime& rt) mutable { + + // fetch the stateful values + B_t beg = b; + E_t end = e; + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + result = std::find_if_not(beg, end, predicate); + return; + } + + if(N < W) { + W = N; + } + + std::atomic<size_t> offset(N); + + // static partitioner + if constexpr(std::is_same_v<std::decay_t<P>, StaticPartitioner>) { + + size_t chunk_size; + + for(size_t w=0, curr_b=0; w<W && curr_b < N; ++w, curr_b += chunk_size) { + + chunk_size = part.adjusted_chunk_size(N, W, w); + + launch_loop(W, w, rt, + [N, W, curr_b, chunk_size, beg, &predicate, &offset, &part] () mutable { + part.loop_until(N, W, curr_b, chunk_size, + [&, prev_e=size_t{0}](size_t part_b, size_t part_e) mutable { + return detail::find_if_not_loop( + offset, beg, prev_e, part_b, part_e, predicate + ); + } + ); + } + ); + } + + rt.corun_all(); + } + // dynamic partitioner + else { + std::atomic<size_t> next(0); + launch_loop(N, W, rt, next, part, + [N, W, beg, &predicate, 
&offset, &next, &part] () mutable { + part.loop_until(N, W, next, + [&, prev_e=size_t{0}](size_t curr_b, size_t curr_e) mutable { + return detail::find_if_not_loop( + offset, beg, prev_e, curr_b, curr_e, predicate + ); + } + ); + } + ); + } + + // update the result iterator by the offset + result = std::next(beg, offset.load(std::memory_order_relaxed)); + }; +} + +// Function: make_min_element_task +template <typename B, typename E, typename T, typename C, typename P = GuidedPartitioner> +TF_FORCE_INLINE auto make_min_element_task( + B first, E last, T& result, C comp, P&& part = P() +) { + + using B_t = std::decay_t<unwrap_ref_decay_t<B>>; + using E_t = std::decay_t<unwrap_ref_decay_t<E>>; + using namespace std::string_literals; + + return + [b=first, e=last, &result, comp, part=std::forward<P>(part)] + (Runtime& rt) mutable { + + // fetch the iterator values + B_t beg = b; + E_t end = e; + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + result = std::min_element(beg, end, comp); + return; + } + + if(N < W) { + W = N; + } + + std::mutex mutex; + + // initialize the result to the first element + result = beg++; + N--; + + // static partitioner + if constexpr(std::is_same_v<std::decay_t<P>, StaticPartitioner>) { + + size_t chunk_size; + + for(size_t w=0, curr_b=0; w<W && curr_b < N; ++w, curr_b += chunk_size) { + + // we force chunk size to be at least two because the temporary + // variable sum needs to avoid copy at the first step + chunk_size = std::max(size_t{2}, part.adjusted_chunk_size(N, W, w)); + + launch_loop(W, w, rt, + [beg, curr_b, N, W, chunk_size, &comp, &mutex, &result, &part] () mutable { + + std::advance(beg, curr_b); + + if(N - curr_b == 1) { + std::lock_guard<std::mutex> lock(mutex); + if(comp(*beg, *result)) { + result = beg; + } + return; + } + + auto beg1 = beg++; + auto beg2 = beg++; + T smallest = comp(*beg1, *beg2) ? beg1 : beg2; + + // loop reduce + part.loop(N, W, curr_b, chunk_size, + [&, prev_e=curr_b+2](size_t part_b, size_t part_e) mutable { + + if(part_b > prev_e) { + std::advance(beg, part_b - prev_e); + } + else { + part_b = prev_e; + } + + for(size_t x=part_b; x<part_e; x++, beg++) { + if(comp(*beg, *smallest)) { + smallest = beg; + } + } + prev_e = part_e; + } + ); + + // final reduce + std::lock_guard<std::mutex> lock(mutex); + if(comp(*smallest, *result)) { + result = smallest; + } + }); + } + rt.corun_all(); + } + // dynamic partitioner + else { + std::atomic<size_t> next(0); + launch_loop(N, W, rt, next, part, + [beg, N, W, &next, &comp, &mutex, &result, &part] () mutable { + // pre-reduce + size_t s0 = next.fetch_add(2, std::memory_order_relaxed); + + if(s0 >= N) { + return; + } + + std::advance(beg, s0); + + if(N - s0 == 1) { + std::lock_guard<std::mutex> lock(mutex); + if(comp(*beg, *result)) { + result = beg; + } + return; + } + + auto beg1 = beg++; + auto beg2 = beg++; + + T smallest = comp(*beg1, *beg2) ? 
beg1 : beg2; + + // loop reduce + part.loop(N, W, next, + [&, prev_e=s0+2](size_t part_b, size_t part_e) mutable { + std::advance(beg, part_b - prev_e); + for(size_t x=part_b; x<part_e; x++, beg++) { + if(comp(*beg, *smallest)) { + smallest = beg; + } + } + prev_e = part_e; + } + ); + + // final reduce + std::lock_guard<std::mutex> lock(mutex); + if(comp(*smallest, *result)) { + result = smallest; + } + } + ); + } + }; +} + +// Function: make_max_element_task +template <typename B, typename E, typename T, typename C, typename P = GuidedPartitioner> +TF_FORCE_INLINE auto make_max_element_task( + B first, E last, T& result, C comp, P&& part = P() +) { + + using B_t = std::decay_t<unwrap_ref_decay_t<B>>; + using E_t = std::decay_t<unwrap_ref_decay_t<E>>; + using namespace std::string_literals; + + return + [b=first, e=last, &result, comp, part=std::forward<P>(part)] + (Runtime& rt) mutable { + + // fetch the iterator values + B_t beg = b; + E_t end = e; + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + result = std::max_element(beg, end, comp); + return; + } + + if(N < W) { + W = N; + } + + std::mutex mutex; + + // initialize the result to the first element + result = beg++; + N--; + + // static partitioner + if constexpr(std::is_same_v<std::decay_t<P>, StaticPartitioner>) { + + size_t chunk_size; + + for(size_t w=0, curr_b=0; w<W && curr_b < N; ++w, curr_b += chunk_size) { + + // we force chunk size to be at least two because the temporary + // variable sum needs to avoid copy at the first step + chunk_size = std::max(size_t{2}, part.adjusted_chunk_size(N, W, w)); + + launch_loop(W, w, rt, + [beg, curr_b, N, W, chunk_size, &comp, &mutex, &result, &part] () mutable { + + std::advance(beg, curr_b); + + if(N - curr_b == 1) { + std::lock_guard<std::mutex> lock(mutex); + if(comp(*result, *beg)) { + result = beg; + } + return; + } + + auto beg1 = beg++; + auto beg2 = beg++; + T largest = comp(*beg1, *beg2) ? beg2 : beg1; + + // loop reduce + part.loop(N, W, curr_b, chunk_size, + [&, prev_e=curr_b+2](size_t part_b, size_t part_e) mutable { + + if(part_b > prev_e) { + std::advance(beg, part_b - prev_e); + } + else { + part_b = prev_e; + } + + for(size_t x=part_b; x<part_e; x++, beg++) { + if(comp(*largest, *beg)) { + largest = beg; + } + } + prev_e = part_e; + } + ); + + // final reduce + std::lock_guard<std::mutex> lock(mutex); + if(comp(*result, *largest)) { + result = largest; + } + }); + } + rt.corun_all(); + } + // dynamic partitioner + else { + std::atomic<size_t> next(0); + launch_loop(N, W, rt, next, part, + [beg, N, W, &next, &comp, &mutex, &result, &part] () mutable { + // pre-reduce + size_t s0 = next.fetch_add(2, std::memory_order_relaxed); + + if(s0 >= N) { + return; + } + + std::advance(beg, s0); + + if(N - s0 == 1) { + std::lock_guard<std::mutex> lock(mutex); + if(comp(*result, *beg)) { + result = beg; + } + return; + } + + auto beg1 = beg++; + auto beg2 = beg++; + + T largest = comp(*beg1, *beg2) ? 
beg2 : beg1; + + // loop reduce + part.loop(N, W, next, + [&, prev_e=s0+2](size_t part_b, size_t part_e) mutable { + std::advance(beg, part_b - prev_e); + for(size_t x=part_b; x<part_e; x++, beg++) { + if(comp(*largest, *beg)) { + largest = beg; + } + } + prev_e = part_e; + } + ); + + // final reduce + std::lock_guard<std::mutex> lock(mutex); + if(comp(*result, *largest)) { + result = largest; + } + } + ); + } + }; +} + + + +// Function: find_if +template <typename B, typename E, typename T, typename UOP, typename P> +Task tf::FlowBuilder::find_if(B first, E last, T& result, UOP predicate, P&& part) { + return emplace(make_find_if_task( + first, last, result, predicate, std::forward<P>(part) + )); +} + +// Function: find_if_not +template <typename B, typename E, typename T, typename UOP, typename P> +Task tf::FlowBuilder::find_if_not(B first, E last, T& result, UOP predicate, P&& part) { + return emplace(make_find_if_not_task( + first, last, result, predicate, std::forward<P>(part) + )); +} + +// ---------------------------------------------------------------------------- +// min_element +// ---------------------------------------------------------------------------- + +// Function: min_element +template <typename B, typename E, typename T, typename C, typename P> +Task FlowBuilder::min_element(B first, E last, T& result, C comp, P&& part) { + return emplace(make_min_element_task( + first, last, result, comp, std::forward<P>(part) + )); +} + +// ---------------------------------------------------------------------------- +// max_element +// ---------------------------------------------------------------------------- + +// Function: max_element +template <typename B, typename E, typename T, typename C, typename P> +Task FlowBuilder::max_element(B first, E last, T& result, C comp, P&& part) { + return emplace(make_max_element_task( + first, last, result, comp, std::forward<P>(part) + )); +} + +} // end of namespace tf ----------------------------------------------------- diff --git a/myxpcs/include/taskflow_/algorithm/for_each.hpp b/myxpcs/include/taskflow_/algorithm/for_each.hpp new file mode 100644 index 0000000..10e0a78 --- /dev/null +++ b/myxpcs/include/taskflow_/algorithm/for_each.hpp @@ -0,0 +1,171 @@ +#pragma once + +#include "launch.hpp" + +namespace tf { + +// Function: make_for_each_task +template <typename B, typename E, typename C, typename P = GuidedPartitioner> +TF_FORCE_INLINE auto make_for_each_task(B b, E e, C c, P&& part = P()) { + + using B_t = std::decay_t<unwrap_ref_decay_t<B>>; + using E_t = std::decay_t<unwrap_ref_decay_t<E>>; + using namespace std::string_literals; + + return [b, e, c, part=std::forward<P>(part)] (Runtime& rt) mutable { + + // fetch the stateful values + B_t beg = b; + E_t end = e; + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + std::for_each(beg, end, c); + return; + } + + if(N < W) { + W = N; + } + + // static partitioner + if constexpr(std::is_same_v<std::decay_t<P>, StaticPartitioner>) { + size_t chunk_size; + for(size_t w=0, curr_b=0; w<W && curr_b < N; ++w, curr_b += chunk_size) { + chunk_size = part.adjusted_chunk_size(N, W, w); + launch_loop(W, w, rt, [=, &c, &part] () mutable { + part.loop(N, W, curr_b, chunk_size, + [&, prev_e=size_t{0}](size_t part_b, size_t part_e) mutable { + std::advance(beg, part_b - prev_e); + for(size_t x = part_b; x<part_e; x++) { + c(*beg++); + } + prev_e = part_e; + } + ); + }); + } + + 
rt.corun_all(); + } + // dynamic partitioner + else { + std::atomic<size_t> next(0); + launch_loop(N, W, rt, next, part, [=, &c, &next, &part] () mutable { + part.loop(N, W, next, + [&, prev_e=size_t{0}](size_t part_b, size_t part_e) mutable { + std::advance(beg, part_b - prev_e); + for(size_t x = part_b; x<part_e; x++) { + c(*beg++); + } + prev_e = part_e; + } + ); + }); + } + }; +} + +// Function: make_for_each_index_task +template <typename B, typename E, typename S, typename C, typename P = GuidedPartitioner> +TF_FORCE_INLINE auto make_for_each_index_task(B b, E e, S s, C c, P&& part = P()) { + + using namespace std::string_literals; + + using B_t = std::decay_t<unwrap_ref_decay_t<B>>; + using E_t = std::decay_t<unwrap_ref_decay_t<E>>; + using S_t = std::decay_t<unwrap_ref_decay_t<S>>; + + return [b, e, s, c, part=std::forward<P>(part)] (Runtime& rt) mutable { + + // fetch the iterator values + B_t beg = b; + E_t end = e; + S_t inc = s; + + // nothing to be done if the range is invalid + if(is_range_invalid(beg, end, inc)) { + return; + } + + size_t W = rt.executor().num_workers(); + size_t N = distance(beg, end, inc); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + for(size_t x=0; x<N; x++, beg+=inc) { + c(beg); + } + return; + } + + if(N < W) { + W = N; + } + + // static partitioner + if constexpr(std::is_same_v<std::decay_t<P>, StaticPartitioner>) { + size_t chunk_size; + for(size_t w=0, curr_b=0; w<W && curr_b < N; ++w, curr_b += chunk_size) { + chunk_size = part.adjusted_chunk_size(N, W, w); + launch_loop(W, w, rt, [=, &c, &part] () mutable { + part.loop(N, W, curr_b, chunk_size, + [&](size_t part_b, size_t part_e) { + auto idx = static_cast<B_t>(part_b) * inc + beg; + for(size_t x=part_b; x<part_e; x++, idx += inc) { + c(idx); + } + } + ); + }); + } + + rt.corun_all(); + } + // dynamic partitioner + else { + std::atomic<size_t> next(0); + launch_loop(N, W, rt, next, part, [=, &c, &next, &part] () mutable { + part.loop(N, W, next, + [&](size_t part_b, size_t part_e) { + auto idx = static_cast<B_t>(part_b) * inc + beg; + for(size_t x=part_b; x<part_e; x++, idx += inc) { + c(idx); + } + } + ); + }); + } + }; +} + +// ---------------------------------------------------------------------------- +// for_each +// ---------------------------------------------------------------------------- + +// Function: for_each +template <typename B, typename E, typename C, typename P> +Task FlowBuilder::for_each(B beg, E end, C c, P&& part) { + return emplace( + make_for_each_task(beg, end, c, std::forward<P>(part)) + ); +} + +// ---------------------------------------------------------------------------- +// for_each_index +// ---------------------------------------------------------------------------- + +// Function: for_each_index +template <typename B, typename E, typename S, typename C, typename P> +Task FlowBuilder::for_each_index(B beg, E end, S inc, C c, P&& part) { + return emplace( + make_for_each_index_task(beg, end, inc, c, std::forward<P>(part)) + ); +} + + +} // end of namespace tf ----------------------------------------------------- + diff --git a/myxpcs/include/taskflow_/algorithm/launch.hpp b/myxpcs/include/taskflow_/algorithm/launch.hpp new file mode 100644 index 0000000..3598fd5 --- /dev/null +++ b/myxpcs/include/taskflow_/algorithm/launch.hpp @@ -0,0 +1,58 @@ +#pragma once + +#include "../core/async.hpp" + +namespace tf { + +// Function: launch_loop +template <typename P, typename Loop> +TF_FORCE_INLINE void launch_loop( + size_t N, 
+ size_t W, + Runtime& rt, + std::atomic<size_t>& next, + P&& part, + Loop&& loop +) { + + //static_assert(std::is_lvalue_reference_v<Loop>, ""); + + using namespace std::string_literals; + + for(size_t w=0; w<W; w++) { + auto r = N - next.load(std::memory_order_relaxed); + // no more loop work to do - finished by previous async tasks + if(!r) { + break; + } + // tail optimization + if(r <= part.chunk_size() || w == W-1) { + loop(); + break; + } + else { + rt.silent_async_unchecked("loop-"s + std::to_string(w), loop); + } + } + + rt.corun_all(); +} + +// Function: launch_loop +template <typename Loop> +TF_FORCE_INLINE void launch_loop( + size_t W, + size_t w, + Runtime& rt, + Loop&& loop +) { + using namespace std::string_literals; + if(w == W-1) { + loop(); + } + else { + rt.silent_async_unchecked("loop-"s + std::to_string(w), loop); + } +} + +} // end of namespace tf ----------------------------------------------------- diff --git a/myxpcs/include/taskflow_/algorithm/partitioner.hpp b/myxpcs/include/taskflow_/algorithm/partitioner.hpp new file mode 100644 index 0000000..4a253fa --- /dev/null +++ b/myxpcs/include/taskflow_/algorithm/partitioner.hpp @@ -0,0 +1,543 @@ +// reference: +// - gomp: https://github.com/gcc-mirror/gcc/blob/master/libgomp/iter.c +// - komp: https://github.com/llvm-mirror/openmp/blob/master/runtime/src/kmp_dispatch.cpp + +#pragma once + +/** +@file partitioner.hpp +@brief partitioner include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// Partitioner Base +// ---------------------------------------------------------------------------- + +/** +@class PartitionerBase + +@brief class to derive a partitioner for scheduling parallel algorithms + +The class provides base methods to derive a partitioner that can be used +to schedule parallel iterations (e.g., tf::Taskflow::for_each). + +An partitioner defines the scheduling method for running parallel algorithms, +such tf::Taskflow::for_each, tf::Taskflow::reduce, and so on. +By default, we provide the following partitioners: + ++ tf::GuidedPartitioner to enable guided scheduling algorithm of adaptive chunk size ++ tf::DynamicPartitioner to enable dynamic scheduling algorithm of equal chunk size ++ tf::StaticPartitioner to enable static scheduling algorithm of static chunk size ++ tf::RandomPartitioner to enable random scheduling algorithm of random chunk size + +Depending on applications, partitioning algorithms can impact the performance +a lot. +For example, if a parallel-iteration workload contains a regular work unit per +iteration, tf::StaticPartitioner can deliver the best performance. +On the other hand, if the work unit per iteration is irregular and unbalanced, +tf::GuidedPartitioner or tf::DynamicPartitioner can outperform tf::StaticPartitioner. +In most situations, tf::GuidedPartitioner can deliver decent performance and +is thus used as our default partitioner. 
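+
+The following sketch (an illustration that assumes a taskflow and an executor
+as in the other examples) runs two dependent parallel iterations, one with a
+static partitioner of chunk size 64 and one with the default guided partitioner:
+
+@code{.cpp}
+std::vector<int> data(1000000);
+
+tf::Task init = taskflow.for_each(
+  data.begin(), data.end(), [](int& i){ i = 0; }, tf::StaticPartitioner(64)
+);
+tf::Task incr = taskflow.for_each(
+  data.begin(), data.end(), [](int& i){ i += 1; }, tf::GuidedPartitioner()
+);
+init.precede(incr);
+
+executor.run(taskflow).wait();
+@endcode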
+*/ +class PartitionerBase { + + public: + + /** + @brief default constructor + */ + PartitionerBase() = default; + + /** + @brief construct a partitioner with the given chunk size + */ + explicit PartitionerBase(size_t chunk_size) : _chunk_size {chunk_size} {} + + /** + @brief query the chunk size of this partitioner + */ + size_t chunk_size() const { return _chunk_size; } + + /** + @brief update the chunk size of this partitioner + */ + void chunk_size(size_t cz) { _chunk_size = cz; } + + protected: + + /** + @brief chunk size + */ + size_t _chunk_size{0}; +}; + +// ---------------------------------------------------------------------------- +// Guided Partitioner +// ---------------------------------------------------------------------------- + +/** +@class GuidedPartitioner + +@brief class to construct a guided partitioner for scheduling parallel algorithms + +The size of a partition is proportional to the number of unassigned iterations +divided by the number of workers, +and the size will gradually decrease to the given chunk size. +The last partition may be smaller than the chunk size. +*/ +class GuidedPartitioner : public PartitionerBase { + + public: + + /** + @brief default constructor + */ + GuidedPartitioner() : PartitionerBase{1} {} + + /** + @brief construct a guided partitioner with the given chunk size + */ + explicit GuidedPartitioner(size_t sz) : PartitionerBase (sz) {} + + // -------------------------------------------------------------------------- + // scheduling methods + // -------------------------------------------------------------------------- + + /** + @private + */ + template <typename F, + std::enable_if_t<std::is_invocable_r_v<void, F, size_t, size_t>, void>* = nullptr + > + void loop( + size_t N, + size_t W, + std::atomic<size_t>& next, + F&& func + ) const { + + size_t chunk_size = (_chunk_size == 0) ? size_t{1} : _chunk_size; + + size_t p1 = 2 * W * (chunk_size + 1); + float p2 = 0.5f / static_cast<float>(W); + size_t curr_b = next.load(std::memory_order_relaxed); + + while(curr_b < N) { + + size_t r = N - curr_b; + + // fine-grained + if(r < p1) { + while(1) { + curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); + if(curr_b >= N) { + return; + } + func(curr_b, std::min(curr_b + chunk_size, N)); + } + break; + } + // coarse-grained + else { + size_t q = static_cast<size_t>(p2 * r); + if(q < chunk_size) { + q = chunk_size; + } + //size_t curr_e = (q <= r) ? curr_b + q : N; + size_t curr_e = std::min(curr_b + q, N); + if(next.compare_exchange_strong(curr_b, curr_e, std::memory_order_relaxed, + std::memory_order_relaxed)) { + func(curr_b, curr_e); + curr_b = next.load(std::memory_order_relaxed); + } + } + } + } + + /** + @private + */ + template <typename F, + std::enable_if_t<std::is_invocable_r_v<bool, F, size_t, size_t>, void>* = nullptr + > + void loop_until( + size_t N, + size_t W, + std::atomic<size_t>& next, + F&& func + ) const { + + size_t chunk_size = (_chunk_size == 0) ? 
size_t{1} : _chunk_size;
+
+    size_t p1 = 2 * W * (chunk_size + 1);
+    float p2 = 0.5f / static_cast<float>(W);
+    size_t curr_b = next.load(std::memory_order_relaxed);
+
+    while(curr_b < N) {
+
+      size_t r = N - curr_b;
+
+      // fine-grained
+      if(r < p1) {
+        while(1) {
+          curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed);
+          if(curr_b >= N) {
+            return;
+          }
+          if(func(curr_b, std::min(curr_b + chunk_size, N))) {
+            return;
+          }
+        }
+        break;
+      }
+      // coarse-grained
+      else {
+        size_t q = static_cast<size_t>(p2 * r);
+        if(q < chunk_size) {
+          q = chunk_size;
+        }
+        //size_t curr_e = (q <= r) ? curr_b + q : N;
+        size_t curr_e = std::min(curr_b + q, N);
+        if(next.compare_exchange_strong(curr_b, curr_e, std::memory_order_relaxed,
+                                        std::memory_order_relaxed)) {
+          if(func(curr_b, curr_e)) {
+            return;
+          }
+          curr_b = next.load(std::memory_order_relaxed);
+        }
+      }
+    }
+  }
+};
+
+// ----------------------------------------------------------------------------
+// Dynamic Partitioner
+// ----------------------------------------------------------------------------
+
+/**
+@class DynamicPartitioner
+
+@brief class to construct a dynamic partitioner for scheduling parallel algorithms
+
+The partitioner splits iterations into many partitions each of size equal to
+the given chunk size.
+Different partitions are distributed dynamically to workers
+without any specific order.
+*/
+class DynamicPartitioner : public PartitionerBase {
+
+  public:
+
+  /**
+  @brief default constructor
+  */
+  DynamicPartitioner() : PartitionerBase{1} {}
+
+  /**
+  @brief construct a dynamic partitioner with the given chunk size
+  */
+  explicit DynamicPartitioner(size_t sz) : PartitionerBase (sz) {}
+
+  // --------------------------------------------------------------------------
+  // scheduling methods
+  // --------------------------------------------------------------------------
+
+  /**
+  @private
+  */
+  template <typename F,
+    std::enable_if_t<std::is_invocable_r_v<void, F, size_t, size_t>, void>* = nullptr
+  >
+  void loop(
+    size_t N,
+    size_t,
+    std::atomic<size_t>& next,
+    F&& func
+  ) const {
+
+    size_t chunk_size = (_chunk_size == 0) ? size_t{1} : _chunk_size;
+    size_t curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed);
+
+    while(curr_b < N) {
+      func(curr_b, std::min(curr_b + chunk_size, N));
+      curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed);
+    }
+  }
+
+  /**
+  @private
+  */
+  template <typename F,
+    std::enable_if_t<std::is_invocable_r_v<bool, F, size_t, size_t>, void>* = nullptr
+  >
+  void loop_until(
+    size_t N,
+    size_t,
+    std::atomic<size_t>& next,
+    F&& func
+  ) const {
+
+    size_t chunk_size = (_chunk_size == 0) ? size_t{1} : _chunk_size;
+    size_t curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed);
+
+    while(curr_b < N) {
+      if(func(curr_b, std::min(curr_b + chunk_size, N))) {
+        return;
+      }
+      curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed);
+    }
+  }
+};
+
+// ----------------------------------------------------------------------------
+// Static Partitioner
+// ----------------------------------------------------------------------------
+
+/**
+@class StaticPartitioner
+
+@brief class to construct a static partitioner for scheduling parallel algorithms
+
+The partitioner divides iterations into chunks and distributes chunks
+to workers in order.
+If the chunk size is not specified (default @c 0), the partitioner resorts to a chunk size
+that equally distributes iterations into workers.
+
+@code{.cpp}
+std::vector<int> data = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+taskflow.for_each(
+  data.begin(), data.end(), [](int i){}, StaticPartitioner(0)
+);
+executor.run(taskflow).wait();
+@endcode
+*/
+class StaticPartitioner : public PartitionerBase {
+
+  public:
+
+  /**
+  @brief default constructor
+  */
+  StaticPartitioner() : PartitionerBase{0} {}
+
+  /**
+  @brief construct a static partitioner with the given chunk size
+  */
+  explicit StaticPartitioner(size_t sz) : PartitionerBase(sz) {}
+
+  /**
+  @brief queries the adjusted chunk size
+
+  Returns the given chunk size if it is not zero, or returns
+  <tt>N/W + (w < N%W)</tt>, where @c N is the number of iterations,
+  @c W is the number of workers, and @c w is the worker ID.
+  */
+  size_t adjusted_chunk_size(size_t N, size_t W, size_t w) const {
+    return _chunk_size ? _chunk_size : N/W + (w < N%W);
+  }
+
+  // --------------------------------------------------------------------------
+  // scheduling methods
+  // --------------------------------------------------------------------------
+
+  /**
+  @private
+  */
+  template <typename F,
+    std::enable_if_t<std::is_invocable_r_v<void, F, size_t, size_t>, void>* = nullptr
+  >
+  void loop(
+    size_t N,
+    size_t W,
+    size_t curr_b,
+    size_t chunk_size,
+    F&& func
+  ) {
+    size_t stride = W * chunk_size;
+    while(curr_b < N) {
+      size_t curr_e = std::min(curr_b + chunk_size, N);
+      func(curr_b, curr_e);
+      curr_b += stride;
+    }
+  }
+
+  /**
+  @private
+  */
+  template <typename F,
+    std::enable_if_t<std::is_invocable_r_v<bool, F, size_t, size_t>, void>* = nullptr
+  >
+  void loop_until(
+    size_t N,
+    size_t W,
+    size_t curr_b,
+    size_t chunk_size,
+    F&& func
+  ) {
+    size_t stride = W * chunk_size;
+    while(curr_b < N) {
+      size_t curr_e = std::min(curr_b + chunk_size, N);
+      if(func(curr_b, curr_e)) {
+        return;
+      }
+      curr_b += stride;
+    }
+  }
+};
+
+// ----------------------------------------------------------------------------
+// RandomPartitioner
+// ----------------------------------------------------------------------------
+
+/**
+@class RandomPartitioner
+
+@brief class to construct a random partitioner for scheduling parallel algorithms
+
+Similar to tf::DynamicPartitioner,
+the partitioner splits iterations into many partitions but each with a random
+chunk size in the range <tt>c = [alpha * N * W, beta * N * W]</tt>.
+By default, @c alpha is <tt>0.01</tt> and @c beta is <tt>0.5</tt>.
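+
+A minimal usage sketch follows (assuming a tf::Taskflow object named
+@c taskflow; the vector @c data and the alpha/beta values of 0.1 and 0.2
+are arbitrary and for illustration only):
+
+@code{.cpp}
+std::vector<int> data(1000);
+
+// random chunk sizes drawn from the default range [0.01*N*W, 0.5*N*W]
+taskflow.for_each(data.begin(), data.end(), [](int d){}, tf::RandomPartitioner());
+
+// random chunk sizes drawn from [0.1*N*W, 0.2*N*W]
+taskflow.for_each(
+  data.begin(), data.end(), [](int d){}, tf::RandomPartitioner(0.1f, 0.2f)
+);
+@endcode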
+ +*/ +class RandomPartitioner : public PartitionerBase { + + public: + + /** + @brief default constructor + */ + RandomPartitioner() = default; + + /** + @brief constructs a random partitioner + */ + RandomPartitioner(size_t cz) : PartitionerBase(cz) {} + + /** + @brief constructs a random partitioner with the given parameters + */ + RandomPartitioner(float alpha, float beta) : _alpha {alpha}, _beta {beta} {} + + /** + @brief queries the @c alpha value + */ + float alpha() const { return _alpha; } + + /** + @brief queries the @c beta value + */ + float beta() const { return _beta; } + + /** + @brief queries the range of chunk size + + @param N number of iterations + @param W number of workers + */ + std::pair<size_t, size_t> chunk_size_range(size_t N, size_t W) const { + + size_t b1 = static_cast<size_t>(_alpha * N * W); + size_t b2 = static_cast<size_t>(_beta * N * W); + + if(b1 > b2) { + std::swap(b1, b2); + } + + b1 = std::max(b1, size_t{1}); + b2 = std::max(b2, b1 + 1); + + return {b1, b2}; + } + + // -------------------------------------------------------------------------- + // scheduling methods + // -------------------------------------------------------------------------- + + /** + @private + */ + template <typename F, + std::enable_if_t<std::is_invocable_r_v<void, F, size_t, size_t>, void>* = nullptr + > + void loop( + size_t N, + size_t W, + std::atomic<size_t>& next, + F&& func + ) const { + + auto [b1, b2] = chunk_size_range(N, W); + + std::default_random_engine engine {std::random_device{}()}; + std::uniform_int_distribution<size_t> dist(b1, b2); + + size_t chunk_size = dist(engine); + size_t curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); + + while(curr_b < N) { + func(curr_b, std::min(curr_b + chunk_size, N)); + chunk_size = dist(engine); + curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); + } + } + + /** + @private + */ + template <typename F, + std::enable_if_t<std::is_invocable_r_v<bool, F, size_t, size_t>, void>* = nullptr + > + void loop_until( + size_t N, + size_t W, + std::atomic<size_t>& next, + F&& func + ) const { + + auto [b1, b2] = chunk_size_range(N, W); + + std::default_random_engine engine {std::random_device{}()}; + std::uniform_int_distribution<size_t> dist(b1, b2); + + size_t chunk_size = dist(engine); + size_t curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); + + while(curr_b < N) { + if(func(curr_b, std::min(curr_b + chunk_size, N))){ + return; + } + chunk_size = dist(engine); + curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); + } + } + + private: + + float _alpha {0.01f}; + float _beta {0.5f}; + +}; + +/** +@brief default partitioner set to tf::GuidedPartitioner + +Guided partitioner can achieve decent performance for most parallel algorithms, +especially for those with irregular and unbalanced workload per iteration. +*/ +using DefaultPartitioner = GuidedPartitioner; + +/** +@brief determines if a type is a partitioner + +A partitioner is a derived type from tf::PartitionerBase. 
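+
+For example, the following compile-time checks hold under this definition:
+
+@code{.cpp}
+static_assert(tf::is_partitioner_v<tf::GuidedPartitioner>, "");
+static_assert(tf::is_partitioner_v<tf::StaticPartitioner>, "");
+static_assert(!tf::is_partitioner_v<int>, "");
+@endcode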
+*/ +template <typename C> +inline constexpr bool is_partitioner_v = std::is_base_of<PartitionerBase, C>::value; + +} // end of namespace tf ----------------------------------------------------- + + + diff --git a/myxpcs/include/taskflow_/algorithm/pipeline.hpp b/myxpcs/include/taskflow_/algorithm/pipeline.hpp new file mode 100644 index 0000000..5442d56 --- /dev/null +++ b/myxpcs/include/taskflow_/algorithm/pipeline.hpp @@ -0,0 +1,1663 @@ +#pragma once + +#include "../taskflow.hpp" + +/** +@file pipeline.hpp +@brief pipeline include file +*/ + +namespace tf { + + +// ---------------------------------------------------------------------------- +// Structure Definition: DeferredPipeflow +// ---------------------------------------------------------------------------- +// For example: +// 12.defer(7); 12.defer(16); +// _____ +// | | +// v | +// 7 12 16 +// | ^ +// |____ | +// +// DeferredPipeflow dpf of 12 : +// dpf._token = 12; +// dpf._num_deferrals = 1; +// dpf._dependents = std::list<size_t>{7,16}; +// dpf._dependent_satellites has following two entries +// {key: 7, value: dpf._dependents.begin()} +// {key: 16, value: dpf._dependents.begin()+1} +// +/** @private */ +class DeferredPipeflow { + + template <typename... Ps> + friend class Pipeline; + + template <typename P> + friend class ScalablePipeline; + + public: + + DeferredPipeflow() = default; + DeferredPipeflow(const DeferredPipeflow&) = delete; + DeferredPipeflow(DeferredPipeflow&&) = delete; + + DeferredPipeflow(size_t t, size_t n, std::unordered_set<size_t>&& dep) : + _token{t}, _num_deferrals{n}, _dependents{std::move(dep)} { + } + + DeferredPipeflow& operator = (const DeferredPipeflow&) = delete; + DeferredPipeflow& operator = (DeferredPipeflow&&) = delete; + + private: + + // token id + size_t _token; + + // number of deferrals + size_t _num_deferrals; + + // dependents + // For example, + // 12.defer(7); 12.defer(16) + // _dependents = {7, 16} + std::unordered_set<size_t> _dependents; +}; + + + +// ---------------------------------------------------------------------------- +// Class Definition: Pipeflow +// ---------------------------------------------------------------------------- + +/** +@class Pipeflow + +@brief class to create a pipeflow object used by the pipe callable + +Pipeflow represents a <i>scheduling token</i> in the pipeline scheduling +framework. A pipeflow is created by the pipeline scheduler at runtime to +pass to the pipe callable. Users can query the present statistics +of that scheduling token, including the line identifier, pipe identifier, +and token identifier, and build their application algorithms based on +these statistics. +At the first stage, users can explicitly call the stop method +to stop the pipeline scheduler. + +@code{.cpp} +tf::Pipe{tf::PipeType::SERIAL, [](tf::Pipeflow& pf){ + std::cout << "token id=" << pf.token() + << " at line=" << pf.line() + << " at pipe=" << pf.pipe() + << '\n'; +}}; +@endcode + +Pipeflow can only be created privately by the tf::Pipeline and +be used through the pipe callable. +*/ +class Pipeflow { + + template <typename... Ps> + friend class Pipeline; + + template <typename P> + friend class ScalablePipeline; + + template <typename... 
Ps> + friend class DataPipeline; + + public: + + /** + @brief default constructor + */ + Pipeflow() = default; + + /** + @brief queries the line identifier of the present token + */ + size_t line() const { + return _line; + } + + /** + @brief queries the pipe identifier of the present token + */ + size_t pipe() const { + return _pipe; + } + + /** + @brief queries the token identifier + */ + size_t token() const { + return _token; + } + + /** + @brief stops the pipeline scheduling + + Only the first pipe can call this method to stop the pipeline. + Calling stop from other pipes will throw exception. + */ + void stop() { + if(_pipe != 0) { + TF_THROW("only the first pipe can stop the token"); + } + _stop = true; + } + + /** + @brief queries the number of deferrals + */ + size_t num_deferrals() const { + return _num_deferrals; + } + + /** + @brief pushes token in _dependents + + Only the first pipe can call this method to defer the current + scheduling token to the given token. + */ + void defer(size_t token) { + if(_pipe != 0) { + TF_THROW("only the first pipe can defer the current scheduling token"); + } + _dependents.insert(token); + } + + private: + + // Regular data + size_t _line; + size_t _pipe; + size_t _token; + bool _stop; + + // Data field for token dependencies + size_t _num_deferrals; + std::unordered_set<size_t> _dependents; + +}; + +// ---------------------------------------------------------------------------- +// Class Definition: PipeType +// ---------------------------------------------------------------------------- + +/** +@enum PipeType + +@brief enumeration of all pipe types +*/ +enum class PipeType : int { + /** @brief parallel type */ + PARALLEL = 1, + /** @brief serial type */ + SERIAL = 2 +}; + +// ---------------------------------------------------------------------------- +// Class Definition: Pipe +// ---------------------------------------------------------------------------- + +/** +@class Pipe + +@brief class to create a pipe object for a pipeline stage + +@tparam C callable type + +A pipe represents a stage of a pipeline. A pipe can be either +@em parallel direction or @em serial direction (specified by tf::PipeType) +and is coupled with a callable to invoke by the pipeline scheduler. +The callable must take a referenced tf::Pipeflow object in the first argument: + +@code{.cpp} +Pipe{PipeType::SERIAL, [](tf::Pipeflow&){}} +@endcode + +The pipeflow object is used to query the statistics of a scheduling token +in the pipeline, such as pipe, line, and token numbers. +*/ +template <typename C = std::function<void(tf::Pipeflow&)>> +class Pipe { + + template <typename... Ps> + friend class Pipeline; + + template <typename P> + friend class ScalablePipeline; + + public: + + /** + @brief alias of the callable type + */ + using callable_t = C; + + /** + @brief default constructor + */ + Pipe() = default; + + /** + @brief constructs the pipe object + + @param d pipe type (tf::PipeType) + @param callable callable type + + The constructor constructs a pipe with the given direction + (tf::PipeType::SERIAL or tf::PipeType::PARALLEL) and the given callable. + The callable must take a referenced tf::Pipeflow object in the first argument. + + @code{.cpp} + Pipe{PipeType::SERIAL, [](tf::Pipeflow&){}} + @endcode + + When creating a pipeline, the direction of the first pipe must be serial + (tf::PipeType::SERIAL). 
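+
+  As shown by the pipe dispatch routine later in this file (Pipeline::_on_pipe),
+  a pipe callable may alternatively take a tf::Runtime reference as its second
+  argument. A minimal sketch of such a pipe (purely illustrative; the callable
+  type is deduced from the lambda):
+
+  @code{.cpp}
+  Pipe{PipeType::PARALLEL, [](tf::Pipeflow& pf, tf::Runtime& rt){}}
+  @endcode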
+ */ + Pipe(PipeType d, C&& callable) : + _type{d}, _callable{std::forward<C>(callable)} { + } + + /** + @brief queries the type of the pipe + + Returns the type of the callable. + */ + PipeType type() const { + return _type; + } + + /** + @brief assigns a new type to the pipe + + @param type a tf::PipeType variable + */ + void type(PipeType type) { + _type = type; + } + + /** + @brief assigns a new callable to the pipe + + @tparam U callable type + @param callable a callable object constructible from std::function<void(tf::Pipeflow&)> + + Assigns a new callable to the pipe with universal forwarding. + */ + template <typename U> + void callable(U&& callable) { + _callable = std::forward<U>(callable); + } + + private: + + PipeType _type; + + C _callable; +}; + +// ---------------------------------------------------------------------------- +// Class Definition: Pipeline +// ---------------------------------------------------------------------------- + +/** +@class Pipeline + +@brief class to create a pipeline scheduling framework + +@tparam Ps pipe types + +A pipeline is a composable graph object for users to create a +<i>pipeline scheduling framework</i> using a module task in a taskflow. +Unlike the conventional pipeline programming frameworks (e.g., Intel TBB), +%Taskflow's pipeline algorithm does not provide any data abstraction, +which often restricts users from optimizing data layouts in their applications, +but a flexible framework for users to customize their application data +atop our pipeline scheduling. +The following code creates a pipeline of four parallel lines to schedule +tokens through three serial pipes: + +@code{.cpp} +tf::Taskflow taskflow; +tf::Executor executor; + +const size_t num_lines = 4; +const size_t num_pipes = 3; + +// create a custom data buffer +std::array<std::array<int, num_pipes>, num_lines> buffer; + +// create a pipeline graph of four concurrent lines and three serial pipes +tf::Pipeline pipeline(num_lines, + // first pipe must define a serial direction + tf::Pipe{tf::PipeType::SERIAL, [&buffer](tf::Pipeflow& pf) { + // generate only 5 scheduling tokens + if(pf.token() == 5) { + pf.stop(); + } + // save the token id into the buffer + else { + buffer[pf.line()][pf.pipe()] = pf.token(); + } + }}, + tf::Pipe{tf::PipeType::SERIAL, [&buffer] (tf::Pipeflow& pf) { + // propagate the previous result to this pipe by adding one + buffer[pf.line()][pf.pipe()] = buffer[pf.line()][pf.pipe()-1] + 1; + }}, + tf::Pipe{tf::PipeType::SERIAL, [&buffer](tf::Pipeflow& pf){ + // propagate the previous result to this pipe by adding one + buffer[pf.line()][pf.pipe()] = buffer[pf.line()][pf.pipe()-1] + 1; + }} +); + +// build the pipeline graph using composition +tf::Task init = taskflow.emplace([](){ std::cout << "ready\n"; }) + .name("starting pipeline"); +tf::Task task = taskflow.composed_of(pipeline) + .name("pipeline"); +tf::Task stop = taskflow.emplace([](){ std::cout << "stopped\n"; }) + .name("pipeline stopped"); + +// create task dependency +init.precede(task); +task.precede(stop); + +// run the pipeline +executor.run(taskflow).wait(); +@endcode + +The above example creates a pipeline graph that schedules five tokens over +four parallel lines in a circular fashion, as depicted below: + +@code{.shell-session} +o -> o -> o +| | | +v v v +o -> o -> o +| | | +v v v +o -> o -> o +| | | +v v v +o -> o -> o +@endcode + +At each pipe stage, the program propagates the result to the next pipe +by adding one to the result stored in a custom data storage, @c buffer. 
+The pipeline scheduler will generate five scheduling tokens and then stop. + +Internally, tf::Pipeline uses std::tuple to store the given sequence of pipes. +The definition of each pipe can be different, completely decided by the compiler +to optimize the object layout. +After a pipeline is constructed, it is not possible to change its pipes. +If applications need to change these pipes, please use tf::ScalablePipeline. +*/ +template <typename... Ps> +class Pipeline { + + static_assert(sizeof...(Ps)>0, "must have at least one pipe"); + + /** + @private + */ + struct Line { + std::atomic<size_t> join_counter; + }; + + /** + @private + */ + struct PipeMeta { + PipeType type; + }; + + public: + + /** + @brief constructs a pipeline object + + @param num_lines the number of parallel lines + @param ps a list of pipes + + Constructs a pipeline of up to @c num_lines parallel lines to schedule + tokens through the given linear chain of pipes. + The first pipe must define a serial direction (tf::PipeType::SERIAL) + or an exception will be thrown. + */ + Pipeline(size_t num_lines, Ps&&... ps); + + /** + @brief constructs a pipeline object + + @param num_lines the number of parallel lines + @param ps a tuple of pipes + + Constructs a pipeline of up to @c num_lines parallel lines to schedule + tokens through the given linear chain of pipes. + The first pipe must define a serial direction (tf::PipeType::SERIAL) + or an exception will be thrown. + */ + Pipeline(size_t num_lines, std::tuple<Ps...>&& ps); + + /** + @brief queries the number of parallel lines + + The function returns the number of parallel lines given by the user + upon the construction of the pipeline. + The number of lines represents the maximum parallelism this pipeline + can achieve. + */ + size_t num_lines() const noexcept; + + /** + @brief queries the number of pipes + + The Function returns the number of pipes given by the user + upon the construction of the pipeline. + */ + constexpr size_t num_pipes() const noexcept; + + /** + @brief resets the pipeline + + Resetting the pipeline to the initial state. After resetting a pipeline, + its token identifier will start from zero as if the pipeline was just + constructed. + */ + void reset(); + + /** + @brief queries the number of generated tokens in the pipeline + + The number represents the total scheduling tokens that has been + generated by the pipeline so far. + */ + size_t num_tokens() const noexcept; + + /** + @brief obtains the graph object associated with the pipeline construct + + This method is primarily used as an opaque data structure for creating + a module task of the this pipeline. + */ + Graph& graph(); + + + private: + + Graph _graph; + + size_t _num_tokens; + + std::tuple<Ps...> _pipes; + std::array<PipeMeta, sizeof...(Ps)> _meta; + std::vector<std::array<Line, sizeof...(Ps)>> _lines; + std::vector<Task> _tasks; + std::vector<Pipeflow> _pipeflows; + + // queue of ready tokens (paired with their deferral times) + // For example, + // when 12 does not have any dependents, + // we put 12 in _ready_tokens queue + // Assume num_deferrals of 12 is 1, + // we push pair{12, 1} in the queue + std::queue<std::pair<size_t, size_t>> _ready_tokens; + + // unordered_map of token dependencies + // For example, + // 12.defer(16); 13.defer(16); + // _token_dependencies has the following entry + // {key: 16, value: std::vector{12, 13}}. 
+ std::unordered_map<size_t, std::vector<size_t>> _token_dependencies; + + // unordered_map of deferred tokens + // For example, + // 12.defer(16); 13.defer(16); + // _deferred_tokens has the following two entries + // {key: 12, DeferredPipeflow of 12} and + // {key: 13, DeferredPipeflow of 13} + std::unordered_map<size_t, DeferredPipeflow> _deferred_tokens; + + // variable to keep track of the longest deferred tokens + // For example, + // 2.defer(16) + // 5.defer(19) + // 5.defer(17), + // _longest_deferral will be 19 - after token 19 the pipeline + // has almost zero cost on handling deferred pipeflow + size_t _longest_deferral = 0; + + template <size_t... I> + auto _gen_meta(std::tuple<Ps...>&&, std::index_sequence<I...>); + + void _on_pipe(Pipeflow&, Runtime&); + void _build(); + void _check_dependents(Pipeflow&); + void _construct_deferred_tokens(Pipeflow&); + void _resolve_token_dependencies(Pipeflow&); +}; + +// constructor +template <typename... Ps> +Pipeline<Ps...>::Pipeline(size_t num_lines, Ps&&... ps) : + _pipes {std::make_tuple(std::forward<Ps>(ps)...)}, + _meta {PipeMeta{ps.type()}...}, + _lines (num_lines), + _tasks (num_lines + 1), + _pipeflows (num_lines) { + + if(num_lines == 0) { + TF_THROW("must have at least one line"); + } + + if(std::get<0>(_pipes).type() != PipeType::SERIAL) { + TF_THROW("first pipe must be serial"); + } + + reset(); + _build(); +} + +// constructor +template <typename... Ps> +Pipeline<Ps...>::Pipeline(size_t num_lines, std::tuple<Ps...>&& ps) : + _pipes {std::forward<std::tuple<Ps...>>(ps)}, + _meta {_gen_meta( + std::forward<std::tuple<Ps...>>(ps), std::make_index_sequence<sizeof...(Ps)>{} + )}, + _lines (num_lines), + _tasks (num_lines + 1), + _pipeflows (num_lines) { + + if(num_lines == 0) { + TF_THROW("must have at least one line"); + } + + if(std::get<0>(_pipes).type() != PipeType::SERIAL) { + TF_THROW("first pipe must be serial"); + } + + reset(); + _build(); +} + +// Function: _get_meta +template <typename... Ps> +template <size_t... I> +auto Pipeline<Ps...>::_gen_meta(std::tuple<Ps...>&& ps, std::index_sequence<I...>) { + return std::array{PipeMeta{std::get<I>(ps).type()}...}; +} + +// Function: num_lines +template <typename... Ps> +size_t Pipeline<Ps...>::num_lines() const noexcept { + return _pipeflows.size(); +} + +// Function: num_pipes +template <typename... Ps> +constexpr size_t Pipeline<Ps...>::num_pipes() const noexcept { + return sizeof...(Ps); +} + +// Function: num_tokens +template <typename... Ps> +size_t Pipeline<Ps...>::num_tokens() const noexcept { + return _num_tokens; +} + +// Function: graph +template <typename... Ps> +Graph& Pipeline<Ps...>::graph() { + return _graph; +} + +// Function: reset +template <typename... 
Ps> +void Pipeline<Ps...>::reset() { + + _num_tokens = 0; + + for(size_t l = 0; l<num_lines(); l++) { + _pipeflows[l]._pipe = 0; + _pipeflows[l]._line = l; + + _pipeflows[l]._num_deferrals = 0; + _pipeflows[l]._dependents.clear(); + } + + assert(_ready_tokens.empty() == true); + _token_dependencies.clear(); + _deferred_tokens.clear(); + + _lines[0][0].join_counter.store(0, std::memory_order_relaxed); + + for(size_t l=1; l<num_lines(); l++) { + for(size_t f=1; f<num_pipes(); f++) { + _lines[l][f].join_counter.store( + static_cast<size_t>(_meta[f].type), std::memory_order_relaxed + ); + } + } + + for(size_t f=1; f<num_pipes(); f++) { + _lines[0][f].join_counter.store(1, std::memory_order_relaxed); + } + + for(size_t l=1; l<num_lines(); l++) { + _lines[l][0].join_counter.store( + static_cast<size_t>(_meta[0].type) - 1, std::memory_order_relaxed + ); + } +} + +// Procedure: _on_pipe +template <typename... Ps> +void Pipeline<Ps...>::_on_pipe(Pipeflow& pf, Runtime& rt) { + visit_tuple([&](auto&& pipe){ + using callable_t = typename std::decay_t<decltype(pipe)>::callable_t; + if constexpr (std::is_invocable_v<callable_t, Pipeflow&>) { + pipe._callable(pf); + } + else if constexpr(std::is_invocable_v<callable_t, Pipeflow&, Runtime&>) { + pipe._callable(pf, rt); + } + else { + static_assert(dependent_false_v<callable_t>, "un-supported pipe callable type"); + } + }, _pipes, pf._pipe); +} + +// Procedure: _check_dependents +// Check and remove invalid dependents after on_pipe +// For example, users may defer a pipeflow to multiple tokens, +// and we need to remove invalid tokens. +// 12.defer(7); // valid only if 7 is deferred, or invalid otherwise +// 12.defer(16); // 16 is valid +template <typename... Ps> +void Pipeline<Ps...>::_check_dependents(Pipeflow& pf) { + //if (pf._dependents.size()) { + ++pf._num_deferrals; + + for (auto it = pf._dependents.begin(); it != pf._dependents.end();) { + + // valid (e.g., 12.defer(16)) + if (*it >= _num_tokens) { + _token_dependencies[*it].push_back(pf._token); + _longest_deferral = std::max(_longest_deferral, *it); + ++it; + } + // valid or invalid (e.g., 12.defer(7)) + else { + auto pit = _deferred_tokens.find(*it); + + // valid (e.g., 7 is deferred) + if (pit != _deferred_tokens.end()) { + _token_dependencies[*it].push_back(pf._token); + ++it; + } + + // invalid (e.g., 7 is finished - this this 12.defer(7) is dummy) + else { + it = pf._dependents.erase(it); + } + } + } +} + +// Procedure: _construct_deferred_tokens +// Construct a data structure for a deferred token +// +// For example, +// 12.defer(7); 12.defer(16); +// After _check_dependents, 12 needs to be deferred, +// so we will construct a data structure for 12 using hashmap: +// {key: 12, value: DeferredPipeflow of 12} +template <typename... 
Ps> +void Pipeline<Ps...>::_construct_deferred_tokens(Pipeflow& pf) { + + //auto res = _deferred_tokens.emplace( + // pf._token, DeferredPipeflow{pf._token, pf._num_deferrals, std::move(pf._dependents)} + //); + + // construct the deferred pipeflow with zero copy + //auto res = _deferred_tokens.emplace( + _deferred_tokens.emplace( + std::piecewise_construct, + std::forward_as_tuple(pf._token), + std::forward_as_tuple( + pf._token, pf._num_deferrals, std::move(pf._dependents) + ) + ); + + //assert(res.second == true); +} + +// Procedure: _resolve_token_dependencies +// Resolve dependencies for tokens that defer to current token +// +// For example, +// 12.defer(16); +// 13.defer(16); +// _token_dependencies will have the entry +// {key: 16, value: std::vector{12, 13}} +// +// When 16 finishes, we need to remove 16 from 12's and 13's +// individual_dependents +template <typename... Ps> +void Pipeline<Ps...>::_resolve_token_dependencies(Pipeflow& pf) { + + if (auto it = _token_dependencies.find(pf._token); + it != _token_dependencies.end()) { + + // iterate tokens that defer to pf._token + // (e.g., 12 and 13) + for(size_t target : it->second) { + + auto dpf = _deferred_tokens.find(target); + + assert(dpf != _deferred_tokens.end()); + + // erase pf._token from target's _dependents + // (e.g., remove 16 from 12's dependents) + dpf->second._dependents.erase(pf._token); + // dpf->second._dependent_satellites[pf._token] + //); + + // target has no dependents + if (dpf->second._dependents.empty()) { + + // push target into _ready_tokens queue + _ready_tokens.emplace(dpf->second._token, dpf->second._num_deferrals); + //_ready_tokens.push( + // std::make_pair(dpf->second._token, dpf->second._num_deferrals) + //); + + // erase target from _deferred_tokens + _deferred_tokens.erase(dpf); + } + } + + // remove pf._token from _token_dependencies + // (e.g., remove the entry + // {key: 16, value: std::vector{12, 13}} from _token_dependencies) + _token_dependencies.erase(it); + } +} + +// Procedure: _build +template <typename... 
Ps> +void Pipeline<Ps...>::_build() { + + using namespace std::literals::string_literals; + + FlowBuilder fb(_graph); + + // init task + _tasks[0] = fb.emplace([this]() { + return static_cast<int>(_num_tokens % num_lines()); + }).name("cond"); + + // line task + for(size_t l = 0; l < num_lines(); l++) { + + _tasks[l + 1] = fb.emplace([this, l] (tf::Runtime& rt) mutable { + + auto pf = &_pipeflows[l]; + + pipeline: + + _lines[pf->_line][pf->_pipe].join_counter.store( + static_cast<size_t>(_meta[pf->_pipe].type), std::memory_order_relaxed + ); + + // First pipe does all jobs of initialization and token dependencies + if (pf->_pipe == 0) { + // _ready_tokens queue is not empty + // substitute pf with the token at the front of the queue + if (!_ready_tokens.empty()) { + pf->_token = _ready_tokens.front().first; + pf->_num_deferrals = _ready_tokens.front().second; + _ready_tokens.pop(); + } + else { + pf->_token = _num_tokens; + pf->_num_deferrals = 0; + } + + handle_token_dependency: + + if (pf->_stop = false, _on_pipe(*pf, rt); pf->_stop == true) { + // here, the pipeline is not stopped yet because other + // lines of tasks may still be running their last stages + return; + } + + if (_num_tokens == pf->_token) { + ++_num_tokens; + } + + if (pf->_dependents.empty() == false){ + // check if the pf->_dependents have valid dependents + _check_dependents(*pf); + + // tokens in pf->_dependents are all valid dependents + if (pf->_dependents.size()) { + + // construct a data structure for pf in _deferred_tokens + _construct_deferred_tokens(*pf); + goto pipeline; + } + + // tokens in pf->_dependents are invalid dependents + // directly goto on_pipe on the same line + else { + goto handle_token_dependency; + } + } + + // Every token within the deferral range needs to check + // if it can resolve dependencies on other tokens. 
+ if (pf->_token <= _longest_deferral) { + _resolve_token_dependencies(*pf); + } + } + else { + _on_pipe(*pf, rt); + } + + size_t c_f = pf->_pipe; + size_t n_f = (pf->_pipe + 1) % num_pipes(); + size_t n_l = (pf->_line + 1) % num_lines(); + + pf->_pipe = n_f; + + // ---- scheduling starts here ---- + // Notice that the shared variable f must not be changed after this + // point because it can result in data race due to the following + // condition: + // + // a -> b + // | | + // v v + // c -> d + // + // d will be spawned by either c or b, so if c changes f but b spawns d + // then data race on f will happen + + std::array<int, 2> retval; + size_t n = 0; + + // downward dependency + if(_meta[c_f].type == PipeType::SERIAL && + _lines[n_l][c_f].join_counter.fetch_sub( + 1, std::memory_order_acq_rel) == 1 + ) { + retval[n++] = 1; + } + + // forward dependency + if(_lines[pf->_line][n_f].join_counter.fetch_sub( + 1, std::memory_order_acq_rel) == 1 + ) { + retval[n++] = 0; + } + + // notice that the task index starts from 1 + switch(n) { + case 2: { + rt.schedule(_tasks[n_l+1]); + goto pipeline; + } + case 1: { + // downward dependency + if (retval[0] == 1) { + pf = &_pipeflows[n_l]; + } + // forward dependency + goto pipeline; + } + } + }).name("rt-"s + std::to_string(l)); + + _tasks[0].precede(_tasks[l+1]); + } +} + +// ---------------------------------------------------------------------------- +// Class Definition: ScalablePipeline +// ---------------------------------------------------------------------------- + +/** +@class ScalablePipeline + +@brief class to create a scalable pipeline object + +@tparam P type of the iterator to a range of pipes + +A scalable pipeline is a composable graph object for users to create a +<i>pipeline scheduling framework</i> using a module task in a taskflow. +Unlike tf::Pipeline that instantiates all pipes upon the construction time, +tf::ScalablePipeline allows variable assignments of pipes using range iterators. +Users can also reset a scalable pipeline to a different range of pipes +between runs. The following code creates a scalable pipeline of four +parallel lines to schedule tokens through three serial pipes in a custom storage, +then resetting the pipeline to a new range of five serial pipes: + +@code{.cpp} +tf::Taskflow taskflow("pipeline"); +tf::Executor executor; + +const size_t num_lines = 4; + +// create data storage +std::array<int, num_lines> buffer; + +// define the pipe callable +auto pipe_callable = [&buffer] (tf::Pipeflow& pf) mutable { + switch(pf.pipe()) { + // first stage generates only 5 scheduling tokens and saves the + // token number into the buffer. 
+ case 0: { + if(pf.token() == 5) { + pf.stop(); + } + else { + printf("stage 1: input token = %zu\n", pf.token()); + buffer[pf.line()] = pf.token(); + } + return; + } + break; + + // other stages propagate the previous result to this pipe and + // increment it by one + default: { + printf( + "stage %zu: input buffer[%zu] = %d\n", pf.pipe(), pf.line(), buffer[pf.line()] + ); + buffer[pf.line()] = buffer[pf.line()] + 1; + } + break; + } +}; + +// create a vector of three pipes +std::vector< tf::Pipe<std::function<void(tf::Pipeflow&)>> > pipes; + +for(size_t i=0; i<3; i++) { + pipes.emplace_back(tf::PipeType::SERIAL, pipe_callable); +} + +// create a pipeline of four parallel lines based on the given vector of pipes +tf::ScalablePipeline pl(num_lines, pipes.begin(), pipes.end()); + +// build the pipeline graph using composition +tf::Task init = taskflow.emplace([](){ std::cout << "ready\n"; }) + .name("starting pipeline"); +tf::Task task = taskflow.composed_of(pl) + .name("pipeline"); +tf::Task stop = taskflow.emplace([](){ std::cout << "stopped\n"; }) + .name("pipeline stopped"); + +// create task dependency +init.precede(task); +task.precede(stop); + +// dump the pipeline graph structure (with composition) +taskflow.dump(std::cout); + +// run the pipeline +executor.run(taskflow).wait(); + +// reset the pipeline to a new range of five pipes and starts from +// the initial state (i.e., token counts from zero) +for(size_t i=0; i<2; i++) { + pipes.emplace_back(tf::PipeType::SERIAL, pipe_callable); +} +pl.reset(pipes.begin(), pipes.end()); + +executor.run(taskflow).wait(); +@endcode + +The above example creates a pipeline graph that schedules five tokens over +four parallel lines in a circular fashion, first going through three serial pipes +and then five serial pipes: + +@code{.shell-session} +# initial construction of three serial pipes +o -> o -> o +| | | +v v v +o -> o -> o +| | | +v v v +o -> o -> o +| | | +v v v +o -> o -> o + +# resetting to a new range of five serial pipes +o -> o -> o -> o -> o +| | | | | +v v v v v +o -> o -> o -> o -> o +| | | | | +v v v v v +o -> o -> o -> o -> o +| | | | | +v v v v v +o -> o -> o -> o -> o +@endcode + +Each pipe has the same type of `%tf::Pipe<%std::function<void(%tf::Pipeflow&)>>` +and is kept in a vector that is amenable to change. +We construct the scalable pipeline using two range iterators pointing to the +beginning and the end of the vector. +At each pipe stage, the program propagates the result to the next pipe +by adding one to the result stored in a custom data storage, @c buffer. +The pipeline scheduler will generate five scheduling tokens and then stop. + +A scalable pipeline is move-only. +*/ +template <typename P> +class ScalablePipeline { + + /** + @private + */ + struct Line { + std::atomic<size_t> join_counter; + }; + + public: + + /** + @brief pipe type + */ + using pipe_t = typename std::iterator_traits<P>::value_type; + + /** + @brief default constructor + */ + ScalablePipeline() = default; + + /** + @brief constructs an empty scalable pipeline object + + @param num_lines the number of parallel lines + + An empty scalable pipeline does not have any pipes. + The pipeline needs to be reset to a valid range of pipes + before running. 
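+
+  For example, one may construct an empty pipeline first and assign pipes
+  later through tf::ScalablePipeline::reset (a minimal sketch; the pipe
+  vector and the single stopping pipe below are illustrative only):
+
+  @code{.cpp}
+  using pipe_t = tf::Pipe<std::function<void(tf::Pipeflow&)>>;
+  std::vector<pipe_t> pipes;
+  pipes.emplace_back(tf::PipeType::SERIAL, [](tf::Pipeflow& pf){ pf.stop(); });
+
+  // four parallel lines but no pipes assigned yet
+  tf::ScalablePipeline<std::vector<pipe_t>::iterator> spl(4);
+
+  // assign a valid range of pipes before running the pipeline
+  spl.reset(pipes.begin(), pipes.end());
+  @endcode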
+ */ + ScalablePipeline(size_t num_lines); + + /** + @brief constructs a scalable pipeline object + + @param num_lines the number of parallel lines + @param first iterator to the beginning of the range + @param last iterator to the end of the range + + Constructs a pipeline from the given range of pipes specified in + <tt>[first, last)</tt> using @c num_lines parallel lines. + The first pipe must define a serial direction (tf::PipeType::SERIAL) + or an exception will be thrown. + + Internally, the scalable pipeline copies the iterators + from the specified range. Those pipe callables pointed to by + these iterators must remain valid during the execution of the pipeline. + */ + ScalablePipeline(size_t num_lines, P first, P last); + + /** + @brief disabled copy constructor + */ + ScalablePipeline(const ScalablePipeline&) = delete; + + /** + @brief move constructor + + Constructs a pipeline from the given @c rhs using move semantics + (i.e. the data in @c rhs is moved into this pipeline). + After the move, @c rhs is in a state as if it is just constructed. + The behavior is undefined if @c rhs is running during the move. + */ + ScalablePipeline(ScalablePipeline&& rhs); + + /** + @brief disabled copy assignment operator + */ + ScalablePipeline& operator = (const ScalablePipeline&) = delete; + + /** + @brief move constructor + + Replaces the contents with those of @c rhs using move semantics + (i.e. the data in @c rhs is moved into this pipeline). + After the move, @c rhs is in a state as if it is just constructed. + The behavior is undefined if @c rhs is running during the move. + */ + ScalablePipeline& operator = (ScalablePipeline&& rhs); + + /** + @brief queries the number of parallel lines + + The function returns the number of parallel lines given by the user + upon the construction of the pipeline. + The number of lines represents the maximum parallelism this pipeline + can achieve. + */ + size_t num_lines() const noexcept; + + /** + @brief queries the number of pipes + + The Function returns the number of pipes given by the user + upon the construction of the pipeline. + */ + size_t num_pipes() const noexcept; + + /** + @brief resets the pipeline + + Resets the pipeline to the initial state. After resetting a pipeline, + its token identifier will start from zero. + */ + void reset(); + + /** + @brief resets the pipeline with a new range of pipes + + @param first iterator to the beginning of the range + @param last iterator to the end of the range + + The member function assigns the pipeline to a new range of pipes + specified in <tt>[first, last)</tt> and resets the pipeline to the + initial state. After resetting a pipeline, its token identifier will + start from zero. + + Internally, the scalable pipeline copies the iterators + from the specified range. Those pipe callables pointed to by + these iterators must remain valid during the execution of the pipeline. + */ + void reset(P first, P last); + + /** + @brief resets the pipeline to a new line number and a + new range of pipes + + @param num_lines number of parallel lines + @param first iterator to the beginning of the range + @param last iterator to the end of the range + + The member function resets the pipeline to a new number of + parallel lines and a new range of pipes specified in + <tt>[first, last)</tt>, as if the pipeline is just constructed. + After resetting a pipeline, its token identifier will start from zero. + + Internally, the scalable pipeline copies the iterators + from the specified range. 
Those pipe callables pointed to by + these iterators must remain valid during the execution of the pipeline. + */ + void reset(size_t num_lines, P first, P last); + + /** + @brief queries the number of generated tokens in the pipeline + + The number represents the total scheduling tokens that has been + generated by the pipeline so far. + */ + size_t num_tokens() const noexcept; + + /** + @brief obtains the graph object associated with the pipeline construct + + This method is primarily used as an opaque data structure for creating + a module task of the this pipeline. + */ + Graph& graph(); + + private: + + Graph _graph; + + size_t _num_tokens{0}; + + std::vector<P> _pipes; + std::vector<Task> _tasks; + std::vector<Pipeflow> _pipeflows; + std::unique_ptr<Line[]> _lines; + + // chchiu + std::queue<std::pair<size_t, size_t>> _ready_tokens; + std::unordered_map<size_t, std::vector<size_t>> _token_dependencies; + std::unordered_map<size_t, DeferredPipeflow> _deferred_tokens; + size_t _longest_deferral = 0; + + void _check_dependents(Pipeflow&); + void _construct_deferred_tokens(Pipeflow&); + void _resolve_token_dependencies(Pipeflow&); + // chchiu + + void _on_pipe(Pipeflow&, Runtime&); + void _build(); + + Line& _line(size_t, size_t); +}; + +// constructor +template <typename P> +ScalablePipeline<P>::ScalablePipeline(size_t num_lines) : + _tasks (num_lines + 1), + _pipeflows (num_lines) { + + if(num_lines == 0) { + TF_THROW("must have at least one line"); + } + + _build(); +} + +// constructor +template <typename P> +ScalablePipeline<P>::ScalablePipeline(size_t num_lines, P first, P last) : + _tasks (num_lines + 1), + _pipeflows (num_lines) { + + if(num_lines == 0) { + TF_THROW("must have at least one line"); + } + + reset(first, last); + _build(); +} + +// move constructor +template <typename P> +ScalablePipeline<P>::ScalablePipeline(ScalablePipeline&& rhs) : + _graph {std::move(rhs._graph)}, + _num_tokens {rhs._num_tokens}, + _pipes {std::move(rhs._pipes)}, + _tasks {std::move(rhs._tasks)}, + _pipeflows {std::move(rhs._pipeflows)}, + _lines {std::move(rhs._lines)}, + _ready_tokens {std::move(rhs._ready_tokens)}, + _token_dependencies {std::move(rhs._token_dependencies)}, + _deferred_tokens {std::move(rhs._deferred_tokens)}, + _longest_deferral {rhs._longest_deferral}{ + + rhs._longest_deferral = 0; + rhs._num_tokens = 0; +} + +// move assignment operator +template <typename P> +ScalablePipeline<P>& ScalablePipeline<P>::operator = (ScalablePipeline&& rhs) { + _graph = std::move(rhs._graph); + _num_tokens = rhs._num_tokens; + _pipes = std::move(rhs._pipes); + _tasks = std::move(rhs._tasks); + _pipeflows = std::move(rhs._pipeflows); + _lines = std::move(rhs._lines); + rhs._num_tokens = 0; + _ready_tokens = std::move(rhs._ready_tokens); + _token_dependencies = std::move(rhs._token_dependencies); + _deferred_tokens = std::move(rhs._deferred_tokens); + _longest_deferral = rhs._longest_deferral; + rhs._longest_deferral = 0; + return *this; +} + +// Function: num_lines +template <typename P> +size_t ScalablePipeline<P>::num_lines() const noexcept { + return _pipeflows.size(); +} + +// Function: num_pipes +template <typename P> +size_t ScalablePipeline<P>::num_pipes() const noexcept { + return _pipes.size(); +} + +// Function: num_tokens +template <typename P> +size_t ScalablePipeline<P>::num_tokens() const noexcept { + return _num_tokens; +} + +// Function: graph +template <typename P> +Graph& ScalablePipeline<P>::graph() { + return _graph; +} + +// Function: _line +template <typename P> +typename 
ScalablePipeline<P>::Line& ScalablePipeline<P>::_line(size_t l, size_t p) { + return _lines[l*num_pipes() + p]; +} + +template <typename P> +void ScalablePipeline<P>::reset(size_t num_lines, P first, P last) { + + if(num_lines == 0) { + TF_THROW("must have at least one line"); + } + + _graph.clear(); + _tasks.resize(num_lines + 1); + _pipeflows.resize(num_lines); + + reset(first, last); + + _build(); +} + +// Function: reset +template <typename P> +void ScalablePipeline<P>::reset(P first, P last) { + + size_t num_pipes = static_cast<size_t>(std::distance(first, last)); + + if(num_pipes == 0) { + TF_THROW("pipeline cannot be empty"); + } + + if(first->type() != PipeType::SERIAL) { + TF_THROW("first pipe must be serial"); + } + + _pipes.resize(num_pipes); + + size_t i=0; + for(auto itr = first; itr != last; itr++) { + _pipes[i++] = itr; + } + + _lines = std::make_unique<Line[]>(num_lines() * _pipes.size()); + + reset(); +} + +// Function: reset +template <typename P> +void ScalablePipeline<P>::reset() { + + _num_tokens = 0; + + for(size_t l = 0; l<num_lines(); l++) { + _pipeflows[l]._pipe = 0; + _pipeflows[l]._line = l; + _pipeflows[l]._num_deferrals = 0; + _pipeflows[l]._dependents.clear(); + } + + _line(0, 0).join_counter.store(0, std::memory_order_relaxed); + + for(size_t l=1; l<num_lines(); l++) { + for(size_t f=1; f<num_pipes(); f++) { + _line(l, f).join_counter.store( + static_cast<size_t>(_pipes[f]->type()), std::memory_order_relaxed + ); + } + } + + for(size_t f=1; f<num_pipes(); f++) { + _line(0, f).join_counter.store(1, std::memory_order_relaxed); + } + + for(size_t l=1; l<num_lines(); l++) { + _line(l, 0).join_counter.store( + static_cast<size_t>(_pipes[0]->type()) - 1, std::memory_order_relaxed + ); + } + + assert(_ready_tokens.empty() == true); + _token_dependencies.clear(); + _deferred_tokens.clear(); +} + +// Procedure: _on_pipe +template <typename P> +void ScalablePipeline<P>::_on_pipe(Pipeflow& pf, Runtime& rt) { + + using callable_t = typename pipe_t::callable_t; + + if constexpr (std::is_invocable_v<callable_t, Pipeflow&>) { + _pipes[pf._pipe]->_callable(pf); + } + else if constexpr(std::is_invocable_v<callable_t, Pipeflow&, Runtime&>) { + _pipes[pf._pipe]->_callable(pf, rt); + } + else { + static_assert(dependent_false_v<callable_t>, "un-supported pipe callable type"); + } +} + +template <typename P> +void ScalablePipeline<P>::_check_dependents(Pipeflow& pf) { + ++pf._num_deferrals; + + for (auto it = pf._dependents.begin(); it != pf._dependents.end();) { + + // valid (e.g., 12.defer(16)) + if (*it >= _num_tokens) { + _token_dependencies[*it].push_back(pf._token); + _longest_deferral = std::max(_longest_deferral, *it); + ++it; + } + // valid or invalid (e.g., 12.defer(7)) + else { + auto pit = _deferred_tokens.find(*it); + + // valid (e.g., 7 is deferred) + if (pit != _deferred_tokens.end()) { + _token_dependencies[*it].push_back(pf._token); + ++it; + } + + else { + it = pf._dependents.erase(it); + } + } + } +} + +// Procedure: _construct_deferred_tokens +// Construct a data structure for a deferred token +template <typename P> +void ScalablePipeline<P>::_construct_deferred_tokens(Pipeflow& pf) { + + // construct the deferred pipeflow with zero copy + _deferred_tokens.emplace( + std::piecewise_construct, + std::forward_as_tuple(pf._token), + std::forward_as_tuple( + pf._token, pf._num_deferrals, std::move(pf._dependents) + ) + ); +} + +// Procedure: _resolve_token_dependencies +// Resolve dependencies for tokens that defer to current token +template <typename P> +void 
ScalablePipeline<P>::_resolve_token_dependencies(Pipeflow& pf) { + + if (auto it = _token_dependencies.find(pf._token); + it != _token_dependencies.end()) { + + // iterate tokens that defer to pf._token + for(size_t target : it->second) { + + auto dpf = _deferred_tokens.find(target); + + assert(dpf != _deferred_tokens.end()); + + // erase pf._token from target's _dependents + dpf->second._dependents.erase(pf._token); + + // target has no dependents + if (dpf->second._dependents.empty()) { + _ready_tokens.emplace(dpf->second._token, dpf->second._num_deferrals); + _deferred_tokens.erase(dpf); + } + } + + _token_dependencies.erase(it); + } +} + +// Procedure: _build +template <typename P> +void ScalablePipeline<P>::_build() { + + using namespace std::literals::string_literals; + + FlowBuilder fb(_graph); + + // init task + _tasks[0] = fb.emplace([this]() { + return static_cast<int>(_num_tokens % num_lines()); + }).name("cond"); + + // line task + for(size_t l = 0; l < num_lines(); l++) { + + _tasks[l + 1] = fb.emplace([this, l] (tf::Runtime& rt) mutable { + + auto pf = &_pipeflows[l]; + + pipeline: + + _line(pf->_line, pf->_pipe).join_counter.store( + static_cast<size_t>(_pipes[pf->_pipe]->type()), std::memory_order_relaxed + ); + + // First pipe does all jobs of initialization and token dependencies + if (pf->_pipe == 0) { + // _ready_tokens queue is not empty + // substitute pf with the token at the front of the queue + if (!_ready_tokens.empty()) { + pf->_token = _ready_tokens.front().first; + pf->_num_deferrals = _ready_tokens.front().second; + _ready_tokens.pop(); + } + else { + pf->_token = _num_tokens; + pf->_num_deferrals = 0; + } + + handle_token_dependency: + + if (pf->_stop = false, _on_pipe(*pf, rt); pf->_stop == true) { + // here, the pipeline is not stopped yet because other + // lines of tasks may still be running their last stages + return; + } + + if (_num_tokens == pf->_token) { + ++_num_tokens; + } + + if (pf->_dependents.empty() == false){ + // check if the pf->_dependents have valid dependents + _check_dependents(*pf); + + // tokens in pf->_dependents are all valid dependents + if (pf->_dependents.size()) { + + // construct a data structure for pf in _deferred_tokens + _construct_deferred_tokens(*pf); + goto pipeline; + } + + // tokens in pf->_dependents are invalid dependents + // directly goto on_pipe on the same line + else { + goto handle_token_dependency; + } + } + + // Every token within the deferral range needs to check + // if it can resolve dependencies on other tokens. 
+ if (pf->_token <= _longest_deferral) { + _resolve_token_dependencies(*pf); + } + } + else { + _on_pipe(*pf, rt); + } + + size_t c_f = pf->_pipe; + size_t n_f = (pf->_pipe + 1) % num_pipes(); + size_t n_l = (pf->_line + 1) % num_lines(); + + pf->_pipe = n_f; + + // ---- scheduling starts here ---- + // Notice that the shared variable f must not be changed after this + // point because it can result in data race due to the following + // condition: + // + // a -> b + // | | + // v v + // c -> d + // + // d will be spawned by either c or b, so if c changes f but b spawns d + // then data race on f will happen + + std::array<int, 2> retval; + size_t n = 0; + + // downward dependency + if(_pipes[c_f]->type() == PipeType::SERIAL && + _line(n_l, c_f).join_counter.fetch_sub( + 1, std::memory_order_acq_rel) == 1 + ) { + retval[n++] = 1; + } + + // forward dependency + if(_line(pf->_line, n_f).join_counter.fetch_sub( + 1, std::memory_order_acq_rel) == 1 + ) { + retval[n++] = 0; + } + + // notice that the task index starts from 1 + switch(n) { + case 2: { + rt.schedule(_tasks[n_l+1]); + goto pipeline; + } + case 1: { + if (retval[0] == 1) { + pf = &_pipeflows[n_l]; + } + goto pipeline; + } + } + }).name("rt-"s + std::to_string(l)); + + _tasks[0].precede(_tasks[l+1]); + } +} + +} // end of namespace tf ----------------------------------------------------- + + + + + diff --git a/myxpcs/include/taskflow_/algorithm/reduce.hpp b/myxpcs/include/taskflow_/algorithm/reduce.hpp new file mode 100644 index 0000000..5ee492b --- /dev/null +++ b/myxpcs/include/taskflow_/algorithm/reduce.hpp @@ -0,0 +1,443 @@ +#pragma once + +#include "launch.hpp" + +namespace tf { + +// Function: make_reduce_task +template <typename B, typename E, typename T, typename O, typename P = GuidedPartitioner> +TF_FORCE_INLINE auto make_reduce_task(B b, E e, T& init, O bop, P&& part = P()) { + + using B_t = std::decay_t<unwrap_ref_decay_t<B>>; + using E_t = std::decay_t<unwrap_ref_decay_t<E>>; + using namespace std::string_literals; + + return + [b, e, &r=init, bop, part=std::forward<P>(part)] (Runtime& rt) mutable { + + // fetch the iterator values + B_t beg = b; + E_t end = e; + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + for(; beg!=end; r = bop(r, *beg++)); + return; + } + + if(N < W) { + W = N; + } + + std::mutex mtx; + + // static partitioner + if constexpr(std::is_same_v<std::decay_t<P>, StaticPartitioner>) { + + size_t chunk_size; + + for(size_t w=0, curr_b=0; w<W && curr_b < N; ++w, curr_b += chunk_size) { + + // we force chunk size to be at least two because the temporary + // variable sum need to avoid copy at the first step + chunk_size = std::max(size_t{2}, part.adjusted_chunk_size(N, W, w)); + + launch_loop(W, w, rt, [=, &bop, &mtx, &r, &part] () mutable { + + std::advance(beg, curr_b); + + if(N - curr_b == 1) { + std::lock_guard<std::mutex> lock(mtx); + r = bop(r, *beg); + return; + } + + auto beg1 = beg++; + auto beg2 = beg++; + T sum = bop(*beg1, *beg2); + + // loop reduce + part.loop(N, W, curr_b, chunk_size, + [&, prev_e=curr_b+2](size_t part_b, size_t part_e) mutable { + + if(part_b > prev_e) { + std::advance(beg, part_b - prev_e); + } + else { + part_b = prev_e; + } + + for(size_t x=part_b; x<part_e; x++, beg++) { + sum = bop(sum, *beg); + } + prev_e = part_e; + } + ); + + // final reduce + std::lock_guard<std::mutex> lock(mtx); + r = bop(r, sum); + + }); + } + rt.corun_all(); + } + // 
dynamic partitioner + else { + std::atomic<size_t> next(0); + launch_loop(N, W, rt, next, part, [=, &bop, &mtx, &next, &r, &part] () mutable { + // pre-reduce + size_t s0 = next.fetch_add(2, std::memory_order_relaxed); + + if(s0 >= N) { + return; + } + + std::advance(beg, s0); + + if(N - s0 == 1) { + std::lock_guard<std::mutex> lock(mtx); + r = bop(r, *beg); + return; + } + + auto beg1 = beg++; + auto beg2 = beg++; + + T sum = bop(*beg1, *beg2); + + // loop reduce + part.loop(N, W, next, + [&, prev_e=s0+2](size_t curr_b, size_t curr_e) mutable { + std::advance(beg, curr_b - prev_e); + for(size_t x=curr_b; x<curr_e; x++, beg++) { + sum = bop(sum, *beg); + } + prev_e = curr_e; + } + ); + + // final reduce + std::lock_guard<std::mutex> lock(mtx); + r = bop(r, sum); + }); + } + }; +} + +// Function: make_transform_reduce_task +template < + typename B, typename E, typename T, typename BOP, typename UOP, + typename P = GuidedPartitioner +> +TF_FORCE_INLINE auto make_transform_reduce_task( + B b, E e, T& init, BOP bop, UOP uop, P&& part = P() +) { + + using B_t = std::decay_t<unwrap_ref_decay_t<B>>; + using E_t = std::decay_t<unwrap_ref_decay_t<E>>; + using namespace std::string_literals; + + return [b, e, &r=init, bop, uop, part=std::forward<P>(part)] (Runtime& rt) mutable { + + // fetch the iterator values + B_t beg = b; + E_t end = e; + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + for(; beg!=end; r = bop(std::move(r), uop(*beg++))); + return; + } + + if(N < W) { + W = N; + } + + std::mutex mtx; + + // static partitioner + if constexpr(std::is_same_v<std::decay_t<P>, StaticPartitioner>) { + + size_t chunk_size; + + for(size_t w=0, curr_b=0; w<W && curr_b < N; ++w, curr_b += chunk_size) { + + chunk_size = part.adjusted_chunk_size(N, W, w); + + launch_loop(W, w, rt, [=, &bop, &uop, &mtx, &r, &part] () mutable { + + std::advance(beg, curr_b); + + if(N - curr_b == 1) { + std::lock_guard<std::mutex> lock(mtx); + r = bop(std::move(r), uop(*beg)); + return; + } + + //auto beg1 = beg++; + //auto beg2 = beg++; + //T sum = bop(uop(*beg1), uop(*beg2)); + + T sum = (chunk_size == 1) ? uop(*beg++) : bop(uop(*beg++), uop(*beg++)); + + // loop reduce + part.loop(N, W, curr_b, chunk_size, + [&, prev_e=curr_b+(chunk_size == 1 ? 
1 : 2)] + (size_t part_b, size_t part_e) mutable { + if(part_b > prev_e) { + std::advance(beg, part_b - prev_e); + } + else { + part_b = prev_e; + } + for(size_t x=part_b; x<part_e; x++, beg++) { + sum = bop(std::move(sum), uop(*beg)); + } + prev_e = part_e; + } + ); + + // final reduce + std::lock_guard<std::mutex> lock(mtx); + r = bop(std::move(r), std::move(sum)); + + }); + } + + rt.corun_all(); + } + // dynamic partitioner + else { + std::atomic<size_t> next(0); + + launch_loop(N, W, rt, next, part, [=, &bop, &uop, &mtx, &next, &r, &part] () mutable { + + // pre-reduce + size_t s0 = next.fetch_add(2, std::memory_order_relaxed); + + if(s0 >= N) { + return; + } + + std::advance(beg, s0); + + if(N - s0 == 1) { + std::lock_guard<std::mutex> lock(mtx); + r = bop(std::move(r), uop(*beg)); + return; + } + + auto beg1 = beg++; + auto beg2 = beg++; + + T sum = bop(uop(*beg1), uop(*beg2)); + + // loop reduce + part.loop(N, W, next, + [&, prev_e=s0+2](size_t curr_b, size_t curr_e) mutable { + std::advance(beg, curr_b - prev_e); + for(size_t x=curr_b; x<curr_e; x++, beg++) { + sum = bop(std::move(sum), uop(*beg)); + } + prev_e = curr_e; + } + ); + + // final reduce + std::lock_guard<std::mutex> lock(mtx); + r = bop(std::move(r), std::move(sum)); + }); + } + }; +} + +// Function: make_transform_reduce_task with two binary operation +template < + typename B1, typename E1, typename B2, typename T, typename BOP_R, typename BOP_T, + typename P = GuidedPartitioner, + std::enable_if_t<!is_partitioner_v<std::decay_t<BOP_T>>, void>* = nullptr +> +TF_FORCE_INLINE auto make_transform_reduce_task( + B1 b1, E1 e1, B2 b2, T& init, BOP_R bop_r, BOP_T bop_t, P&& part = P() +) { + + using B1_t = std::decay_t<unwrap_ref_decay_t<B1>>; + using E1_t = std::decay_t<unwrap_ref_decay_t<E1>>; + using B2_t = std::decay_t<unwrap_ref_decay_t<B2>>; + using namespace std::string_literals; + + return + [b1, e1, b2, &r=init, bop_r, bop_t, part=std::forward<P>(part)] + (Runtime& rt) mutable { + + // fetch the iterator values + B1_t beg1 = b1; + E1_t end1 = e1; + B2_t beg2 = b2; + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg1, end1); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + for(; beg1!=end1; r = bop_r(std::move(r), bop_t(*beg1++, *beg2++))); + return; + } + + if(N < W) { + W = N; + } + + std::mutex mtx; + + // static partitioner + if constexpr(std::is_same_v<std::decay_t<P>, StaticPartitioner>) { + + size_t chunk_size; + + for(size_t w=0, curr_b=0; w<W && curr_b < N; ++w, curr_b += chunk_size) { + + chunk_size = part.adjusted_chunk_size(N, W, w); + + launch_loop(W, w, rt, [=, &bop_r, &bop_t, &mtx, &r, &part] () mutable { + + std::advance(beg1, curr_b); + std::advance(beg2, curr_b); + + if(N - curr_b == 1) { + std::lock_guard<std::mutex> lock(mtx); + r = bop_r(std::move(r), bop_t(*beg1, *beg2)); + return; + } + + T sum = (chunk_size == 1) ? bop_t(*beg1++, *beg2++) : + bop_r(bop_t(*beg1++, *beg2++), bop_t(*beg1++, *beg2++)); + + // loop reduce + part.loop(N, W, curr_b, chunk_size, + [&, prev_e=curr_b+(chunk_size == 1 ? 
1 : 2)] + (size_t part_b, size_t part_e) mutable { + if(part_b > prev_e) { + std::advance(beg1, part_b - prev_e); + std::advance(beg2, part_b - prev_e); + } + else { + part_b = prev_e; + } + for(size_t x=part_b; x<part_e; x++, beg1++, beg2++) { + sum = bop_r(std::move(sum), bop_t(*beg1, *beg2)); + } + prev_e = part_e; + } + ); + + // final reduce + std::lock_guard<std::mutex> lock(mtx); + r = bop_r(std::move(r), std::move(sum)); + + }); + } + + rt.corun_all(); + } + // dynamic partitioner + else { + std::atomic<size_t> next(0); + + launch_loop(N, W, rt, next, part, [=, &bop_r, &bop_t, &mtx, &next, &r, &part] () mutable { + + // pre-reduce + size_t s0 = next.fetch_add(2, std::memory_order_relaxed); + + if(s0 >= N) { + return; + } + + std::advance(beg1, s0); + std::advance(beg2, s0); + + if(N - s0 == 1) { + std::lock_guard<std::mutex> lock(mtx); + r = bop_r(std::move(r), bop_t(*beg1, *beg2)); + return; + } + + auto beg11 = beg1++; + auto beg12 = beg1++; + auto beg21 = beg2++; + auto beg22 = beg2++; + + T sum = bop_r(bop_t(*beg11, *beg21), bop_t(*beg12, *beg22)); + + // loop reduce + part.loop(N, W, next, + [&, prev_e=s0+2](size_t curr_b, size_t curr_e) mutable { + std::advance(beg1, curr_b - prev_e); + std::advance(beg2, curr_b - prev_e); + for(size_t x=curr_b; x<curr_e; x++, beg1++, beg2++) { + sum = bop_r(std::move(sum), bop_t(*beg1, *beg2)); + } + prev_e = curr_e; + } + ); + + // final reduce + std::lock_guard<std::mutex> lock(mtx); + r = bop_r(std::move(r), std::move(sum)); + }); + } + }; +} + +// ---------------------------------------------------------------------------- +// default reduction +// ---------------------------------------------------------------------------- + +// Function: reduce +template <typename B, typename E, typename T, typename O, typename P> +Task FlowBuilder::reduce(B beg, E end, T& init, O bop, P&& part) { + return emplace(make_reduce_task(beg, end, init, bop, std::forward<P>(part))); +} + +// ---------------------------------------------------------------------------- +// default transform and reduction +// ---------------------------------------------------------------------------- + +// Function: transform_reduce +template <typename B, typename E, typename T, typename BOP, typename UOP, typename P, + std::enable_if_t<is_partitioner_v<std::decay_t<P>>, void>* +> +Task FlowBuilder::transform_reduce( + B beg, E end, T& init, BOP bop, UOP uop, P&& part +) { + return emplace(make_transform_reduce_task( + beg, end, init, bop, uop, std::forward<P>(part) + )); +} + +// Function: transform_reduce +template < + typename B1, typename E1, typename B2, typename T, typename BOP_R, typename BOP_T, + typename P, + std::enable_if_t<!is_partitioner_v<std::decay_t<BOP_T>>, void>* +> +Task FlowBuilder::transform_reduce( + B1 beg1, E1 end1, B2 beg2, T& init, BOP_R bop_r, BOP_T bop_t, P&& part +) { + return emplace(make_transform_reduce_task( + beg1, end1, beg2, init, bop_r, bop_t, std::forward<P>(part) + )); +} + +} // end of namespace tf ----------------------------------------------------- + + + + diff --git a/myxpcs/include/taskflow_/algorithm/scan.hpp b/myxpcs/include/taskflow_/algorithm/scan.hpp new file mode 100644 index 0000000..5a7f01b --- /dev/null +++ b/myxpcs/include/taskflow_/algorithm/scan.hpp @@ -0,0 +1,617 @@ +#pragma once + +#include "launch.hpp" + +namespace tf { + +namespace detail { + +// Function: scan_loop +template <typename Iterator, typename BufferT, typename B> +TF_FORCE_INLINE void scan_loop( + tf::Runtime& rt, + std::atomic<size_t>& counter, + BufferT& 
buf, + B&& bop, + Iterator d_beg, + size_t W, + size_t w, + size_t chunk_size +){ + // whoever finishes the last performs global scan + if(counter.fetch_add(1, std::memory_order_acq_rel) == W-1) { + for(size_t i=1; i<buf.size(); i++) { + buf[i].data = bop(buf[i-1].data, buf[i].data); + } + counter.store(0, std::memory_order_release); + } + + // first worker no need to do any work + if(w==0) { + return; + } + + // need to do public corun because multiple workers can call this + rt.executor().corun_until([&counter](){ + return counter.load(std::memory_order_acquire) == 0; + }); + + // block addup + for(size_t i=0; i<chunk_size; i++) { + *d_beg++ = bop(buf[w-1].data, *d_beg); + } +} + +} // end of namespace tf::detail --------------------------------------------- + + +// Function: make_inclusive_scan_task +template <typename B, typename E, typename D, typename BOP> +TF_FORCE_INLINE auto make_inclusive_scan_task(B first, E last, D d_first, BOP bop) { + + using B_t = std::decay_t<unwrap_ref_decay_t<B>>; + using E_t = std::decay_t<unwrap_ref_decay_t<E>>; + using D_t = std::decay_t<unwrap_ref_decay_t<D>>; + using value_type = typename std::iterator_traits<B_t>::value_type; + using namespace std::string_literals; + + return [=] (Runtime& rt) mutable { + + // fetch the stateful values + B_t s_beg = first; + E_t s_end = last; + D_t d_beg = d_first; + + if(s_beg == s_end) { + return; + } + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(s_beg, s_end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= 2) { + std::inclusive_scan(s_beg, s_end, d_beg, bop); + return; + } + + if(N < W) { + W = N; + } + + std::vector<CachelineAligned<value_type>> buf(W); + std::atomic<size_t> counter(0); + + size_t Q = N/W; + size_t R = N%W; + + //auto orig_d_beg = d_beg; + //ExecutionPolicy<StaticPartitioner> policy; + + for(size_t w=0, curr_b=0, chunk_size; w<W && curr_b < N; ++w) { + + chunk_size = std::min(Q + (w < R), N - curr_b); + + // block scan + launch_loop(W, w, rt, [=, &rt, &bop, &buf, &counter] () mutable { + + auto result = d_beg; + + // local scan per worker + auto& init = buf[w].data; + *d_beg++ = init = *s_beg++; + + for(size_t i=1; i<chunk_size; i++){ + *d_beg++ = init = bop(init, *s_beg++); + } + + // block scan + detail::scan_loop(rt, counter, buf, bop, result, W, w, chunk_size); + + //size_t offset = R ? Q + 1 : Q; + //size_t rest = N - offset; + //size_t rest_Q = rest / W; + //size_t rest_R = rest % W; + // + //chunk_size = policy.chunk_size() == 0 ? + // rest_Q + (w < rest_R) : policy.chunk_size(); + // + //size_t curr_b = policy.chunk_size() == 0 ? + // offset + (w<rest_R ? w*(rest_Q + 1) : rest_R + w*rest_Q) : + // offset + w*policy.chunk_size(); + + //policy(N, W, curr_b, chunk_size, + // [&, prev_e=size_t{0}](size_t curr_b, size_t curr_e) mutable { + // std::advance(orig_d_beg, curr_b - prev_e); + // for(size_t x = curr_b; x<curr_e; x++) { + // size_t j = x < (Q+1)*R ? 
x/(Q+1) : (x-(Q+1)*R)/Q + R; + // *orig_d_beg++ = bop(buf[j-1].data, *orig_d_beg); + // } + // prev_e = curr_e; + // } + //); + }); + + std::advance(s_beg, chunk_size); + std::advance(d_beg, chunk_size); + curr_b += chunk_size; + } + + rt.corun_all(); + }; +} + +// Function: make_inclusive_scan_task +template <typename B, typename E, typename D, typename BOP, typename T> +TF_FORCE_INLINE auto make_inclusive_scan_task(B first, E last, D d_first, BOP bop, T init) { + + using B_t = std::decay_t<unwrap_ref_decay_t<B>>; + using E_t = std::decay_t<unwrap_ref_decay_t<E>>; + using D_t = std::decay_t<unwrap_ref_decay_t<D>>; + using value_type = typename std::iterator_traits<B_t>::value_type; + using namespace std::string_literals; + + return [=] (Runtime& rt) mutable { + + // fetch the stateful values + B_t s_beg = first; + E_t s_end = last; + D_t d_beg = d_first; + + if(s_beg == s_end) { + return; + } + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(s_beg, s_end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= 2) { + std::inclusive_scan(s_beg, s_end, d_beg, bop, init); + return; + } + + if(N < W) { + W = N; + } + + std::vector<CachelineAligned<value_type>> buf(W); + std::atomic<size_t> counter(0); + + // set up the initial value for the first worker + buf[0].data = std::move(init); + + size_t Q = N/W; + size_t R = N%W; + + for(size_t w=0, curr_b=0, chunk_size; w<W && curr_b < N; ++w) { + + chunk_size = std::min(Q + (w < R), N - curr_b); + + // block scan + launch_loop(W, w, rt, [=, &rt, &bop, &buf, &counter] () mutable { + + auto result = d_beg; + + // local scan per worker + auto& local = buf[w].data; + *d_beg++ = local = (w == 0) ? bop(local, *s_beg++) : *s_beg++; + + for(size_t i=1; i<chunk_size; i++){ + *d_beg++ = local = bop(local, *s_beg++); + } + + // block scan + detail::scan_loop(rt, counter, buf, bop, result, W, w, chunk_size); + }); + + std::advance(s_beg, chunk_size); + std::advance(d_beg, chunk_size); + curr_b += chunk_size; + } + + rt.corun_all(); + }; +} + +// ---------------------------------------------------------------------------- +// Transform Inclusive Scan +// ---------------------------------------------------------------------------- + +// Function: transform_inclusive_scan +template <typename B, typename E, typename D, typename BOP, typename UOP> +TF_FORCE_INLINE auto make_transform_inclusive_scan_task( + B first, E last, D d_first, BOP bop, UOP uop +) { + + using B_t = std::decay_t<unwrap_ref_decay_t<B>>; + using E_t = std::decay_t<unwrap_ref_decay_t<E>>; + using D_t = std::decay_t<unwrap_ref_decay_t<D>>; + using value_type = typename std::iterator_traits<B_t>::value_type; + using namespace std::string_literals; + + return [=] (Runtime& rt) mutable { + + // fetch the stateful values + B_t s_beg = first; + E_t s_end = last; + D_t d_beg = d_first; + + if(s_beg == s_end) { + return; + } + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(s_beg, s_end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= 2) { + std::transform_inclusive_scan(s_beg, s_end, d_beg, bop, uop); + return; + } + + if(N < W) { + W = N; + } + + std::vector<CachelineAligned<value_type>> buf(W); + std::atomic<size_t> counter(0); + + size_t Q = N/W; + size_t R = N%W; + + for(size_t w=0, curr_b=0, chunk_size; w<W && curr_b < N; ++w) { + + chunk_size = std::min(Q + (w < R), N - curr_b); + + // block scan + launch_loop(W, w, rt, [=, &rt, &bop, &uop, &buf, &counter] () mutable { + + auto result = d_beg; + + // local scan 
per worker + auto& init = buf[w].data; + *d_beg++ = init = uop(*s_beg++); + + for(size_t i=1; i<chunk_size; i++){ + *d_beg++ = init = bop(init, uop(*s_beg++)); + } + + // block scan + detail::scan_loop(rt, counter, buf, bop, result, W, w, chunk_size); + }); + + std::advance(s_beg, chunk_size); + std::advance(d_beg, chunk_size); + curr_b += chunk_size; + } + + rt.corun_all(); + }; +} + +// Function: transform_inclusive_scan +template <typename B, typename E, typename D, typename BOP, typename UOP, typename T> +TF_FORCE_INLINE auto make_transform_inclusive_scan_task( + B first, E last, D d_first, BOP bop, UOP uop, T init +) { + + using B_t = std::decay_t<unwrap_ref_decay_t<B>>; + using E_t = std::decay_t<unwrap_ref_decay_t<E>>; + using D_t = std::decay_t<unwrap_ref_decay_t<D>>; + using value_type = typename std::iterator_traits<B_t>::value_type; + using namespace std::string_literals; + + return [=] (Runtime& rt) mutable { + + // fetch the stateful values + B_t s_beg = first; + E_t s_end = last; + D_t d_beg = d_first; + + if(s_beg == s_end) { + return; + } + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(s_beg, s_end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= 2) { + std::transform_inclusive_scan(s_beg, s_end, d_beg, bop, uop, init); + return; + } + + if(N < W) { + W = N; + } + + std::vector<CachelineAligned<value_type>> buf(W); + std::atomic<size_t> counter(0); + + // set up the initial value for the first worker + buf[0].data = std::move(init); + + size_t Q = N/W; + size_t R = N%W; + + for(size_t w=0, curr_b=0, chunk_size; w<W && curr_b < N; ++w) { + + chunk_size = std::min(Q + (w < R), N - curr_b); + + // block scan + launch_loop(W, w, rt, [=, &rt, &bop, &uop, &buf, &counter] () mutable { + + auto result = d_beg; + + // local scan per worker + auto& local = buf[w].data; + *d_beg++ = local = (w == 0) ? 
bop(local, uop(*s_beg++)) : uop(*s_beg++); + + for(size_t i=1; i<chunk_size; i++){ + *d_beg++ = local = bop(local, uop(*s_beg++)); + } + + // block scan + detail::scan_loop(rt, counter, buf, bop, result, W, w, chunk_size); + }); + + std::advance(s_beg, chunk_size); + std::advance(d_beg, chunk_size); + curr_b += chunk_size; + } + + rt.corun_all(); + + }; +} + +// ---------------------------------------------------------------------------- +// Exclusive Scan +// ---------------------------------------------------------------------------- + +// Function: make_exclusive_scan_task +template <typename B, typename E, typename D, typename T, typename BOP> +TF_FORCE_INLINE auto make_exclusive_scan_task( + B first, E last, D d_first, T init, BOP bop +) { + + using B_t = std::decay_t<unwrap_ref_decay_t<B>>; + using E_t = std::decay_t<unwrap_ref_decay_t<E>>; + using D_t = std::decay_t<unwrap_ref_decay_t<D>>; + using value_type = typename std::iterator_traits<B_t>::value_type; + using namespace std::string_literals; + + return [=] (Runtime& rt) mutable { + + // fetch the stateful values + B_t s_beg = first; + E_t s_end = last; + D_t d_beg = d_first; + + if(s_beg == s_end) { + return; + } + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(s_beg, s_end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= 2) { + std::exclusive_scan(s_beg, s_end, d_beg, init, bop); + return; + } + + if(N < W) { + W = N; + } + + std::vector<CachelineAligned<value_type>> buf(W); + std::atomic<size_t> counter(0); + + size_t Q = N/W; + size_t R = N%W; + + // fetch the init value + auto s_beg_temp = s_beg; + for(size_t w=0, curr_b=0, chunk_size; w<W && curr_b < N; ++w) { + chunk_size = std::min(Q + (w<R), N - curr_b); + buf[w].data = w ? *s_beg_temp : std::move(init); + std::advance(s_beg_temp, chunk_size - !w); + curr_b += chunk_size; + } + + for(size_t w=0, curr_b=0, chunk_size; w<W && curr_b < N; ++w) { + + chunk_size = std::min(Q + (w < R), N - curr_b); + + // block scan + launch_loop(W, w, rt, [=, &rt, &bop, &buf, &counter] () mutable { + + auto result = d_beg; + + // local scan per worker + auto& local = buf[w].data; + + for(size_t i=1; i<chunk_size; i++) { + auto v = local; + local = bop(local, *s_beg++); + *d_beg++ = std::move(v); + } + *d_beg++ = local; + + // block scan + detail::scan_loop(rt, counter, buf, bop, result, W, w, chunk_size); + }); + + std::advance(s_beg, chunk_size); + std::advance(d_beg, chunk_size); + curr_b += chunk_size; + } + + rt.corun_all(); + + }; +} + +// ---------------------------------------------------------------------------- +// Transform Exclusive Scan +// ---------------------------------------------------------------------------- + +// Function: +template <typename B, typename E, typename D, typename T, typename BOP, typename UOP> +TF_FORCE_INLINE auto make_transform_exclusive_scan_task( + B first, E last, D d_first, T init, BOP bop, UOP uop +) { + + using B_t = std::decay_t<unwrap_ref_decay_t<B>>; + using E_t = std::decay_t<unwrap_ref_decay_t<E>>; + using D_t = std::decay_t<unwrap_ref_decay_t<D>>; + using value_type = typename std::iterator_traits<B_t>::value_type; + using namespace std::string_literals; + + return [=] (Runtime& rt) mutable { + + // fetch the stateful values + B_t s_beg = first; + E_t s_end = last; + D_t d_beg = d_first; + + if(s_beg == s_end) { + return; + } + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(s_beg, s_end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= 2) { + 
std::transform_exclusive_scan(s_beg, s_end, d_beg, init, bop, uop); + return; + } + + if(N < W) { + W = N; + } + + std::vector<CachelineAligned<value_type>> buf(W); + std::atomic<size_t> counter(0); + + size_t Q = N/W; + size_t R = N%W; + + // fetch the init value + auto s_beg_temp = s_beg; + for(size_t w=0, curr_b=0, chunk_size; w<W && curr_b < N; ++w) { + chunk_size = std::min(Q + (w<R), N - curr_b); + buf[w].data = w ? uop(*s_beg_temp) : std::move(init); + std::advance(s_beg_temp, chunk_size - !w); + curr_b += chunk_size; + } + + for(size_t w=0, curr_b=0, chunk_size; w<W && curr_b < N; ++w) { + + chunk_size = std::min(Q + (w < R), N - curr_b); + + // block scan + launch_loop(W, w, rt, [=, &rt, &bop, &uop, &buf, &counter] () mutable { + + auto result = d_beg; + + // local scan per worker + auto& local = buf[w].data; + + for(size_t i=1; i<chunk_size; i++) { + auto v = local; + local = bop(local, uop(*s_beg++)); + *d_beg++ = std::move(v); + } + *d_beg++ = local; + + // block scan + detail::scan_loop(rt, counter, buf, bop, result, W, w, chunk_size); + }); + + std::advance(s_beg, chunk_size); + std::advance(d_beg, chunk_size); + curr_b += chunk_size; + } + + rt.corun_all(); + + }; +} + + +// ---------------------------------------------------------------------------- +// Inclusive Scan +// ---------------------------------------------------------------------------- + +// Function: inclusive_scan +template <typename B, typename E, typename D, typename BOP> +Task FlowBuilder::inclusive_scan(B first, E last, D d_first, BOP bop) { + return emplace(make_inclusive_scan_task( + first, last, d_first, bop + )); +} + +// Function: inclusive_scan +template <typename B, typename E, typename D, typename BOP, typename T> +Task FlowBuilder::inclusive_scan(B first, E last, D d_first, BOP bop, T init) { + return emplace(make_inclusive_scan_task( + first, last, d_first, bop, init + )); +} + +// ---------------------------------------------------------------------------- +// Transform Inclusive Scan +// ---------------------------------------------------------------------------- + +// Function: transform_inclusive_scan +template <typename B, typename E, typename D, typename BOP, typename UOP> +Task FlowBuilder::transform_inclusive_scan( + B first, E last, D d_first, BOP bop, UOP uop +) { + return emplace(make_transform_inclusive_scan_task( + first, last, d_first, bop, uop + )); +} + +// Function: transform_inclusive_scan +template <typename B, typename E, typename D, typename BOP, typename UOP, typename T> +Task FlowBuilder::transform_inclusive_scan( + B first, E last, D d_first, BOP bop, UOP uop, T init +) { + return emplace(make_transform_inclusive_scan_task( + first, last, d_first, bop, uop, init + )); +} + +// ---------------------------------------------------------------------------- +// Exclusive Scan +// ---------------------------------------------------------------------------- + +// Function: exclusive_scan +template <typename B, typename E, typename D, typename T, typename BOP> +Task FlowBuilder::exclusive_scan(B first, E last, D d_first, T init, BOP bop) { + return emplace(make_exclusive_scan_task( + first, last, d_first, init, bop + )); +} + +// ---------------------------------------------------------------------------- +// Transform Exclusive Scan +// ---------------------------------------------------------------------------- + +// Function: transform_exclusive_scan +template <typename B, typename E, typename D, typename T, typename BOP, typename UOP> +Task 
FlowBuilder::transform_exclusive_scan( + B first, E last, D d_first, T init, BOP bop, UOP uop +) { + return emplace(make_transform_exclusive_scan_task( + first, last, d_first, init, bop, uop + )); +} + +} // end of namespace tf ----------------------------------------------------- + diff --git a/myxpcs/include/taskflow_/algorithm/sort.hpp b/myxpcs/include/taskflow_/algorithm/sort.hpp new file mode 100644 index 0000000..4460f8f --- /dev/null +++ b/myxpcs/include/taskflow_/algorithm/sort.hpp @@ -0,0 +1,661 @@ +#pragma once + +#include "../core/async.hpp" + +namespace tf::detail { + +// threshold whether or not to perform parallel sort +template <typename I> +constexpr size_t parallel_sort_cutoff() { + + //using value_type = std::decay_t<decltype(*std::declval<I>())>; + using value_type = typename std::iterator_traits<I>::value_type; + + constexpr size_t object_size = sizeof(value_type); + + if constexpr(std::is_same_v<value_type, std::string>) { + return 65536 / sizeof(std::string); + } + else { + if constexpr(object_size < 16) return 4096; + else if constexpr(object_size < 32) return 2048; + else if constexpr(object_size < 64) return 1024; + else if constexpr(object_size < 128) return 768; + else if constexpr(object_size < 256) return 512; + else if constexpr(object_size < 512) return 256; + else return 128; + } +} + +// ---------------------------------------------------------------------------- +// pattern-defeating quick sort (pdqsort) +// https://github.com/orlp/pdqsort/ +// ---------------------------------------------------------------------------- + +template<typename T, size_t cacheline_size=64> +inline T* align_cacheline(T* p) { +#if defined(UINTPTR_MAX) && __cplusplus >= 201103L + std::uintptr_t ip = reinterpret_cast<std::uintptr_t>(p); +#else + std::size_t ip = reinterpret_cast<std::size_t>(p); +#endif + ip = (ip + cacheline_size - 1) & -cacheline_size; + return reinterpret_cast<T*>(ip); +} + +template<typename Iter> +inline void swap_offsets( + Iter first, Iter last, + unsigned char* offsets_l, unsigned char* offsets_r, + size_t num, bool use_swaps +) { + typedef typename std::iterator_traits<Iter>::value_type T; + if (use_swaps) { + // This case is needed for the descending distribution, where we need + // to have proper swapping for pdqsort to remain O(n). + for (size_t i = 0; i < num; ++i) { + std::iter_swap(first + offsets_l[i], last - offsets_r[i]); + } + } else if (num > 0) { + Iter l = first + offsets_l[0]; Iter r = last - offsets_r[0]; + T tmp(std::move(*l)); *l = std::move(*r); + for (size_t i = 1; i < num; ++i) { + l = first + offsets_l[i]; *r = std::move(*l); + r = last - offsets_r[i]; *l = std::move(*r); + } + *r = std::move(tmp); + } +} + +// Sorts [begin, end) using insertion sort with the given comparison function. +template<typename RandItr, typename Compare> +void insertion_sort(RandItr begin, RandItr end, Compare comp) { + + using T = typename std::iterator_traits<RandItr>::value_type; + + if (begin == end) { + return; + } + + for (RandItr cur = begin + 1; cur != end; ++cur) { + + RandItr shift = cur; + RandItr shift_1 = cur - 1; + + // Compare first to avoid 2 moves for an element + // already positioned correctly. + if (comp(*shift, *shift_1)) { + T tmp = std::move(*shift); + do { + *shift-- = std::move(*shift_1); + }while (shift != begin && comp(tmp, *--shift_1)); + *shift = std::move(tmp); + } + } +} + +// Sorts [begin, end) using insertion sort with the given comparison function. 
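+// Unlike insertion_sort above, the inner shifting loop is not guarded against running past begin.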
+// Assumes *(begin - 1) is an element smaller than or equal to any element +// in [begin, end). +template<typename RandItr, typename Compare> +void unguarded_insertion_sort(RandItr begin, RandItr end, Compare comp) { + + using T = typename std::iterator_traits<RandItr>::value_type; + + if (begin == end) { + return; + } + + for (RandItr cur = begin + 1; cur != end; ++cur) { + RandItr shift = cur; + RandItr shift_1 = cur - 1; + + // Compare first so we can avoid 2 moves + // for an element already positioned correctly. + if (comp(*shift, *shift_1)) { + T tmp = std::move(*shift); + + do { + *shift-- = std::move(*shift_1); + }while (comp(tmp, *--shift_1)); + + *shift = std::move(tmp); + } + } +} + +// Attempts to use insertion sort on [begin, end). +// Will return false if more than +// partial_insertion_sort_limit elements were moved, +// and abort sorting. Otherwise it will successfully sort and return true. +template<typename RandItr, typename Compare> +bool partial_insertion_sort(RandItr begin, RandItr end, Compare comp) { + + using T = typename std::iterator_traits<RandItr>::value_type; + using D = typename std::iterator_traits<RandItr>::difference_type; + + // When we detect an already sorted partition, attempt an insertion sort + // that allows this amount of element moves before giving up. + constexpr auto partial_insertion_sort_limit = D{8}; + + if (begin == end) return true; + + auto limit = D{0}; + + for (RandItr cur = begin + 1; cur != end; ++cur) { + + if (limit > partial_insertion_sort_limit) { + return false; + } + + RandItr shift = cur; + RandItr shift_1 = cur - 1; + + // Compare first so we can avoid 2 moves + // for an element already positioned correctly. + if (comp(*shift, *shift_1)) { + T tmp = std::move(*shift); + + do { + *shift-- = std::move(*shift_1); + }while (shift != begin && comp(tmp, *--shift_1)); + + *shift = std::move(tmp); + limit += cur - shift; + } + } + + return true; +} + +// Partitions [begin, end) around pivot *begin using comparison function comp. Elements equal +// to the pivot are put in the right-hand partition. Returns the position of the pivot after +// partitioning and whether the passed sequence already was correctly partitioned. Assumes the +// pivot is a median of at least 3 elements and that [begin, end) is at least +// insertion_sort_threshold long. Uses branchless partitioning. +template<typename Iter, typename Compare> +std::pair<Iter, bool> partition_right_branchless(Iter begin, Iter end, Compare comp) { + + typedef typename std::iterator_traits<Iter>::value_type T; + + constexpr size_t block_size = 64; + constexpr size_t cacheline_size = 64; + + // Move pivot into local for speed. + T pivot(std::move(*begin)); + Iter first = begin; + Iter last = end; + + // Find the first element greater than or equal than the pivot (the median of 3 guarantees + // this exists). + while (comp(*++first, pivot)); + + // Find the first element strictly smaller than the pivot. We have to guard this search if + // there was no element before *first. + if (first - 1 == begin) while (first < last && !comp(*--last, pivot)); + else while ( !comp(*--last, pivot)); + + // If the first pair of elements that should be swapped to partition are the same element, + // the passed in sequence already was correctly partitioned. 
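+  // Remember this so pdqsort can attempt a cheap partial insertion sort on an already-partitioned range.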
+ bool already_partitioned = first >= last; + if (!already_partitioned) { + std::iter_swap(first, last); + ++first; + + // The following branchless partitioning is derived from "BlockQuicksort: How Branch + // Mispredictions don't affect Quicksort" by Stefan Edelkamp and Armin Weiss, but + // heavily micro-optimized. + unsigned char offsets_l_storage[block_size + cacheline_size]; + unsigned char offsets_r_storage[block_size + cacheline_size]; + unsigned char* offsets_l = align_cacheline(offsets_l_storage); + unsigned char* offsets_r = align_cacheline(offsets_r_storage); + + Iter offsets_l_base = first; + Iter offsets_r_base = last; + size_t num_l, num_r, start_l, start_r; + num_l = num_r = start_l = start_r = 0; + + while (first < last) { + // Fill up offset blocks with elements that are on the wrong side. + // First we determine how much elements are considered for each offset block. + size_t num_unknown = last - first; + size_t left_split = num_l == 0 ? (num_r == 0 ? num_unknown / 2 : num_unknown) : 0; + size_t right_split = num_r == 0 ? (num_unknown - left_split) : 0; + + // Fill the offset blocks. + if (left_split >= block_size) { + for (size_t i = 0; i < block_size;) { + offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; + offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; + offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; + offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; + offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; + offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; + offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; + offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; + } + } else { + for (size_t i = 0; i < left_split;) { + offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; + } + } + + if (right_split >= block_size) { + for (size_t i = 0; i < block_size;) { + offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); + } + } else { + for (size_t i = 0; i < right_split;) { + offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); + } + } + + // Swap elements and update block sizes and first/last boundaries. + size_t num = std::min(num_l, num_r); + swap_offsets( + offsets_l_base, offsets_r_base, + offsets_l + start_l, offsets_r + start_r, + num, num_l == num_r + ); + num_l -= num; num_r -= num; + start_l += num; start_r += num; + + if (num_l == 0) { + start_l = 0; + offsets_l_base = first; + } + + if (num_r == 0) { + start_r = 0; + offsets_r_base = last; + } + } + + // We have now fully identified [first, last)'s proper position. Swap the last elements. + if (num_l) { + offsets_l += start_l; + while (num_l--) std::iter_swap(offsets_l_base + offsets_l[num_l], --last); + first = last; + } + if (num_r) { + offsets_r += start_r; + while (num_r--) std::iter_swap(offsets_r_base - offsets_r[num_r], first), ++first; + last = first; + } + } + + // Put the pivot in the right place. 
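+  // (first now points at the first element of the >= partition, so the pivot's final slot is first - 1.)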
+ Iter pivot_pos = first - 1; + *begin = std::move(*pivot_pos); + *pivot_pos = std::move(pivot); + + return std::make_pair(pivot_pos, already_partitioned); +} + +// Partitions [begin, end) around pivot *begin using comparison function comp. +// Elements equal to the pivot are put in the right-hand partition. +// Returns the position of the pivot after partitioning and whether the passed +// sequence already was correctly partitioned. +// Assumes the pivot is a median of at least 3 elements and that [begin, end) +// is at least insertion_sort_threshold long. +template<typename Iter, typename Compare> +std::pair<Iter, bool> partition_right(Iter begin, Iter end, Compare comp) { + + using T = typename std::iterator_traits<Iter>::value_type; + + // Move pivot into local for speed. + T pivot(std::move(*begin)); + + Iter first = begin; + Iter last = end; + + // Find the first element greater than or equal than the pivot + // (the median of 3 guarantees/ this exists). + while (comp(*++first, pivot)); + + // Find the first element strictly smaller than the pivot. + // We have to guard this search if there was no element before *first. + if (first - 1 == begin) while (first < last && !comp(*--last, pivot)); + else while (!comp(*--last, pivot)); + + // If the first pair of elements that should be swapped to partition + // are the same element, the passed in sequence already was correctly + // partitioned. + bool already_partitioned = first >= last; + + // Keep swapping pairs of elements that are on the wrong side of the pivot. + // Previously swapped pairs guard the searches, + // which is why the first iteration is special-cased above. + while (first < last) { + std::iter_swap(first, last); + while (comp(*++first, pivot)); + while (!comp(*--last, pivot)); + } + + // Put the pivot in the right place. + Iter pivot_pos = first - 1; + *begin = std::move(*pivot_pos); + *pivot_pos = std::move(pivot); + + return std::make_pair(pivot_pos, already_partitioned); +} + +// Similar function to the one above, except elements equal to the pivot +// are put to the left of the pivot and it doesn't check or return +// if the passed sequence already was partitioned. +// Since this is rarely used (the many equal case), +// and in that case pdqsort already has O(n) performance, +// no block quicksort is applied here for simplicity. +template<typename RandItr, typename Compare> +RandItr partition_left(RandItr begin, RandItr end, Compare comp) { + + using T = typename std::iterator_traits<RandItr>::value_type; + + T pivot(std::move(*begin)); + + RandItr first = begin; + RandItr last = end; + + while (comp(pivot, *--last)); + + if (last + 1 == end) { + while (first < last && !comp(pivot, *++first)); + } + else { + while (!comp(pivot, *++first)); + } + + while (first < last) { + std::iter_swap(first, last); + while (comp(pivot, *--last)); + while (!comp(pivot, *++first)); + } + + RandItr pivot_pos = last; + *begin = std::move(*pivot_pos); + *pivot_pos = std::move(pivot); + + return pivot_pos; +} + +template<typename Iter, typename Compare, bool Branchless> +void parallel_pdqsort( + tf::Runtime& rt, + Iter begin, Iter end, Compare comp, + int bad_allowed, bool leftmost = true +) { + + // Partitions below this size are sorted sequentially + constexpr auto cutoff = parallel_sort_cutoff<Iter>(); + + // Partitions below this size are sorted using insertion sort + constexpr auto insertion_sort_threshold = 24; + + // Partitions above this size use Tukey's ninther to select the pivot. 
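+  // (Tukey's ninther: the median of three medians, each taken over a triple of elements, giving a cheap pseudomedian of nine.)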
+ constexpr auto ninther_threshold = 128; + + //using diff_t = typename std::iterator_traits<Iter>::difference_type; + + // Use a while loop for tail recursion elimination. + while (true) { + + //diff_t size = end - begin; + size_t size = end - begin; + + // Insertion sort is faster for small arrays. + if (size < insertion_sort_threshold) { + if (leftmost) { + insertion_sort(begin, end, comp); + } + else { + unguarded_insertion_sort(begin, end, comp); + } + return; + } + + if(size <= cutoff) { + std::sort(begin, end, comp); + return; + } + + // Choose pivot as median of 3 or pseudomedian of 9. + //diff_t s2 = size / 2; + size_t s2 = size >> 1; + if (size > ninther_threshold) { + sort3(begin, begin + s2, end - 1, comp); + sort3(begin + 1, begin + (s2 - 1), end - 2, comp); + sort3(begin + 2, begin + (s2 + 1), end - 3, comp); + sort3(begin + (s2 - 1), begin + s2, begin + (s2 + 1), comp); + std::iter_swap(begin, begin + s2); + } + else { + sort3(begin + s2, begin, end - 1, comp); + } + + // If *(begin - 1) is the end of the right partition + // of a previous partition operation, there is no element in [begin, end) + // that is smaller than *(begin - 1). + // Then if our pivot compares equal to *(begin - 1) we change strategy, + // putting equal elements in the left partition, + // greater elements in the right partition. + // We do not have to recurse on the left partition, + // since it's sorted (all equal). + if (!leftmost && !comp(*(begin - 1), *begin)) { + begin = partition_left(begin, end, comp) + 1; + continue; + } + + // Partition and get results. + const auto pair = Branchless ? partition_right_branchless(begin, end, comp) : + partition_right(begin, end, comp); + + const auto pivot_pos = pair.first; + const auto already_partitioned = pair.second; + + // Check for a highly unbalanced partition. + //diff_t l_size = pivot_pos - begin; + //diff_t r_size = end - (pivot_pos + 1); + const size_t l_size = pivot_pos - begin; + const size_t r_size = end - (pivot_pos + 1); + const bool highly_unbalanced = l_size < size / 8 || r_size < size / 8; + + // If we got a highly unbalanced partition we shuffle elements + // to break many patterns. + if (highly_unbalanced) { + // If we had too many bad partitions, switch to heapsort + // to guarantee O(n log n). + if (--bad_allowed == 0) { + std::make_heap(begin, end, comp); + std::sort_heap(begin, end, comp); + return; + } + + if (l_size >= insertion_sort_threshold) { + std::iter_swap(begin, begin + l_size / 4); + std::iter_swap(pivot_pos - 1, pivot_pos - l_size / 4); + if (l_size > ninther_threshold) { + std::iter_swap(begin + 1, begin + (l_size / 4 + 1)); + std::iter_swap(begin + 2, begin + (l_size / 4 + 2)); + std::iter_swap(pivot_pos - 2, pivot_pos - (l_size / 4 + 1)); + std::iter_swap(pivot_pos - 3, pivot_pos - (l_size / 4 + 2)); + } + } + + if (r_size >= insertion_sort_threshold) { + std::iter_swap(pivot_pos + 1, pivot_pos + (1 + r_size / 4)); + std::iter_swap(end - 1, end - r_size / 4); + if (r_size > ninther_threshold) { + std::iter_swap(pivot_pos + 2, pivot_pos + (2 + r_size / 4)); + std::iter_swap(pivot_pos + 3, pivot_pos + (3 + r_size / 4)); + std::iter_swap(end - 2, end - (1 + r_size / 4)); + std::iter_swap(end - 3, end - (2 + r_size / 4)); + } + } + } + // decently balanced + else { + // sequence try to use insertion sort. 
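+      // partial_insertion_sort gives up after a bounded number of element moves; if either half needs more than that, fall through and keep partitioning.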
+ if (already_partitioned && + partial_insertion_sort(begin, pivot_pos, comp) && + partial_insertion_sort(pivot_pos + 1, end, comp) + ) { + return; + } + } + + // Sort the left partition first using recursion and + // do tail recursion elimination for the right-hand partition. + rt.silent_async( + [&rt, begin, pivot_pos, comp, bad_allowed, leftmost] () mutable { + parallel_pdqsort<Iter, Compare, Branchless>( + rt, begin, pivot_pos, comp, bad_allowed, leftmost + ); + } + ); + begin = pivot_pos + 1; + leftmost = false; + } +} + +// ---------------------------------------------------------------------------- +// 3-way quick sort +// ---------------------------------------------------------------------------- + +// 3-way quick sort +template <typename RandItr, typename C> +void parallel_3wqsort(tf::Runtime& rt, RandItr first, RandItr last, C compare) { + + using namespace std::string_literals; + + constexpr auto cutoff = parallel_sort_cutoff<RandItr>(); + + sort_partition: + + if(static_cast<size_t>(last - first) < cutoff) { + std::sort(first, last+1, compare); + return; + } + + auto m = pseudo_median_of_nine(first, last, compare); + + if(m != first) { + std::iter_swap(first, m); + } + + auto l = first; + auto r = last; + auto f = std::next(first, 1); + bool is_swapped_l = false; + bool is_swapped_r = false; + + while(f <= r) { + if(compare(*f, *l)) { + is_swapped_l = true; + std::iter_swap(l, f); + l++; + f++; + } + else if(compare(*l, *f)) { + is_swapped_r = true; + std::iter_swap(r, f); + r--; + } + else { + f++; + } + } + + if(l - first > 1 && is_swapped_l) { + //rt.emplace([&](tf::Runtime& rtl) mutable { + // parallel_3wqsort(rtl, first, l-1, compare); + //}); + rt.silent_async([&rt, first, l, &compare] () mutable { + parallel_3wqsort(rt, first, l-1, compare); + }); + } + + if(last - r > 1 && is_swapped_r) { + //rt.emplace([&](tf::Runtime& rtr) mutable { + // parallel_3wqsort(rtr, r+1, last, compare); + //}); + //rt.silent_async([&rt, r, last, &compare] () mutable { + // parallel_3wqsort(rt, r+1, last, compare); + //}); + first = r+1; + goto sort_partition; + } + + //rt.join(); +} + +} // end of namespace tf::detail --------------------------------------------- + +namespace tf { + +// Function: make_sort_task +template <typename B, typename E, typename C> +TF_FORCE_INLINE auto make_sort_task(B b, E e, C cmp) { + + return [b, e, cmp] (Runtime& rt) mutable { + + using B_t = std::decay_t<unwrap_ref_decay_t<B>>; + using E_t = std::decay_t<unwrap_ref_decay_t<E>>; + + // fetch the iterator values + B_t beg = b; + E_t end = e; + + if(beg == end) { + return; + } + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= detail::parallel_sort_cutoff<B_t>()) { + std::sort(beg, end, cmp); + return; + } + + //parallel_3wqsort(rt, beg, end-1, cmp); + detail::parallel_pdqsort<B_t, C, + is_std_compare_v<std::decay_t<C>> && + std::is_arithmetic_v<typename std::iterator_traits<B_t>::value_type> + >(rt, beg, end, cmp, log2(end - beg)); + + rt.corun_all(); + }; +} + +template <typename B, typename E> +TF_FORCE_INLINE auto make_sort_task(B beg, E end) { + using value_type = std::decay_t<decltype(*std::declval<B>())>; + return make_sort_task(beg, end, std::less<value_type>{}); +} + +// ---------------------------------------------------------------------------- +// tf::Taskflow::sort +// ---------------------------------------------------------------------------- + +// Function: sort +template <typename B, typename 
E, typename C> +Task FlowBuilder::sort(B beg, E end, C cmp) { + return emplace(make_sort_task(beg, end, cmp)); +} + +// Function: sort +template <typename B, typename E> +Task FlowBuilder::sort(B beg, E end) { + return emplace(make_sort_task(beg, end)); +} + +} // namespace tf ------------------------------------------------------------ + diff --git a/myxpcs/include/taskflow_/algorithm/transform.hpp b/myxpcs/include/taskflow_/algorithm/transform.hpp new file mode 100644 index 0000000..37157b3 --- /dev/null +++ b/myxpcs/include/taskflow_/algorithm/transform.hpp @@ -0,0 +1,199 @@ +#pragma once + +#include "launch.hpp" + +namespace tf { + +// Function: make_transform_task +template < + typename B, typename E, typename O, typename C, typename P = GuidedPartitioner, + std::enable_if_t<is_partitioner_v<std::decay_t<P>>, void>* = nullptr +> +TF_FORCE_INLINE auto make_transform_task( + B first1, E last1, O d_first, C c, P&& part = P() +) { + + using namespace std::string_literals; + + using B_t = std::decay_t<unwrap_ref_decay_t<B>>; + using E_t = std::decay_t<unwrap_ref_decay_t<E>>; + using O_t = std::decay_t<unwrap_ref_decay_t<O>>; + + return + [first1, last1, d_first, c, part=std::forward<P>(part)] + (Runtime& rt) mutable { + + // fetch the stateful values + B_t beg = first1; + E_t end = last1; + O_t d_beg = d_first; + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + std::transform(beg, end, d_beg, c); + return; + } + + if(N < W) { + W = N; + } + + // static partitioner + if constexpr(std::is_same_v<std::decay_t<P>, StaticPartitioner>) { + size_t chunk_size; + for(size_t w=0, curr_b=0; w<W && curr_b < N; ++w, curr_b += chunk_size) { + chunk_size = part.adjusted_chunk_size(N, W, w); + launch_loop(W, w, rt, [=, &part] () mutable { + part.loop(N, W, curr_b, chunk_size, + [&, prev_e=size_t{0}](size_t part_b, size_t part_e) mutable { + std::advance(beg, part_b - prev_e); + std::advance(d_beg, part_b - prev_e); + for(size_t x = part_b; x<part_e; x++) { + *d_beg++ = c(*beg++); + } + prev_e = part_e; + } + ); + }); + } + rt.corun_all(); + } + // dynamic partitioner + else { + std::atomic<size_t> next(0); + + launch_loop(N, W, rt, next, part, [=, &next, &part] () mutable { + part.loop(N, W, next, + [&, prev_e=size_t{0}](size_t part_b, size_t part_e) mutable { + std::advance(beg, part_b - prev_e); + std::advance(d_beg, part_b - prev_e); + for(size_t x = part_b; x<part_e; x++) { + *d_beg++ = c(*beg++); + } + prev_e = part_e; + } + ); + }); + } + }; +} + +// Function: make_transform_task +template < + typename B1, typename E1, typename B2, typename O, typename C, typename P = GuidedPartitioner, + std::enable_if_t<!is_partitioner_v<std::decay_t<C>>, void>* = nullptr +> +TF_FORCE_INLINE auto make_transform_task( + B1 first1, E1 last1, B2 first2, O d_first, C c, P&& part = P() +) { + + using namespace std::string_literals; + + using B1_t = std::decay_t<unwrap_ref_decay_t<B1>>; + using E1_t = std::decay_t<unwrap_ref_decay_t<E1>>; + using B2_t = std::decay_t<unwrap_ref_decay_t<B2>>; + using O_t = std::decay_t<unwrap_ref_decay_t<O>>; + + return + [first1, last1, first2, d_first, c, part=std::forward<P>(part)] + (Runtime& rt) mutable { + + // fetch the stateful values + B1_t beg1 = first1; + E1_t end1 = last1; + B2_t beg2 = first2; + O_t d_beg = d_first; + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg1, end1); + + // only myself - no need to spawn another graph + 
if(W <= 1 || N <= part.chunk_size()) { + std::transform(beg1, end1, beg2, d_beg, c); + return; + } + + if(N < W) { + W = N; + } + + // static partitioner + if constexpr(std::is_same_v<std::decay_t<P>, StaticPartitioner>) { + size_t chunk_size; + for(size_t w=0, curr_b=0; w<W && curr_b < N; ++w, curr_b += chunk_size) { + chunk_size = part.adjusted_chunk_size(N, W, w); + launch_loop(W, w, rt, [=, &c, &part] () mutable { + part.loop(N, W, curr_b, chunk_size, + [&, prev_e=size_t{0}](size_t part_b, size_t part_e) mutable { + std::advance(beg1, part_b - prev_e); + std::advance(beg2, part_b - prev_e); + std::advance(d_beg, part_b - prev_e); + for(size_t x = part_b; x<part_e; x++) { + *d_beg++ = c(*beg1++, *beg2++); + } + prev_e = part_e; + } + ); + }); + } + rt.corun_all(); + } + // dynamic partitioner + else { + std::atomic<size_t> next(0); + launch_loop(N, W, rt, next, part, [=, &c, &next, &part] () mutable { + part.loop(N, W, next, + [&, prev_e=size_t{0}](size_t part_b, size_t part_e) mutable { + std::advance(beg1, part_b - prev_e); + std::advance(beg2, part_b - prev_e); + std::advance(d_beg, part_b - prev_e); + for(size_t x = part_b; x<part_e; x++) { + *d_beg++ = c(*beg1++, *beg2++); + } + prev_e = part_e; + } + ); + }); + } + }; +} + +// ---------------------------------------------------------------------------- +// transform +// ---------------------------------------------------------------------------- + +// Function: transform +template <typename B, typename E, typename O, typename C, typename P, + std::enable_if_t<is_partitioner_v<std::decay_t<P>>, void>* +> +Task FlowBuilder::transform(B first1, E last1, O d_first, C c, P&& part) { + return emplace( + make_transform_task(first1, last1, d_first, c, std::forward<P>(part)) + ); +} + +// ---------------------------------------------------------------------------- +// transform2 +// ---------------------------------------------------------------------------- + +// Function: transform +template < + typename B1, typename E1, typename B2, typename O, typename C, typename P, + std::enable_if_t<!is_partitioner_v<std::decay_t<C>>, void>* +> +Task FlowBuilder::transform( + B1 first1, E1 last1, B2 first2, O d_first, C c, P&& part +) { + return emplace(make_transform_task( + first1, last1, first2, d_first, c, std::forward<P>(part) + )); +} + + +} // end of namespace tf ----------------------------------------------------- + + + diff --git a/myxpcs/include/taskflow_/core/async.hpp b/myxpcs/include/taskflow_/core/async.hpp new file mode 100644 index 0000000..e55082c --- /dev/null +++ b/myxpcs/include/taskflow_/core/async.hpp @@ -0,0 +1,330 @@ +#pragma once + +#include "executor.hpp" + +// https://hackmd.io/@sysprog/concurrency-atomics + +namespace tf { + +// ---------------------------------------------------------------------------- +// Async +// ---------------------------------------------------------------------------- + +// Function: async +template <typename F> +auto Executor::async(const std::string& name, F&& f) { + + _increment_topology(); + + using R = std::invoke_result_t<std::decay_t<F>>; + + std::packaged_task<R()> p(std::forward<F>(f)); + auto fu{p.get_future()}; + + auto node = node_pool.animate( + name, 0, nullptr, nullptr, 0, std::in_place_type_t<Node::Async>{}, + [p=make_moc(std::move(p))]() mutable { p.object(); } + ); + + _schedule_async_task(node); + + return fu; +} + +// Function: async +template <typename F> +auto Executor::async(F&& f) { + return async("", std::forward<F>(f)); +} + +// 
---------------------------------------------------------------------------- +// Silent Async +// ---------------------------------------------------------------------------- + +// Function: silent_async +template <typename F> +void Executor::silent_async(const std::string& name, F&& f) { + + _increment_topology(); + + auto node = node_pool.animate( + name, 0, nullptr, nullptr, 0, std::in_place_type_t<Node::Async>{}, + std::forward<F>(f) + ); + + _schedule_async_task(node); +} + +// Function: silent_async +template <typename F> +void Executor::silent_async(F&& f) { + silent_async("", std::forward<F>(f)); +} + +// ---------------------------------------------------------------------------- +// Async Helper Methods +// ---------------------------------------------------------------------------- + +// Procedure: _schedule_async_task +inline void Executor::_schedule_async_task(Node* node) { + if(auto w = _this_worker(); w) { + _schedule(*w, node); + } + else{ + _schedule(node); + } +} + +// Procedure: _tear_down_async +inline void Executor::_tear_down_async(Node* node) { + // from runtime + if(node->_parent) { + node->_parent->_join_counter.fetch_sub(1, std::memory_order_release); + } + // from executor + else { + _decrement_topology(); + } + node_pool.recycle(node); +} + +// ---------------------------------------------------------------------------- +// Silent Dependent Async +// ---------------------------------------------------------------------------- + +// Function: silent_dependent_async +template <typename F, typename... Tasks, + std::enable_if_t<all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>* +> +tf::AsyncTask Executor::silent_dependent_async(F&& func, Tasks&&... tasks) { + return silent_dependent_async("", std::forward<F>(func), std::forward<Tasks>(tasks)...); +} + +// Function: silent_dependent_async +template <typename F, typename... Tasks, + std::enable_if_t<all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>* +> +tf::AsyncTask Executor::silent_dependent_async( + const std::string& name, F&& func, Tasks&&... 
tasks +){ + + _increment_topology(); + + size_t num_dependents = sizeof...(Tasks); + + // create a task before scheduling the node to retain a shared ownership first + AsyncTask task(node_pool.animate( + name, 0, nullptr, nullptr, num_dependents, + std::in_place_type_t<Node::DependentAsync>{}, std::forward<F>(func) + )); + + if constexpr(sizeof...(Tasks) > 0) { + (_process_async_dependent(task._node, tasks, num_dependents), ...); + } + + if(num_dependents == 0) { + _schedule_async_task(task._node); + } + + return task; +} + +// Function: silent_dependent_async +template <typename F, typename I, + std::enable_if_t<!std::is_same_v<std::decay_t<I>, AsyncTask>, void>* +> +tf::AsyncTask Executor::silent_dependent_async(F&& func, I first, I last) { + return silent_dependent_async("", std::forward<F>(func), first, last); +} + +// Function: silent_dependent_async +template <typename F, typename I, + std::enable_if_t<!std::is_same_v<std::decay_t<I>, AsyncTask>, void>* +> +tf::AsyncTask Executor::silent_dependent_async( + const std::string& name, F&& func, I first, I last +) { + + _increment_topology(); + + size_t num_dependents = std::distance(first, last); + + AsyncTask task(node_pool.animate( + name, 0, nullptr, nullptr, num_dependents, + std::in_place_type_t<Node::DependentAsync>{}, std::forward<F>(func) + )); + + for(; first != last; first++){ + _process_async_dependent(task._node, *first, num_dependents); + } + + if(num_dependents == 0) { + _schedule_async_task(task._node); + } + + return task; +} + +// ---------------------------------------------------------------------------- +// Dependent Async +// ---------------------------------------------------------------------------- + +// Function: dependent_async +template <typename F, typename... Tasks, + std::enable_if_t<all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>* +> +auto Executor::dependent_async(F&& func, Tasks&&... tasks) { + return dependent_async("", std::forward<F>(func), std::forward<Tasks>(tasks)...); +} + +// Function: dependent_async +template <typename F, typename... Tasks, + std::enable_if_t<all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>* +> +auto Executor::dependent_async( + const std::string& name, F&& func, Tasks&&... 
tasks +) { + + _increment_topology(); + + using R = std::invoke_result_t<std::decay_t<F>>; + + std::packaged_task<R()> p(std::forward<F>(func)); + auto fu{p.get_future()}; + + size_t num_dependents = sizeof...(tasks); + + AsyncTask task(node_pool.animate( + name, 0, nullptr, nullptr, num_dependents, + std::in_place_type_t<Node::DependentAsync>{}, + [p=make_moc(std::move(p))] () mutable { p.object(); } + )); + + if constexpr(sizeof...(Tasks) > 0) { + (_process_async_dependent(task._node, tasks, num_dependents), ...); + } + + if(num_dependents == 0) { + _schedule_async_task(task._node); + } + + return std::make_pair(std::move(task), std::move(fu)); +} + +// Function: dependent_async +template <typename F, typename I, + std::enable_if_t<!std::is_same_v<std::decay_t<I>, AsyncTask>, void>* +> +auto Executor::dependent_async(F&& func, I first, I last) { + return dependent_async("", std::forward<F>(func), first, last); +} + +// Function: dependent_async +template <typename F, typename I, + std::enable_if_t<!std::is_same_v<std::decay_t<I>, AsyncTask>, void>* +> +auto Executor::dependent_async( + const std::string& name, F&& func, I first, I last +) { + + _increment_topology(); + + using R = std::invoke_result_t<std::decay_t<F>>; + + std::packaged_task<R()> p(std::forward<F>(func)); + auto fu{p.get_future()}; + + size_t num_dependents = std::distance(first, last); + + AsyncTask task(node_pool.animate( + name, 0, nullptr, nullptr, num_dependents, + std::in_place_type_t<Node::DependentAsync>{}, + [p=make_moc(std::move(p))] () mutable { p.object(); } + )); + + for(; first != last; first++) { + _process_async_dependent(task._node, *first, num_dependents); + } + + if(num_dependents == 0) { + _schedule_async_task(task._node); + } + + return std::make_pair(std::move(task), std::move(fu)); +} + +// ---------------------------------------------------------------------------- +// Dependent Async Helper Functions +// ---------------------------------------------------------------------------- + +// Procedure: _process_async_dependent +inline void Executor::_process_async_dependent( + Node* node, tf::AsyncTask& task, size_t& num_dependents +) { + + auto& state = std::get_if<Node::DependentAsync>(&(task._node->_handle))->state; + + add_successor: + + auto target = Node::AsyncState::UNFINISHED; + + // acquires the lock + if(state.compare_exchange_weak(target, Node::AsyncState::LOCKED, + std::memory_order_acq_rel, + std::memory_order_acquire)) { + task._node->_successors.push_back(node); + state.store(Node::AsyncState::UNFINISHED, std::memory_order_release); + } + // dep's state is FINISHED, which means dep finished its callable already + // thus decrement the node's join counter by 1 + else if (target == Node::AsyncState::FINISHED) { + num_dependents = node->_join_counter.fetch_sub(1, std::memory_order_acq_rel) - 1; + } + // another worker adding its async task to the same successors of this node + else { + goto add_successor; + } +} + + +// Procedure: _tear_down_dependent_async +inline void Executor::_tear_down_dependent_async(Worker& worker, Node* node) { + + auto handle = std::get_if<Node::DependentAsync>(&(node->_handle)); + + // this async task comes from Executor + auto target = Node::AsyncState::UNFINISHED; + + while(!handle->state.compare_exchange_weak(target, Node::AsyncState::FINISHED, + std::memory_order_acq_rel, + std::memory_order_relaxed)) { + target = Node::AsyncState::UNFINISHED; + } + + // spaw successors whenever their dependencies are resolved + worker._cache = nullptr; + for(size_t i=0; 
i<node->_successors.size(); ++i) { + if(auto s = node->_successors[i]; + s->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1 + ) { + if(worker._cache) { + _schedule(worker, worker._cache); + } + worker._cache = s; + } + } + + // now the executor no longer needs to retain ownership + if(handle->use_count.fetch_sub(1, std::memory_order_acq_rel) == 1) { + node_pool.recycle(node); + } + + _decrement_topology(); +} + + + + + +} // end of namespace tf ----------------------------------------------------- + diff --git a/myxpcs/include/taskflow_/core/async_task.hpp b/myxpcs/include/taskflow_/core/async_task.hpp new file mode 100644 index 0000000..026e8cb --- /dev/null +++ b/myxpcs/include/taskflow_/core/async_task.hpp @@ -0,0 +1,209 @@ +#pragma once + +#include "graph.hpp" + +/** +@file async_task.hpp +@brief asynchronous task include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// AsyncTask +// ---------------------------------------------------------------------------- + +/** +@brief class to create a dependent asynchronous task + +A tf::AsyncTask is a lightweight handle that retains @em shared ownership +of a dependent async task created by an executor. +This shared ownership ensures that the async task remains alive when +adding it to the dependency list of another async task, +thus avoiding the classical [ABA problem](https://en.wikipedia.org/wiki/ABA_problem). + +@code{.cpp} +// main thread retains shared ownership of async task A +tf::AsyncTask A = executor.silent_dependent_async([](){}); + +// task A remains alive (i.e., at least one ref count by the main thread) +// when being added to the dependency list of async task B +tf::AsyncTask B = executor.silent_dependent_async([](){}, A); +@endcode + +Currently, tf::AsyncTask is implemented based on the logic of +C++ smart pointer std::shared_ptr and +is considered cheap to copy or move as long as only a handful of objects +own it. +When a worker completes an async task, it will remove the task from the executor, +decrementing the number of shared owners by one. +If that counter reaches zero, the task is destroyed. +*/ +class AsyncTask { + + friend class Executor; + + public: + + /** + @brief constructs an empty task handle + */ + AsyncTask() = default; + + /** + @brief destroys the managed asynchronous task if this is the last owner + */ + ~AsyncTask(); + + /** + @brief constructs an asynchronous task that shares ownership of @c rhs + */ + AsyncTask(const AsyncTask& rhs); + + /** + @brief move-constructs an asynchronous task from @c rhs + */ + AsyncTask(AsyncTask&& rhs); + + /** + @brief copy-assigns the asynchronous task from @c rhs + + Releases the managed object of @c this and retains a new shared ownership + of @c rhs. + */ + AsyncTask& operator = (const AsyncTask& rhs); + + /** + @brief move-assigns the asynchronous task from @c rhs + + Releases the managed object of @c this and takes over the ownership of @c rhs. 
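+
+  A minimal usage sketch (the @c executor object is hypothetical) of what
+  move-assignment implies for the two handles:
+
+  @code{.cpp}
+  tf::AsyncTask A = executor.silent_dependent_async([](){});
+  tf::AsyncTask B;
+  B = std::move(A);   // B now manages the task; A is left empty
+  // A.empty() == true, B.empty() == false
+  @endcode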
+ */ + AsyncTask& operator = (AsyncTask&& rhs); + + /** + @brief checks if the asynchronous task stores nothing + */ + bool empty() const; + + /** + @brief release the managed object of @c this + */ + void reset(); + + /** + @brief obtains a hash value of this asynchronous task + */ + size_t hash_value() const; + + /** + @brief returns the number of shared owners that are currently managing + this asynchronous task + */ + size_t use_count() const; + + /** + @brief returns the boolean indicating whether the async task is done + */ + bool is_done() const; + + private: + + explicit AsyncTask(Node*); + + Node* _node {nullptr}; + + void _incref(); + void _decref(); +}; + +// Constructor +inline AsyncTask::AsyncTask(Node* ptr) : _node{ptr} { + _incref(); +} + +// Function: _incref +inline void AsyncTask::_incref() { + if(_node) { + std::get_if<Node::DependentAsync>(&(_node->_handle))->use_count.fetch_add( + 1, std::memory_order_relaxed + ); + } +} + +// Function: _decref +inline void AsyncTask::_decref() { + if(_node && std::get_if<Node::DependentAsync>(&(_node->_handle))->use_count.fetch_sub( + 1, std::memory_order_acq_rel + ) == 1) { + node_pool.recycle(_node); + } +} + +// Copy Constructor +inline AsyncTask::AsyncTask(const AsyncTask& rhs) : + _node{rhs._node} { + _incref(); +} + +// Move Constructor +inline AsyncTask::AsyncTask(AsyncTask&& rhs) : + _node {rhs._node} { + rhs._node = nullptr; +} + +// Destructor +inline AsyncTask::~AsyncTask() { + _decref(); +} + +// Copy assignment +inline AsyncTask& AsyncTask::operator = (const AsyncTask& rhs) { + _decref(); + _node = rhs._node; + _incref(); + return *this; +} + +// Move assignment +inline AsyncTask& AsyncTask::operator = (AsyncTask&& rhs) { + _decref(); + _node = rhs._node; + rhs._node = nullptr; + return *this; +} + +// Function: empty +inline bool AsyncTask::empty() const { + return _node == nullptr; +} + +// Function: reset +inline void AsyncTask::reset() { + _decref(); + _node = nullptr; +} + +// Function: hash_value +inline size_t AsyncTask::hash_value() const { + return std::hash<Node*>{}(_node); +} + +// Function: use_count +inline size_t AsyncTask::use_count() const { + return _node == nullptr ? 
size_t{0} : + std::get_if<Node::DependentAsync>(&(_node->_handle))->use_count.load( + std::memory_order_relaxed + ); +} + +// Function: is_done +inline bool AsyncTask::is_done() const { + return std::get_if<Node::DependentAsync>(&(_node->_handle))->state.load( + std::memory_order_acquire + ) == Node::AsyncState::FINISHED; +} + +} // end of namespace tf ---------------------------------------------------- + + + diff --git a/myxpcs/include/taskflow_/core/declarations.hpp b/myxpcs/include/taskflow_/core/declarations.hpp new file mode 100644 index 0000000..dd89ab3 --- /dev/null +++ b/myxpcs/include/taskflow_/core/declarations.hpp @@ -0,0 +1,60 @@ +#pragma once + +namespace tf { + +// ---------------------------------------------------------------------------- +// taskflow +// ---------------------------------------------------------------------------- +class AsyncTopology; +class Node; +class Graph; +class FlowBuilder; +class Semaphore; +class Subflow; +class Runtime; +class Task; +class TaskView; +class Taskflow; +class Topology; +class TopologyBase; +class Executor; +class Worker; +class WorkerView; +class ObserverInterface; +class ChromeTracingObserver; +class TFProfObserver; +class TFProfManager; + +template <typename T> +class Future; + +template <typename...Fs> +class Pipeline; + +// ---------------------------------------------------------------------------- +// cudaFlow +// ---------------------------------------------------------------------------- +class cudaFlowNode; +class cudaFlowGraph; +class cudaTask; +class cudaFlow; +class cudaFlowCapturer; +class cudaFlowOptimizerBase; +class cudaFlowLinearOptimizer; +class cudaFlowSequentialOptimizer; +class cudaFlowRoundRobinOptimizer; + +// ---------------------------------------------------------------------------- +// syclFlow +// ---------------------------------------------------------------------------- +class syclNode; +class syclGraph; +class syclTask; +class syclFlow; + + +} // end of namespace tf ----------------------------------------------------- + + + + diff --git a/myxpcs/include/taskflow_/core/environment.hpp b/myxpcs/include/taskflow_/core/environment.hpp new file mode 100644 index 0000000..f9013b6 --- /dev/null +++ b/myxpcs/include/taskflow_/core/environment.hpp @@ -0,0 +1,8 @@ +#pragma once + +#define TF_ENABLE_PROFILER "TF_ENABLE_PROFILER" + +namespace tf { + +} // end of namespace tf ----------------------------------------------------- + diff --git a/myxpcs/include/taskflow_/core/error.hpp b/myxpcs/include/taskflow_/core/error.hpp new file mode 100644 index 0000000..6a68bea --- /dev/null +++ b/myxpcs/include/taskflow_/core/error.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include <iostream> +#include <sstream> +#include <exception> + +#include "../utility/stream.hpp" + +namespace tf { + +// Procedure: throw_se +// Throws the system error under a given error code. +template <typename... ArgsT> +//void throw_se(const char* fname, const size_t line, Error::Code c, ArgsT&&... args) { +void throw_re(const char* fname, const size_t line, ArgsT&&... args) { + std::ostringstream oss; + oss << "[" << fname << ":" << line << "] "; + //ostreamize(oss, std::forward<ArgsT>(args)...); + (oss << ... << args); + throw std::runtime_error(oss.str()); +} + +} // ------------------------------------------------------------------------ + +#define TF_THROW(...) 
tf::throw_re(__FILE__, __LINE__, __VA_ARGS__); + diff --git a/myxpcs/include/taskflow_/core/executor-module-opt.hpp b/myxpcs/include/taskflow_/core/executor-module-opt.hpp new file mode 100644 index 0000000..0e2b1ee --- /dev/null +++ b/myxpcs/include/taskflow_/core/executor-module-opt.hpp @@ -0,0 +1,2025 @@ +#pragma once + +#include "observer.hpp" +#include "taskflow.hpp" + +/** +@file executor.hpp +@brief executor include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// Executor Definition +// ---------------------------------------------------------------------------- + +/** @class Executor + +@brief class to create an executor for running a taskflow graph + +An executor manages a set of worker threads to run one or multiple taskflows +using an efficient work-stealing scheduling algorithm. + +@code{.cpp} +// Declare an executor and a taskflow +tf::Executor executor; +tf::Taskflow taskflow; + +// Add three tasks into the taskflow +tf::Task A = taskflow.emplace([] () { std::cout << "This is TaskA\n"; }); +tf::Task B = taskflow.emplace([] () { std::cout << "This is TaskB\n"; }); +tf::Task C = taskflow.emplace([] () { std::cout << "This is TaskC\n"; }); + +// Build precedence between tasks +A.precede(B, C); + +tf::Future<void> fu = executor.run(taskflow); +fu.wait(); // block until the execution completes + +executor.run(taskflow, [](){ std::cout << "end of 1 run"; }).wait(); +executor.run_n(taskflow, 4); +executor.wait_for_all(); // block until all associated executions finish +executor.run_n(taskflow, 4, [](){ std::cout << "end of 4 runs"; }).wait(); +executor.run_until(taskflow, [cnt=0] () mutable { return ++cnt == 10; }); +@endcode + +All the @c run methods are @em thread-safe. You can submit multiple +taskflows at the same time to an executor from different threads. +*/ +class Executor { + + friend class FlowBuilder; + friend class Subflow; + friend class Runtime; + + public: + + /** + @brief constructs the executor with @c N worker threads + + The constructor spawns @c N worker threads to run tasks in a + work-stealing loop. The number of workers must be greater than zero + or an exception will be thrown. + By default, the number of worker threads is equal to the maximum + hardware concurrency returned by std::thread::hardware_concurrency. + */ + explicit Executor(size_t N = std::thread::hardware_concurrency()); + + /** + @brief destructs the executor + + The destructor calls Executor::wait_for_all to wait for all submitted + taskflows to complete and then notifies all worker threads to stop + and join these threads. + */ + ~Executor(); + + /** + @brief runs a taskflow once + + @param taskflow a tf::Taskflow object + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow once and returns a tf::Future + object that eventually holds the result of the execution. + + @code{.cpp} + tf::Future<void> future = executor.run(taskflow); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. 
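+
+  For illustration only (assuming @c executor outlives the block below), the
+  following is unsafe because the taskflow may be destroyed while the run is
+  still in flight:
+
+  @code{.cpp}
+  {
+    tf::Taskflow taskflow;
+    taskflow.emplace([](){ std::cout << "task\n"; });
+    executor.run(taskflow);
+  } // taskflow destroyed here, possibly before the run finishes
+  @endcode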
+ */ + tf::Future<void> run(Taskflow& taskflow); + + /** + @brief runs a moved taskflow once + + @param taskflow a moved tf::Taskflow object + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow once and returns a tf::Future + object that eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future<void> future = executor.run(std::move(taskflow)); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + tf::Future<void> run(Taskflow&& taskflow); + + /** + @brief runs a taskflow once and invoke a callback upon completion + + @param taskflow a tf::Taskflow object + @param callable a callable object to be invoked after this run + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow once and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future<void> future = executor.run(taskflow, [](){ std::cout << "done"; }); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + template<typename C> + tf::Future<void> run(Taskflow& taskflow, C&& callable); + + /** + @brief runs a moved taskflow once and invoke a callback upon completion + + @param taskflow a moved tf::Taskflow object + @param callable a callable object to be invoked after this run + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow once and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future<void> future = executor.run( + std::move(taskflow), [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + template<typename C> + tf::Future<void> run(Taskflow&& taskflow, C&& callable); + + /** + @brief runs a taskflow for @c N times + + @param taskflow a tf::Taskflow object + @param N number of runs + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow @c N times and returns a tf::Future + object that eventually holds the result of the execution. + + @code{.cpp} + tf::Future<void> future = executor.run_n(taskflow, 2); // run taskflow 2 times + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + tf::Future<void> run_n(Taskflow& taskflow, size_t N); + + /** + @brief runs a moved taskflow for @c N times + + @param taskflow a moved tf::Taskflow object + @param N number of runs + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow @c N times and returns a tf::Future + object that eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. 
+ + @code{.cpp} + tf::Future<void> future = executor.run_n( + std::move(taskflow), 2 // run the moved taskflow 2 times + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + tf::Future<void> run_n(Taskflow&& taskflow, size_t N); + + /** + @brief runs a taskflow for @c N times and then invokes a callback + + @param taskflow a tf::Taskflow + @param N number of runs + @param callable a callable object to be invoked after this run + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow @c N times and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future<void> future = executor.run( + taskflow, 2, [](){ std::cout << "done"; } // runs taskflow 2 times and invoke + // the lambda to print "done" + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + template<typename C> + tf::Future<void> run_n(Taskflow& taskflow, size_t N, C&& callable); + + /** + @brief runs a moved taskflow for @c N times and then invokes a callback + + @param taskflow a moved tf::Taskflow + @param N number of runs + @param callable a callable object to be invoked after this run + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow @c N times and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future<void> future = executor.run( + // run the moved taskflow 2 times and invoke the lambda to print "done" + std::move(taskflow), 2, [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + template<typename C> + tf::Future<void> run_n(Taskflow&& taskflow, size_t N, C&& callable); + + /** + @brief runs a taskflow multiple times until the predicate becomes true + + @param taskflow a tf::Taskflow + @param pred a boolean predicate to return @c true for stop + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow multiple times until + the predicate returns @c true. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future<void> future = executor.run( + taskflow, [](){ return rand()%10 == 0 } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + template<typename P> + tf::Future<void> run_until(Taskflow& taskflow, P&& pred); + + /** + @brief runs a moved taskflow and keeps running it + until the predicate becomes true + + @param taskflow a moved tf::Taskflow object + @param pred a boolean predicate to return @c true for stop + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow multiple times until + the predicate returns @c true. + This member function returns a tf::Future object that + eventually holds the result of the execution. 
+ The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future<void> future = executor.run( + std::move(taskflow), [](){ return rand()%10 == 0 } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + template<typename P> + tf::Future<void> run_until(Taskflow&& taskflow, P&& pred); + + /** + @brief runs a taskflow multiple times until the predicate becomes true and + then invokes the callback + + @param taskflow a tf::Taskflow + @param pred a boolean predicate to return @c true for stop + @param callable a callable object to be invoked after this run completes + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow multiple times until + the predicate returns @c true and then invokes the given callable when + the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future<void> future = executor.run( + taskflow, [](){ return rand()%10 == 0 }, [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + template<typename P, typename C> + tf::Future<void> run_until(Taskflow& taskflow, P&& pred, C&& callable); + + /** + @brief runs a moved taskflow and keeps running + it until the predicate becomes true and then invokes the callback + + @param taskflow a moved tf::Taskflow + @param pred a boolean predicate to return @c true for stop + @param callable a callable object to be invoked after this run completes + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow multiple times until + the predicate returns @c true and then invokes the given callable when + the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future<void> future = executor.run( + std::move(taskflow), + [](){ return rand()%10 == 0 }, [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + template<typename P, typename C> + tf::Future<void> run_until(Taskflow&& taskflow, P&& pred, C&& callable); + + /** + @brief wait for all tasks to complete + + This member function waits until all submitted tasks + (e.g., taskflows, asynchronous tasks) to finish. + + @code{.cpp} + executor.run(taskflow1); + executor.run_n(taskflow2, 10); + executor.run_n(taskflow3, 100); + executor.wait_for_all(); // wait until the above submitted taskflows finish + @endcode + */ + void wait_for_all(); + + /** + @brief queries the number of worker threads + + Each worker represents one unique thread spawned by an executor + upon its construction time. + + @code{.cpp} + tf::Executor executor(4); + std::cout << executor.num_workers(); // 4 + @endcode + */ + size_t num_workers() const noexcept; + + /** + @brief queries the number of running topologies at the time of this call + + When a taskflow is submitted to an executor, a topology is created to store + runtime metadata of the running taskflow. + When the execution of the submitted taskflow finishes, + its corresponding topology will be removed from the executor. 
+ + @code{.cpp} + executor.run(taskflow); + std::cout << executor.num_topologies(); // 0 or 1 (taskflow still running) + @endcode + */ + size_t num_topologies() const; + + /** + @brief queries the number of running taskflows with moved ownership + + @code{.cpp} + executor.run(std::move(taskflow)); + std::cout << executor.num_taskflows(); // 0 or 1 (taskflow still running) + @endcode + */ + size_t num_taskflows() const; + + /** + @brief queries the id of the caller thread in this executor + + Each worker has an unique id in the range of @c 0 to @c N-1 associated with + its parent executor. + If the caller thread does not belong to the executor, @c -1 is returned. + + @code{.cpp} + tf::Executor executor(4); // 4 workers in the executor + executor.this_worker_id(); // -1 (main thread is not a worker) + + taskflow.emplace([&](){ + std::cout << executor.this_worker_id(); // 0, 1, 2, or 3 + }); + executor.run(taskflow); + @endcode + */ + int this_worker_id() const; + + /** + @brief runs a given function asynchronously + + @tparam F callable type + @tparam ArgsT parameter types + + @param f callable object to call + @param args parameters to pass to the callable + + @return a tf::Future that will holds the result of the execution + + The method creates an asynchronous task to launch the given + function on the given arguments. + Unlike std::async, the return here is a @em tf::Future that holds + an optional object to the result. + If the asynchronous task is cancelled before it runs, the return is + a @c std::nullopt, or the value returned by the callable. + + @code{.cpp} + tf::Future<std::optional<int>> future = executor.async([](){ + std::cout << "create an asynchronous task and returns 1\n"; + return 1; + }); + @endcode + + This member function is thread-safe. + */ + template <typename F, typename... ArgsT> + auto async(F&& f, ArgsT&&... args); + + /** + @brief runs a given function asynchronously and gives a name to this task + + @tparam F callable type + @tparam ArgsT parameter types + + @param name name of the asynchronous task + @param f callable object to call + @param args parameters to pass to the callable + + @return a tf::Future that will holds the result of the execution + + The method creates a named asynchronous task to launch the given + function on the given arguments. + Naming an asynchronous task is primarily used for profiling and visualizing + the task execution timeline. + Unlike std::async, the return here is a tf::Future that holds + an optional object to the result. + If the asynchronous task is cancelled before it runs, the return is + a @c std::nullopt, or the value returned by the callable. + + @code{.cpp} + tf::Future<std::optional<int>> future = executor.named_async("name", [](){ + std::cout << "create an asynchronous task with a name and returns 1\n"; + return 1; + }); + @endcode + + This member function is thread-safe. + */ + template <typename F, typename... ArgsT> + auto named_async(const std::string& name, F&& f, ArgsT&&... args); + + /** + @brief similar to tf::Executor::async but does not return a future object + + This member function is more efficient than tf::Executor::async + and is encouraged to use when there is no data returned. + + @code{.cpp} + executor.silent_async([](){ + std::cout << "create an asynchronous task with no return\n"; + }); + @endcode + + This member function is thread-safe. + */ + template <typename F, typename... ArgsT> + void silent_async(F&& f, ArgsT&&... 
args); + + /** + @brief similar to tf::Executor::named_async but does not return a future object + + This member function is more efficient than tf::Executor::named_async + and is encouraged to use when there is no data returned. + + @code{.cpp} + executor.named_silent_async("name", [](){ + std::cout << "create an asynchronous task with a name and no return\n"; + }); + @endcode + + This member function is thread-safe. + */ + template <typename F, typename... ArgsT> + void named_silent_async(const std::string& name, F&& f, ArgsT&&... args); + + /** + @brief constructs an observer to inspect the activities of worker threads + + @tparam Observer observer type derived from tf::ObserverInterface + @tparam ArgsT argument parameter pack + + @param args arguments to forward to the constructor of the observer + + @return a shared pointer to the created observer + + Each executor manages a list of observers with shared ownership with callers. + For each of these observers, the two member functions, + tf::ObserverInterface::on_entry and tf::ObserverInterface::on_exit + will be called before and after the execution of a task. + + This member function is not thread-safe. + */ + template <typename Observer, typename... ArgsT> + std::shared_ptr<Observer> make_observer(ArgsT&&... args); + + /** + @brief removes an observer from the executor + + This member function is not thread-safe. + */ + template <typename Observer> + void remove_observer(std::shared_ptr<Observer> observer); + + /** + @brief queries the number of observers + */ + size_t num_observers() const noexcept; + + private: + + std::condition_variable _topology_cv; + std::mutex _taskflow_mutex; + std::mutex _topology_mutex; + std::mutex _wsq_mutex; + + size_t _num_topologies {0}; + + std::unordered_map<std::thread::id, size_t> _wids; + std::vector<Worker> _workers; + std::vector<std::thread> _threads; + std::list<Taskflow> _taskflows; + + Notifier _notifier; + + TaskQueue<Node*> _wsq; + + std::atomic<size_t> _num_actives {0}; + std::atomic<size_t> _num_thieves {0}; + std::atomic<bool> _done {0}; + + std::unordered_set<std::shared_ptr<ObserverInterface>> _observers; + + Worker* _this_worker(); + + bool _wait_for_task(Worker&, Node*&); + + void _observer_prologue(Worker&, Node*); + void _observer_epilogue(Worker&, Node*); + void _spawn(size_t); + void _worker_loop(Worker&); + void _exploit_task(Worker&, Node*&); + void _explore_task(Worker&, Node*&); + void _consume_task(Worker&, Node*); + void _schedule(Worker&, Node*); + void _schedule(Node*); + void _schedule(Worker&, const SmallVector<Node*>&); + void _schedule(const SmallVector<Node*>&); + void _set_up_topology(Worker*, Topology*); + void _tear_down_topology(Worker&, Topology*); + void _tear_down_async(Node*); + void _tear_down_invoke(Worker&, Node*); + void _cancel_invoke(Worker&, Node*); + void _increment_topology(); + void _decrement_topology(); + void _decrement_topology_and_notify(); + void _invoke(Worker&, Node*); + void _invoke_static_task(Worker&, Node*); + void _invoke_dynamic_task(Worker&, Node*); + void _invoke_dynamic_task_external(Worker&, Node*, Graph&, bool); + void _invoke_dynamic_task_internal(Worker&, Node*, Graph&); + void _invoke_condition_task(Worker&, Node*, SmallVector<int>&); + void _invoke_multi_condition_task(Worker&, Node*, SmallVector<int>&); + void _invoke_module_task(Worker&, Node*, bool&); + void _invoke_module_task_internal(Worker&, Node*, Graph&, bool&); + void _invoke_async_task(Worker&, Node*); + void _invoke_silent_async_task(Worker&, Node*); + void 
_invoke_cudaflow_task(Worker&, Node*); + void _invoke_syclflow_task(Worker&, Node*); + void _invoke_runtime_task(Worker&, Node*); + + template <typename C, + std::enable_if_t<is_cudaflow_task_v<C>, void>* = nullptr + > + void _invoke_cudaflow_task_entry(Node*, C&&); + + template <typename C, typename Q, + std::enable_if_t<is_syclflow_task_v<C>, void>* = nullptr + > + void _invoke_syclflow_task_entry(Node*, C&&, Q&); +}; + +// Constructor +inline Executor::Executor(size_t N) : + _workers {N}, + _notifier {N} { + + if(N == 0) { + TF_THROW("no cpu workers to execute taskflows"); + } + + _spawn(N); + + // instantite the default observer if requested + if(has_env(TF_ENABLE_PROFILER)) { + TFProfManager::get()._manage(make_observer<TFProfObserver>()); + } +} + +// Destructor +inline Executor::~Executor() { + + // wait for all topologies to complete + wait_for_all(); + + // shut down the scheduler + _done = true; + + _notifier.notify(true); + + for(auto& t : _threads){ + t.join(); + } +} + +// Function: num_workers +inline size_t Executor::num_workers() const noexcept { + return _workers.size(); +} + +// Function: num_topologies +inline size_t Executor::num_topologies() const { + return _num_topologies; +} + +// Function: num_taskflows +inline size_t Executor::num_taskflows() const { + return _taskflows.size(); +} + +// Function: _this_worker +inline Worker* Executor::_this_worker() { + auto itr = _wids.find(std::this_thread::get_id()); + return itr == _wids.end() ? nullptr : &_workers[itr->second]; +} + +// Function: named_async +template <typename F, typename... ArgsT> +auto Executor::named_async(const std::string& name, F&& f, ArgsT&&... args) { + + _increment_topology(); + + using T = std::invoke_result_t<F, ArgsT...>; + using R = std::conditional_t<std::is_same_v<T, void>, void, std::optional<T>>; + + std::promise<R> p; + + auto tpg = std::make_shared<AsyncTopology>(); + + Future<R> fu(p.get_future(), tpg); + + auto node = node_pool.animate( + std::in_place_type_t<Node::Async>{}, + [p=make_moc(std::move(p)), f=std::forward<F>(f), args...] + (bool cancel) mutable { + if constexpr(std::is_same_v<R, void>) { + if(!cancel) { + f(args...); + } + p.object.set_value(); + } + else { + p.object.set_value(cancel ? std::nullopt : std::make_optional(f(args...))); + } + }, + std::move(tpg) + ); + + node->_name = name; + + if(auto w = _this_worker(); w) { + _schedule(*w, node); + } + else{ + _schedule(node); + } + + return fu; +} + +// Function: async +template <typename F, typename... ArgsT> +auto Executor::async(F&& f, ArgsT&&... args) { + return named_async("", std::forward<F>(f), std::forward<ArgsT>(args)...); +} + +// Function: named_silent_async +template <typename F, typename... ArgsT> +void Executor::named_silent_async( + const std::string& name, F&& f, ArgsT&&... args +) { + + _increment_topology(); + + Node* node = node_pool.animate( + std::in_place_type_t<Node::SilentAsync>{}, + [f=std::forward<F>(f), args...] () mutable { + f(args...); + } + ); + + node->_name = name; + + if(auto w = _this_worker(); w) { + _schedule(*w, node); + } + else { + _schedule(node); + } +} + +// Function: silent_async +template <typename F, typename... ArgsT> +void Executor::silent_async(F&& f, ArgsT&&... args) { + named_silent_async("", std::forward<F>(f), std::forward<ArgsT>(args)...); +} + +// Function: this_worker_id +inline int Executor::this_worker_id() const { + auto i = _wids.find(std::this_thread::get_id()); + return i == _wids.end() ? 
-1 : static_cast<int>(_workers[i->second]._id); +} + +// Procedure: _spawn +inline void Executor::_spawn(size_t N) { + + std::mutex mutex; + std::condition_variable cond; + size_t n=0; + + for(size_t id=0; id<N; ++id) { + + _workers[id]._id = id; + _workers[id]._vtm = id; + _workers[id]._executor = this; + _workers[id]._waiter = &_notifier._waiters[id]; + + _threads.emplace_back([this] ( + Worker& w, std::mutex& mutex, std::condition_variable& cond, size_t& n + ) -> void { + + // enables the mapping + { + std::scoped_lock lock(mutex); + _wids[std::this_thread::get_id()] = w._id; + if(n++; n == num_workers()) { + cond.notify_one(); + } + } + + //this_worker().worker = &w; + + Node* t = nullptr; + + // must use 1 as condition instead of !done + while(1) { + + // execute the tasks. + _exploit_task(w, t); + + // wait for tasks + if(_wait_for_task(w, t) == false) { + break; + } + } + + }, std::ref(_workers[id]), std::ref(mutex), std::ref(cond), std::ref(n)); + } + + std::unique_lock<std::mutex> lock(mutex); + cond.wait(lock, [&](){ return n==N; }); +} + +// Function: _consume_task +inline void Executor::_consume_task(Worker& w, Node* p) { + + std::uniform_int_distribution<size_t> rdvtm(0, _workers.size()-1); + + while(p->_join_counter != 0) { + exploit: + if(auto t = w._wsq.pop(); t) { + _invoke(w, t); + } + else { + size_t num_steals = 0; + //size_t num_pauses = 0; + size_t max_steals = ((_workers.size() + 1) << 1); + + explore: + + t = (w._id == w._vtm) ? _wsq.steal() : _workers[w._vtm]._wsq.steal(); + if(t) { + _invoke(w, t); + goto exploit; + } + else if(p->_join_counter != 0){ + + if(num_steals++ > max_steals) { + std::this_thread::yield(); + } + + //std::this_thread::yield(); + w._vtm = rdvtm(w._rdgen); + goto explore; + } + else { + break; + } + } + } +} + +// Function: _explore_task +inline void Executor::_explore_task(Worker& w, Node*& t) { + + //assert(_workers[w].wsq.empty()); + //assert(!t); + + size_t num_steals = 0; + size_t num_yields = 0; + size_t max_steals = ((_workers.size() + 1) << 1); + + std::uniform_int_distribution<size_t> rdvtm(0, _workers.size()-1); + + do { + t = (w._id == w._vtm) ? _wsq.steal() : _workers[w._vtm]._wsq.steal(); + + if(t) { + break; + } + + if(num_steals++ > max_steals) { + std::this_thread::yield(); + if(num_yields++ > 100) { + break; + } + } + + w._vtm = rdvtm(w._rdgen); + } while(!_done); + +} + +// Procedure: _exploit_task +inline void Executor::_exploit_task(Worker& w, Node*& t) { + + if(t) { + + if(_num_actives.fetch_add(1) == 0 && _num_thieves == 0) { + _notifier.notify(false); + } + + while(t) { + _invoke(w, t); + t = w._wsq.pop(); + } + + --_num_actives; + } +} + +// Function: _wait_for_task +inline bool Executor::_wait_for_task(Worker& worker, Node*& t) { + + wait_for_task: + + //assert(!t); + + ++_num_thieves; + + explore_task: + + _explore_task(worker, t); + + if(t) { + if(_num_thieves.fetch_sub(1) == 1) { + _notifier.notify(false); + } + return true; + } + + _notifier.prepare_wait(worker._waiter); + + //if(auto vtm = _find_vtm(me); vtm != _workers.size()) { + if(!_wsq.empty()) { + + _notifier.cancel_wait(worker._waiter); + //t = (vtm == me) ? 
_wsq.steal() : _workers[vtm].wsq.steal(); + + t = _wsq.steal(); // must steal here + if(t) { + if(_num_thieves.fetch_sub(1) == 1) { + _notifier.notify(false); + } + return true; + } + else { + worker._vtm = worker._id; + goto explore_task; + } + } + + if(_done) { + _notifier.cancel_wait(worker._waiter); + _notifier.notify(true); + --_num_thieves; + return false; + } + + if(_num_thieves.fetch_sub(1) == 1) { + if(_num_actives) { + _notifier.cancel_wait(worker._waiter); + goto wait_for_task; + } + // check all queues again + for(auto& w : _workers) { + if(!w._wsq.empty()) { + worker._vtm = w._id; + _notifier.cancel_wait(worker._waiter); + goto wait_for_task; + } + } + } + + // Now I really need to relinguish my self to others + _notifier.commit_wait(worker._waiter); + + return true; +} + +// Function: make_observer +template<typename Observer, typename... ArgsT> +std::shared_ptr<Observer> Executor::make_observer(ArgsT&&... args) { + + static_assert( + std::is_base_of_v<ObserverInterface, Observer>, + "Observer must be derived from ObserverInterface" + ); + + // use a local variable to mimic the constructor + auto ptr = std::make_shared<Observer>(std::forward<ArgsT>(args)...); + + ptr->set_up(_workers.size()); + + _observers.emplace(std::static_pointer_cast<ObserverInterface>(ptr)); + + return ptr; +} + +// Procedure: remove_observer +template <typename Observer> +void Executor::remove_observer(std::shared_ptr<Observer> ptr) { + + static_assert( + std::is_base_of_v<ObserverInterface, Observer>, + "Observer must be derived from ObserverInterface" + ); + + _observers.erase(std::static_pointer_cast<ObserverInterface>(ptr)); +} + +// Function: num_observers +inline size_t Executor::num_observers() const noexcept { + return _observers.size(); +} + +// Procedure: _schedule +inline void Executor::_schedule(Worker& worker, Node* node) { + + node->_state.fetch_or(Node::READY, std::memory_order_release); + + // caller is a worker to this pool + if(worker._executor == this) { + worker._wsq.push(node); + return; + } + + { + std::lock_guard<std::mutex> lock(_wsq_mutex); + _wsq.push(node); + } + + _notifier.notify(false); +} + +// Procedure: _schedule +inline void Executor::_schedule(Node* node) { + + node->_state.fetch_or(Node::READY, std::memory_order_release); + + { + std::lock_guard<std::mutex> lock(_wsq_mutex); + _wsq.push(node); + } + + _notifier.notify(false); +} + +// Procedure: _schedule +inline void Executor::_schedule( + Worker& worker, const SmallVector<Node*>& nodes +) { + + // We need to cacth the node count to avoid accessing the nodes + // vector while the parent topology is removed! + const auto num_nodes = nodes.size(); + + if(num_nodes == 0) { + return; + } + + // make the node ready + for(size_t i=0; i<num_nodes; ++i) { + nodes[i]->_state.fetch_or(Node::READY, std::memory_order_release); + } + + if(worker._executor == this) { + for(size_t i=0; i<num_nodes; ++i) { + worker._wsq.push(nodes[i]); + } + return; + } + + { + std::lock_guard<std::mutex> lock(_wsq_mutex); + for(size_t k=0; k<num_nodes; ++k) { + _wsq.push(nodes[k]); + } + } + + _notifier.notify_n(num_nodes); +} + +// Procedure: _schedule +inline void Executor::_schedule(const SmallVector<Node*>& nodes) { + + // parent topology may be removed! 
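+  // Cache the node count before pushing anything: once a node reaches the shared
+  // queue, another worker may finish it and tear down the parent topology, after
+  // which the nodes vector must not be accessed again.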
+ const auto num_nodes = nodes.size(); + + if(num_nodes == 0) { + return; + } + + // make the node ready + for(size_t i=0; i<num_nodes; ++i) { + nodes[i]->_state.fetch_or(Node::READY, std::memory_order_release); + } + + { + std::lock_guard<std::mutex> lock(_wsq_mutex); + for(size_t k=0; k<num_nodes; ++k) { + _wsq.push(nodes[k]); + } + } + + _notifier.notify_n(num_nodes); +} + +// Procedure: _invoke +inline void Executor::_invoke(Worker& worker, Node* node) { + + int state; + SmallVector<int> conds; + + // synchronize all outstanding memory operations caused by reordering + do { + state = node->_state.load(std::memory_order_acquire); + } while(! (state & Node::READY)); + + // unwind stack for deferred node + if(state & Node::DEFERRED) { + node->_state.fetch_and(~Node::DEFERRED, std::memory_order_relaxed); + goto invoke_epilogue; + } + + //while(!(node->_state.load(std::memory_order_acquire) & Node::READY)); + + invoke_prologue: + + // no need to do other things if the topology is cancelled + if(node->_is_cancelled()) { + _cancel_invoke(worker, node); + return; + } + + // if acquiring semaphore(s) exists, acquire them first + if(node->_semaphores && !node->_semaphores->to_acquire.empty()) { + SmallVector<Node*> nodes; + if(!node->_acquire_all(nodes)) { + _schedule(worker, nodes); + return; + } + node->_state.fetch_or(Node::ACQUIRED, std::memory_order_release); + } + + // condition task + //int cond = -1; + //SmallVector<int> conds = { -1 }; + + // switch is faster than nested if-else due to jump table + switch(node->_handle.index()) { + // static task + case Node::STATIC:{ + _invoke_static_task(worker, node); + } + break; + + // dynamic task + case Node::DYNAMIC: { + _invoke_dynamic_task(worker, node); + } + break; + + // condition task + case Node::CONDITION: { + _invoke_condition_task(worker, node, conds); + } + break; + + // multi-condition task + case Node::MULTI_CONDITION: { + _invoke_multi_condition_task(worker, node, conds); + } + break; + + // module task + case Node::MODULE: { + bool deferred = false; + _invoke_module_task(worker, node, deferred); + if(deferred) { + return; + } + } + break; + + // async task + case Node::ASYNC: { + _invoke_async_task(worker, node); + _tear_down_async(node); + return ; + } + break; + + // silent async task + case Node::SILENT_ASYNC: { + _invoke_silent_async_task(worker, node); + _tear_down_async(node); + return ; + } + break; + + // cudaflow task + case Node::CUDAFLOW: { + _invoke_cudaflow_task(worker, node); + } + break; + + // syclflow task + case Node::SYCLFLOW: { + _invoke_syclflow_task(worker, node); + } + break; + + // runtime task + case Node::RUNTIME: { + _invoke_runtime_task(worker, node); + } + break; + + // monostate (placeholder) + default: + break; + } + + invoke_epilogue: + + // if releasing semaphores exist, release them + if(node->_semaphores && !node->_semaphores->to_release.empty()) { + _schedule(worker, node->_release_all()); + } + + // We MUST recover the dependency since the graph may have cycles. + // This must be done before scheduling the successors, otherwise this might cause + // race condition on the _dependents + if((node->_state.load(std::memory_order_relaxed) & Node::CONDITIONED)) { + node->_join_counter = node->num_strong_dependents(); + } + else { + node->_join_counter = node->num_dependents(); + } + + // acquire the parent flow counter + auto& j = (node->_parent) ? 
node->_parent->_join_counter : + node->_topology->_join_counter; + + Node* cache {nullptr}; + + // At this point, the node storage might be destructed (to be verified) + // case 1: non-condition task + switch(node->_handle.index()) { + + // condition and multi-condition tasks + case Node::CONDITION: + case Node::MULTI_CONDITION: { + for(auto cond : conds) { + if(cond >= 0 && static_cast<size_t>(cond) < node->_successors.size()) { + auto s = node->_successors[cond]; + // zeroing the join counter for invariant + s->_join_counter.store(0, std::memory_order_relaxed); + j.fetch_add(1); + if(cache) { + _schedule(worker, cache); + } + cache = s; + } + } + } + break; + + // non-condition task + default: { + for(size_t i=0; i<node->_successors.size(); ++i) { + if(--(node->_successors[i]->_join_counter) == 0) { + j.fetch_add(1); + if(cache) { + _schedule(worker, cache); + } + cache = node->_successors[i]; + } + } + } + break; + } + + // tear_down the invoke + _tear_down_invoke(worker, node); + + // perform tail recursion elimination for the right-most child to reduce + // the number of expensive pop/push operations through the task queue + if(cache) { + node = cache; + //node->_state.fetch_or(Node::READY, std::memory_order_release); + goto invoke_prologue; + } +} + +// Procedure: _tear_down_async +inline void Executor::_tear_down_async(Node* node) { + if(node->_parent) { + node->_parent->_join_counter.fetch_sub(1); + } + else { + _decrement_topology_and_notify(); + } + node_pool.recycle(node); +} + +// Proecdure: _tear_down_invoke +inline void Executor::_tear_down_invoke(Worker& worker, Node* node) { + // we must check parent first before substracting the join counter, + // or it can introduce data race + if(auto parent = node->_parent; parent == nullptr) { + if(node->_topology->_join_counter.fetch_sub(1) == 1) { + _tear_down_topology(worker, node->_topology); + } + } + else { + // prefetch the deferred status, as subtracting the join counter can + // immediately cause the other worker to release the subflow + auto deferred = parent->_state.load(std::memory_order_relaxed) & Node::DEFERRED; + if(parent->_join_counter.fetch_sub(1) == 1 && deferred) { + _schedule(worker, parent); + } + } +} + +// Procedure: _cancel_invoke +inline void Executor::_cancel_invoke(Worker& worker, Node* node) { + + switch(node->_handle.index()) { + // async task needs to carry out the promise + case Node::ASYNC: + std::get_if<Node::Async>(&(node->_handle))->work(true); + _tear_down_async(node); + break; + + // silent async doesn't need to carry out the promise + case Node::SILENT_ASYNC: + _tear_down_async(node); + break; + + // tear down topology if the node is the last leaf + default: { + _tear_down_invoke(worker, node); + } + break; + } +} + +// Procedure: _observer_prologue +inline void Executor::_observer_prologue(Worker& worker, Node* node) { + for(auto& observer : _observers) { + observer->on_entry(WorkerView(worker), TaskView(*node)); + } +} + +// Procedure: _observer_epilogue +inline void Executor::_observer_epilogue(Worker& worker, Node* node) { + for(auto& observer : _observers) { + observer->on_exit(WorkerView(worker), TaskView(*node)); + } +} + +// Procedure: _invoke_static_task +inline void Executor::_invoke_static_task(Worker& worker, Node* node) { + _observer_prologue(worker, node); + std::get_if<Node::Static>(&node->_handle)->work(); + _observer_epilogue(worker, node); +} + +// Procedure: _invoke_dynamic_task +inline void Executor::_invoke_dynamic_task(Worker& w, Node* node) { + + _observer_prologue(w, node); 
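+
+  // Clear the subgraph left over from a previous run, hand the user callable a
+  // fresh Subflow bound to this node, and join the spawned graph here unless the
+  // callable already joined or detached it (sf._joinable becomes false).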
+ + auto handle = std::get_if<Node::Dynamic>(&node->_handle); + + handle->subgraph._clear(); + + Subflow sf(*this, w, node, handle->subgraph); + + handle->work(sf); + + if(sf._joinable) { + _invoke_dynamic_task_internal(w, node, handle->subgraph); + } + + _observer_epilogue(w, node); +} + +// Procedure: _invoke_dynamic_task_external +inline void Executor::_invoke_dynamic_task_external( + Worker& w, Node* p, Graph& g, bool detach +) { + + // graph is empty and has no async tasks + if(g.empty() && p->_join_counter == 0) { + return; + } + + SmallVector<Node*> src; + + for(auto n : g._nodes) { + + n->_topology = p->_topology; + n->_state.store(0, std::memory_order_relaxed); + n->_set_up_join_counter(); + + if(detach) { + n->_parent = nullptr; + n->_state.fetch_or(Node::DETACHED, std::memory_order_relaxed); + } + else { + n->_parent = p; + } + + if(n->num_dependents() == 0) { + src.push_back(n); + } + } + + // detach here + if(detach) { + + { + std::lock_guard<std::mutex> lock(p->_topology->_taskflow._mutex); + p->_topology->_taskflow._graph._merge(std::move(g)); + } + + p->_topology->_join_counter.fetch_add(src.size()); + _schedule(w, src); + } + // join here + else { + p->_join_counter.fetch_add(src.size()); + _schedule(w, src); + _consume_task(w, p); + } +} + +// Procedure: _invoke_dynamic_task_internal +inline void Executor::_invoke_dynamic_task_internal( + Worker& w, Node* p, Graph& g +) { + + // graph is empty and has no async tasks + if(g.empty() && p->_join_counter == 0) { + return; + } + + SmallVector<Node*> src; + + for(auto n : g._nodes) { + n->_topology = p->_topology; + n->_state.store(0, std::memory_order_relaxed); + n->_set_up_join_counter(); + n->_parent = p; + if(n->num_dependents() == 0) { + src.push_back(n); + } + } + p->_join_counter.fetch_add(src.size()); + _schedule(w, src); + _consume_task(w, p); +} + +// Procedure: _invoke_module_task_internal +inline void Executor::_invoke_module_task_internal( + Worker& w, Node* p, Graph& g, bool& deferred +) { + + // graph is empty and has no async tasks + if(g.empty()) { + return; + } + + // set deferred + deferred = true; + p->_state.fetch_or(Node::DEFERRED, std::memory_order_relaxed); + + SmallVector<Node*> src; + + for(auto n : g._nodes) { + n->_topology = p->_topology; + n->_state.store(0, std::memory_order_relaxed); + n->_set_up_join_counter(); + n->_parent = p; + if(n->num_dependents() == 0) { + src.push_back(n); + } + } + p->_join_counter.fetch_add(src.size()); + _schedule(w, src); +} + +// Procedure: _invoke_condition_task +inline void Executor::_invoke_condition_task( + Worker& worker, Node* node, SmallVector<int>& conds +) { + _observer_prologue(worker, node); + conds = { std::get_if<Node::Condition>(&node->_handle)->work() }; + _observer_epilogue(worker, node); +} + +// Procedure: _invoke_multi_condition_task +inline void Executor::_invoke_multi_condition_task( + Worker& worker, Node* node, SmallVector<int>& conds +) { + _observer_prologue(worker, node); + conds = std::get_if<Node::MultiCondition>(&node->_handle)->work(); + _observer_epilogue(worker, node); +} + +// Procedure: _invoke_cudaflow_task +inline void Executor::_invoke_cudaflow_task(Worker& worker, Node* node) { + _observer_prologue(worker, node); + std::get_if<Node::cudaFlow>(&node->_handle)->work(*this, node); + _observer_epilogue(worker, node); +} + +// Procedure: _invoke_syclflow_task +inline void Executor::_invoke_syclflow_task(Worker& worker, Node* node) { + _observer_prologue(worker, node); + std::get_if<Node::syclFlow>(&node->_handle)->work(*this, node); 
+ _observer_epilogue(worker, node); +} + +// Procedure: _invoke_module_task +inline void Executor::_invoke_module_task(Worker& w, Node* node, bool& deferred) { + _observer_prologue(w, node); + _invoke_module_task_internal( + w, node, std::get_if<Node::Module>(&node->_handle)->graph, deferred + ); + _observer_epilogue(w, node); +} + +// Procedure: _invoke_async_task +inline void Executor::_invoke_async_task(Worker& w, Node* node) { + _observer_prologue(w, node); + std::get_if<Node::Async>(&node->_handle)->work(false); + _observer_epilogue(w, node); +} + +// Procedure: _invoke_silent_async_task +inline void Executor::_invoke_silent_async_task(Worker& w, Node* node) { + _observer_prologue(w, node); + std::get_if<Node::SilentAsync>(&node->_handle)->work(); + _observer_epilogue(w, node); +} + +// Procedure: _invoke_runtime_task +inline void Executor::_invoke_runtime_task(Worker& w, Node* node) { + _observer_prologue(w, node); + Runtime rt(*this, w, node); + std::get_if<Node::Runtime>(&node->_handle)->work(rt); + _observer_epilogue(w, node); +} + +// Function: run +inline tf::Future<void> Executor::run(Taskflow& f) { + return run_n(f, 1, [](){}); +} + +// Function: run +inline tf::Future<void> Executor::run(Taskflow&& f) { + return run_n(std::move(f), 1, [](){}); +} + +// Function: run +template <typename C> +tf::Future<void> Executor::run(Taskflow& f, C&& c) { + return run_n(f, 1, std::forward<C>(c)); +} + +// Function: run +template <typename C> +tf::Future<void> Executor::run(Taskflow&& f, C&& c) { + return run_n(std::move(f), 1, std::forward<C>(c)); +} + +// Function: run_n +inline tf::Future<void> Executor::run_n(Taskflow& f, size_t repeat) { + return run_n(f, repeat, [](){}); +} + +// Function: run_n +inline tf::Future<void> Executor::run_n(Taskflow&& f, size_t repeat) { + return run_n(std::move(f), repeat, [](){}); +} + +// Function: run_n +template <typename C> +tf::Future<void> Executor::run_n(Taskflow& f, size_t repeat, C&& c) { + return run_until( + f, [repeat]() mutable { return repeat-- == 0; }, std::forward<C>(c) + ); +} + +// Function: run_n +template <typename C> +tf::Future<void> Executor::run_n(Taskflow&& f, size_t repeat, C&& c) { + return run_until( + std::move(f), [repeat]() mutable { return repeat-- == 0; }, std::forward<C>(c) + ); +} + +// Function: run_until +template<typename P> +tf::Future<void> Executor::run_until(Taskflow& f, P&& pred) { + return run_until(f, std::forward<P>(pred), [](){}); +} + +// Function: run_until +template<typename P> +tf::Future<void> Executor::run_until(Taskflow&& f, P&& pred) { + return run_until(std::move(f), std::forward<P>(pred), [](){}); +} + +// Function: run_until +template <typename P, typename C> +tf::Future<void> Executor::run_until(Taskflow& f, P&& p, C&& c) { + + _increment_topology(); + + // Need to check the empty under the lock since dynamic task may + // define detached blocks that modify the taskflow at the same time + bool empty; + { + std::lock_guard<std::mutex> lock(f._mutex); + empty = f.empty(); + } + + // No need to create a real topology but returns an dummy future + if(empty || p()) { + c(); + std::promise<void> promise; + promise.set_value(); + _decrement_topology_and_notify(); + return tf::Future<void>(promise.get_future(), std::monostate{}); + } + + // create a topology for this run + auto t = std::make_shared<Topology>(f, std::forward<P>(p), std::forward<C>(c)); + + // need to create future before the topology got torn down quickly + tf::Future<void> future(t->_promise.get_future(), t); + + // modifying topology 
needs to be protected under the lock + { + std::lock_guard<std::mutex> lock(f._mutex); + f._topologies.push(t); + if(f._topologies.size() == 1) { + _set_up_topology(_this_worker(), t.get()); + } + } + + return future; +} + +// Function: run_until +template <typename P, typename C> +tf::Future<void> Executor::run_until(Taskflow&& f, P&& pred, C&& c) { + + std::list<Taskflow>::iterator itr; + + { + std::scoped_lock<std::mutex> lock(_taskflow_mutex); + itr = _taskflows.emplace(_taskflows.end(), std::move(f)); + itr->_satellite = itr; + } + + return run_until(*itr, std::forward<P>(pred), std::forward<C>(c)); +} + +// Procedure: _increment_topology +inline void Executor::_increment_topology() { + std::lock_guard<std::mutex> lock(_topology_mutex); + ++_num_topologies; +} + +// Procedure: _decrement_topology_and_notify +inline void Executor::_decrement_topology_and_notify() { + std::lock_guard<std::mutex> lock(_topology_mutex); + if(--_num_topologies == 0) { + _topology_cv.notify_all(); + } +} + +// Procedure: _decrement_topology +inline void Executor::_decrement_topology() { + std::lock_guard<std::mutex> lock(_topology_mutex); + --_num_topologies; +} + +// Procedure: wait_for_all +inline void Executor::wait_for_all() { + std::unique_lock<std::mutex> lock(_topology_mutex); + _topology_cv.wait(lock, [&](){ return _num_topologies == 0; }); +} + +// Function: _set_up_topology +inline void Executor::_set_up_topology(Worker* worker, Topology* tpg) { + + // ---- under taskflow lock ---- + + tpg->_sources.clear(); + tpg->_taskflow._graph._clear_detached(); + + // scan each node in the graph and build up the links + for(auto node : tpg->_taskflow._graph._nodes) { + + node->_topology = tpg; + node->_state.store(0, std::memory_order_relaxed); + + if(node->num_dependents() == 0) { + tpg->_sources.push_back(node); + } + + node->_set_up_join_counter(); + } + + tpg->_join_counter = tpg->_sources.size(); + + if(worker) { + _schedule(*worker, tpg->_sources); + } + else { + _schedule(tpg->_sources); + } +} + +// Function: _tear_down_topology +inline void Executor::_tear_down_topology(Worker& worker, Topology* tpg) { + + auto &f = tpg->_taskflow; + + //assert(&tpg == &(f._topologies.front())); + + // case 1: we still need to run the topology again + if(!tpg->_is_cancelled && !tpg->_pred()) { + //assert(tpg->_join_counter == 0); + std::lock_guard<std::mutex> lock(f._mutex); + tpg->_join_counter = tpg->_sources.size(); + _schedule(worker, tpg->_sources); + } + // case 2: the final run of this topology + else { + + // TODO: if the topology is cancelled, need to release all semaphores + + if(tpg->_call != nullptr) { + tpg->_call(); + } + + // If there is another run (interleave between lock) + if(std::unique_lock<std::mutex> lock(f._mutex); f._topologies.size()>1) { + //assert(tpg->_join_counter == 0); + + // Set the promise + tpg->_promise.set_value(); + f._topologies.pop(); + tpg = f._topologies.front().get(); + + // decrement the topology but since this is not the last we don't notify + _decrement_topology(); + + // set up topology needs to be under the lock or it can + // introduce memory order error with pop + _set_up_topology(&worker, tpg); + } + else { + //assert(f._topologies.size() == 1); + + // Need to back up the promise first here becuz taskflow might be + // destroy soon after calling get + auto p {std::move(tpg->_promise)}; + + // Back up lambda capture in case it has the topology pointer, + // to avoid it releasing on pop_front ahead of _mutex.unlock & + // _promise.set_value. 
Released safely when leaving scope. + auto c {std::move(tpg->_call)}; + + // Get the satellite if any + auto s {f._satellite}; + + // Now we remove the topology from this taskflow + f._topologies.pop(); + + //f._mutex.unlock(); + lock.unlock(); + + // We set the promise in the end in case taskflow leaves the scope. + // After set_value, the caller will return from wait + p.set_value(); + + _decrement_topology_and_notify(); + + // remove the taskflow if it is managed by the executor + // TODO: in the future, we may need to synchronize on wait + // (which means the following code should the moved before set_value) + if(s) { + std::scoped_lock<std::mutex> lock(_taskflow_mutex); + _taskflows.erase(*s); + } + } + } +} + +// ############################################################################ +// Forward Declaration: Subflow +// ############################################################################ + +inline void Subflow::join() { + + // assert(this_worker().worker == &_worker); + + if(!_joinable) { + TF_THROW("subflow not joinable"); + } + + // only the parent worker can join the subflow + _executor._invoke_dynamic_task_external(_worker, _parent, _graph, false); + _joinable = false; +} + +inline void Subflow::detach() { + + // assert(this_worker().worker == &_worker); + + if(!_joinable) { + TF_THROW("subflow already joined or detached"); + } + + // only the parent worker can detach the subflow + _executor._invoke_dynamic_task_external(_worker, _parent, _graph, true); + _joinable = false; +} + +// Function: named_async +template <typename F, typename... ArgsT> +auto Subflow::named_async(const std::string& name, F&& f, ArgsT&&... args) { + return _named_async( + *_executor._this_worker(), name, std::forward<F>(f), std::forward<ArgsT>(args)... + ); +} + +// Function: _named_async +template <typename F, typename... ArgsT> +auto Subflow::_named_async( + Worker& w, + const std::string& name, + F&& f, + ArgsT&&... args +) { + + _parent->_join_counter.fetch_add(1); + + using T = std::invoke_result_t<F, ArgsT...>; + using R = std::conditional_t<std::is_same_v<T, void>, void, std::optional<T>>; + + std::promise<R> p; + + auto tpg = std::make_shared<AsyncTopology>(); + + Future<R> fu(p.get_future(), tpg); + + auto node = node_pool.animate( + std::in_place_type_t<Node::Async>{}, + [p=make_moc(std::move(p)), f=std::forward<F>(f), args...] + (bool cancel) mutable { + if constexpr(std::is_same_v<R, void>) { + if(!cancel) { + f(args...); + } + p.object.set_value(); + } + else { + p.object.set_value(cancel ? std::nullopt : std::make_optional(f(args...))); + } + }, + std::move(tpg) + ); + + node->_name = name; + node->_topology = _parent->_topology; + node->_parent = _parent; + + _executor._schedule(w, node); + + return fu; +} + +// Function: async +template <typename F, typename... ArgsT> +auto Subflow::async(F&& f, ArgsT&&... args) { + return named_async("", std::forward<F>(f), std::forward<ArgsT>(args)...); +} + +// Function: _named_silent_async +template <typename F, typename... ArgsT> +void Subflow::_named_silent_async( + Worker& w, const std::string& name, F&& f, ArgsT&&... args +) { + + _parent->_join_counter.fetch_add(1); + + auto node = node_pool.animate( + std::in_place_type_t<Node::SilentAsync>{}, + [f=std::forward<F>(f), args...] () mutable { + f(args...); + } + ); + + node->_name = name; + node->_topology = _parent->_topology; + node->_parent = _parent; + + _executor._schedule(w, node); +} + +// Function: silent_async +template <typename F, typename... 
ArgsT> +void Subflow::named_silent_async(const std::string& name, F&& f, ArgsT&&... args) { + _named_silent_async( + *_executor._this_worker(), name, std::forward<F>(f), std::forward<ArgsT>(args)... + ); +} + +// Function: named_silent_async +template <typename F, typename... ArgsT> +void Subflow::silent_async(F&& f, ArgsT&&... args) { + named_silent_async("", std::forward<F>(f), std::forward<ArgsT>(args)...); +} + +// ############################################################################ +// Forward Declaration: Runtime +// ############################################################################ + +// Procedure: schedule +inline void Runtime::schedule(Task task) { + auto node = task._node; + auto& j = node->_parent ? node->_parent->_join_counter : + node->_topology->_join_counter; + j.fetch_add(1); + _executor._schedule(_worker, node); +} + +// Procedure: run +template <typename C> +void Runtime::run(C&& callable) { + + // dynamic task (subflow) + if constexpr(is_dynamic_task_v<C>) { + Graph graph; + Subflow sf(_executor, _worker, _parent, graph); + callable(sf); + if(sf._joinable) { + _executor._invoke_dynamic_task_internal(_worker, _parent, graph); + } + } + else { + static_assert(dependent_false_v<C>, "unsupported task callable to run"); + } +} + +} // end of namespace tf ----------------------------------------------------- + + + + + + + + diff --git a/myxpcs/include/taskflow_/core/executor.hpp b/myxpcs/include/taskflow_/core/executor.hpp new file mode 100644 index 0000000..2a549cc --- /dev/null +++ b/myxpcs/include/taskflow_/core/executor.hpp @@ -0,0 +1,2385 @@ +#pragma once + +#include "observer.hpp" +#include "taskflow.hpp" +#include "async_task.hpp" + +/** +@file executor.hpp +@brief executor include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// Executor Definition +// ---------------------------------------------------------------------------- + +/** @class Executor + +@brief class to create an executor for running a taskflow graph + +An executor manages a set of worker threads to run one or multiple taskflows +using an efficient work-stealing scheduling algorithm. + +@code{.cpp} +// Declare an executor and a taskflow +tf::Executor executor; +tf::Taskflow taskflow; + +// Add three tasks into the taskflow +tf::Task A = taskflow.emplace([] () { std::cout << "This is TaskA\n"; }); +tf::Task B = taskflow.emplace([] () { std::cout << "This is TaskB\n"; }); +tf::Task C = taskflow.emplace([] () { std::cout << "This is TaskC\n"; }); + +// Build precedence between tasks +A.precede(B, C); + +tf::Future<void> fu = executor.run(taskflow); +fu.wait(); // block until the execution completes + +executor.run(taskflow, [](){ std::cout << "end of 1 run"; }).wait(); +executor.run_n(taskflow, 4); +executor.wait_for_all(); // block until all associated executions finish +executor.run_n(taskflow, 4, [](){ std::cout << "end of 4 runs"; }).wait(); +executor.run_until(taskflow, [cnt=0] () mutable { return ++cnt == 10; }); +@endcode + +All the @c run methods are @em thread-safe. You can submit multiple +taskflows at the same time to an executor from different threads. +*/ +class Executor { + + friend class FlowBuilder; + friend class Subflow; + friend class Runtime; + + public: + + /** + @brief constructs the executor with @c N worker threads + + @param N the number of workers (default std::thread::hardware_concurrency) + + The constructor spawns @c N worker threads to run tasks in a + work-stealing loop. 
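+  For illustration:
+
+  @code{.cpp}
+  tf::Executor e1;     // one worker per hardware thread (the default)
+  tf::Executor e2(4);  // exactly four workers
+  @endcode
+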
The number of workers must be greater than zero + or an exception will be thrown. + By default, the number of worker threads is equal to the maximum + hardware concurrency returned by std::thread::hardware_concurrency. + */ + explicit Executor(size_t N = std::thread::hardware_concurrency()); + + /** + @brief destructs the executor + + The destructor calls Executor::wait_for_all to wait for all submitted + taskflows to complete and then notifies all worker threads to stop + and join these threads. + */ + ~Executor(); + + /** + @brief runs a taskflow once + + @param taskflow a tf::Taskflow object + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow once and returns a tf::Future + object that eventually holds the result of the execution. + + @code{.cpp} + tf::Future<void> future = executor.run(taskflow); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + tf::Future<void> run(Taskflow& taskflow); + + /** + @brief runs a moved taskflow once + + @param taskflow a moved tf::Taskflow object + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow once and returns a tf::Future + object that eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future<void> future = executor.run(std::move(taskflow)); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + tf::Future<void> run(Taskflow&& taskflow); + + /** + @brief runs a taskflow once and invoke a callback upon completion + + @param taskflow a tf::Taskflow object + @param callable a callable object to be invoked after this run + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow once and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future<void> future = executor.run(taskflow, [](){ std::cout << "done"; }); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + template<typename C> + tf::Future<void> run(Taskflow& taskflow, C&& callable); + + /** + @brief runs a moved taskflow once and invoke a callback upon completion + + @param taskflow a moved tf::Taskflow object + @param callable a callable object to be invoked after this run + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow once and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future<void> future = executor.run( + std::move(taskflow), [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. 
+ */ + template<typename C> + tf::Future<void> run(Taskflow&& taskflow, C&& callable); + + /** + @brief runs a taskflow for @c N times + + @param taskflow a tf::Taskflow object + @param N number of runs + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow @c N times and returns a tf::Future + object that eventually holds the result of the execution. + + @code{.cpp} + tf::Future<void> future = executor.run_n(taskflow, 2); // run taskflow 2 times + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + tf::Future<void> run_n(Taskflow& taskflow, size_t N); + + /** + @brief runs a moved taskflow for @c N times + + @param taskflow a moved tf::Taskflow object + @param N number of runs + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow @c N times and returns a tf::Future + object that eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future<void> future = executor.run_n( + std::move(taskflow), 2 // run the moved taskflow 2 times + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + tf::Future<void> run_n(Taskflow&& taskflow, size_t N); + + /** + @brief runs a taskflow for @c N times and then invokes a callback + + @param taskflow a tf::Taskflow + @param N number of runs + @param callable a callable object to be invoked after this run + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow @c N times and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future<void> future = executor.run( + taskflow, 2, [](){ std::cout << "done"; } // runs taskflow 2 times and invoke + // the lambda to print "done" + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + template<typename C> + tf::Future<void> run_n(Taskflow& taskflow, size_t N, C&& callable); + + /** + @brief runs a moved taskflow for @c N times and then invokes a callback + + @param taskflow a moved tf::Taskflow + @param N number of runs + @param callable a callable object to be invoked after this run + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow @c N times and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future<void> future = executor.run_n( + // run the moved taskflow 2 times and invoke the lambda to print "done" + std::move(taskflow), 2, [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. 
+ */ + template<typename C> + tf::Future<void> run_n(Taskflow&& taskflow, size_t N, C&& callable); + + /** + @brief runs a taskflow multiple times until the predicate becomes true + + @param taskflow a tf::Taskflow + @param pred a boolean predicate to return @c true for stop + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow multiple times until + the predicate returns @c true. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future<void> future = executor.run_until( + taskflow, [](){ return rand()%10 == 0 } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + template<typename P> + tf::Future<void> run_until(Taskflow& taskflow, P&& pred); + + /** + @brief runs a moved taskflow and keeps running it + until the predicate becomes true + + @param taskflow a moved tf::Taskflow object + @param pred a boolean predicate to return @c true for stop + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow multiple times until + the predicate returns @c true. + This member function returns a tf::Future object that + eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future<void> future = executor.run_until( + std::move(taskflow), [](){ return rand()%10 == 0 } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + template<typename P> + tf::Future<void> run_until(Taskflow&& taskflow, P&& pred); + + /** + @brief runs a taskflow multiple times until the predicate becomes true and + then invokes the callback + + @param taskflow a tf::Taskflow + @param pred a boolean predicate to return @c true for stop + @param callable a callable object to be invoked after this run completes + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow multiple times until + the predicate returns @c true and then invokes the given callable when + the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future<void> future = executor.run_until( + taskflow, [](){ return rand()%10 == 0 }, [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + template<typename P, typename C> + tf::Future<void> run_until(Taskflow& taskflow, P&& pred, C&& callable); + + /** + @brief runs a moved taskflow and keeps running + it until the predicate becomes true and then invokes the callback + + @param taskflow a moved tf::Taskflow + @param pred a boolean predicate to return @c true for stop + @param callable a callable object to be invoked after this run completes + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow multiple times until + the predicate returns @c true and then invokes the given callable when + the execution completes. 
+ This member function returns a tf::Future object that + eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future<void> future = executor.run_until( + std::move(taskflow), + [](){ return rand()%10 == 0 }, [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + template<typename P, typename C> + tf::Future<void> run_until(Taskflow&& taskflow, P&& pred, C&& callable); + + /** + @brief runs a target graph and waits until it completes using + an internal worker of this executor + + @tparam T target type which has `tf::Graph& T::graph()` defined + @param target the target task graph object + + The method runs a target graph which has `tf::Graph& T::graph()` defined + and waits until the execution completes. + Unlike the typical flow of calling `tf::Executor::run` series + plus waiting on the result, this method must be called by an internal + worker of this executor. The caller worker will participate in + the work-stealing loop of the scheduler, therby avoiding potential + deadlock caused by blocked waiting. + + @code{.cpp} + tf::Executor executor(2); + tf::Taskflow taskflow; + std::array<tf::Taskflow, 1000> others; + + std::atomic<size_t> counter{0}; + + for(size_t n=0; n<1000; n++) { + for(size_t i=0; i<1000; i++) { + others[n].emplace([&](){ counter++; }); + } + taskflow.emplace([&executor, &tf=others[n]](){ + executor.corun(tf); + //executor.run(tf).wait(); <- blocking the worker without doing anything + // will introduce deadlock + }); + } + executor.run(taskflow).wait(); + @endcode + + The method is thread-safe as long as the target is not concurrently + ran by two or more threads. + + @attention + You must call tf::Executor::corun from a worker of the calling executor + or an exception will be thrown. + */ + template <typename T> + void corun(T& target); + + /** + @brief keeps running the work-stealing loop until the predicate becomes true + + @tparam P predicate type + @param predicate a boolean predicate to indicate when to stop the loop + + The method keeps the caller worker running in the work-stealing loop + until the stop predicate becomes true. + + @code{.cpp} + taskflow.emplace([&](){ + std::future<void> fu = std::async([](){ std::sleep(100s); }); + executor.corun_until([](){ + return fu.wait_for(std::chrono::seconds(0)) == future_status::ready; + }); + }); + @endcode + + @attention + You must call tf::Executor::corun_until from a worker of the calling executor + or an exception will be thrown. + */ + template <typename P> + void corun_until(P&& predicate); + + /** + @brief waits for all tasks to complete + + This member function waits until all submitted tasks + (e.g., taskflows, asynchronous tasks) to finish. + + @code{.cpp} + executor.run(taskflow1); + executor.run_n(taskflow2, 10); + executor.run_n(taskflow3, 100); + executor.wait_for_all(); // wait until the above submitted taskflows finish + @endcode + */ + void wait_for_all(); + + /** + @brief queries the number of worker threads + + Each worker represents one unique thread spawned by an executor + upon its construction time. 
+ + @code{.cpp} + tf::Executor executor(4); + std::cout << executor.num_workers(); // 4 + @endcode + */ + size_t num_workers() const noexcept; + + /** + @brief queries the number of running topologies at the time of this call + + When a taskflow is submitted to an executor, a topology is created to store + runtime metadata of the running taskflow. + When the execution of the submitted taskflow finishes, + its corresponding topology will be removed from the executor. + + @code{.cpp} + executor.run(taskflow); + std::cout << executor.num_topologies(); // 0 or 1 (taskflow still running) + @endcode + */ + size_t num_topologies() const; + + /** + @brief queries the number of running taskflows with moved ownership + + @code{.cpp} + executor.run(std::move(taskflow)); + std::cout << executor.num_taskflows(); // 0 or 1 (taskflow still running) + @endcode + */ + size_t num_taskflows() const; + + /** + @brief queries the id of the caller thread in this executor + + Each worker has an unique id in the range of @c 0 to @c N-1 associated with + its parent executor. + If the caller thread does not belong to the executor, @c -1 is returned. + + @code{.cpp} + tf::Executor executor(4); // 4 workers in the executor + executor.this_worker_id(); // -1 (main thread is not a worker) + + taskflow.emplace([&](){ + std::cout << executor.this_worker_id(); // 0, 1, 2, or 3 + }); + executor.run(taskflow); + @endcode + */ + int this_worker_id() const; + + // -------------------------------------------------------------------------- + // Observer methods + // -------------------------------------------------------------------------- + + /** + @brief constructs an observer to inspect the activities of worker threads + + @tparam Observer observer type derived from tf::ObserverInterface + @tparam ArgsT argument parameter pack + + @param args arguments to forward to the constructor of the observer + + @return a shared pointer to the created observer + + Each executor manages a list of observers with shared ownership with callers. + For each of these observers, the two member functions, + tf::ObserverInterface::on_entry and tf::ObserverInterface::on_exit + will be called before and after the execution of a task. + + This member function is not thread-safe. + */ + template <typename Observer, typename... ArgsT> + std::shared_ptr<Observer> make_observer(ArgsT&&... args); + + /** + @brief removes an observer from the executor + + This member function is not thread-safe. + */ + template <typename Observer> + void remove_observer(std::shared_ptr<Observer> observer); + + /** + @brief queries the number of observers + */ + size_t num_observers() const noexcept; + + // -------------------------------------------------------------------------- + // Async Task Methods + // -------------------------------------------------------------------------- + + /** + @brief runs a given function asynchronously + + @tparam F callable type + + @param func callable object + + @return a @std_future that will hold the result of the execution + + The method creates an asynchronous task to run the given function + and return a @std_future object that eventually will hold the result + of the return value. + + @code{.cpp} + std::future<int> future = executor.async([](){ + std::cout << "create an asynchronous task and returns 1\n"; + return 1; + }); + future.get(); + @endcode + + This member function is thread-safe. 
+ */ + template <typename F> + auto async(F&& func); + + /** + @brief runs a given function asynchronously and gives a name to this task + + @tparam F callable type + + @param name name of the asynchronous task + @param func callable object + + @return a @std_future that will hold the result of the execution + + The method creates and assigns a name to an asynchronous task + to run the given function, + returning @std_future object that eventually will hold the result + Assigned task names will appear in the observers of the executor. + + @code{.cpp} + std::future<int> future = executor.async("name", [](){ + std::cout << "create an asynchronous task with a name and returns 1\n"; + return 1; + }); + future.get(); + @endcode + + This member function is thread-safe. + */ + template <typename F> + auto async(const std::string& name, F&& func); + + /** + @brief similar to tf::Executor::async but does not return a future object + + @tparam F callable type + + @param func callable object + + This member function is more efficient than tf::Executor::async + and is encouraged to use when you do not want a @std_future to + acquire the result or synchronize the execution. + + @code{.cpp} + executor.silent_async([](){ + std::cout << "create an asynchronous task with no return\n"; + }); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. + */ + template <typename F> + void silent_async(F&& func); + + /** + @brief similar to tf::Executor::async but does not return a future object + + @tparam F callable type + + @param name assigned name to the task + @param func callable object + + This member function is more efficient than tf::Executor::async + and is encouraged to use when you do not want a @std_future to + acquire the result or synchronize the execution. + Assigned task names will appear in the observers of the executor. + + @code{.cpp} + executor.silent_async("name", [](){ + std::cout << "create an asynchronous task with a name and no return\n"; + }); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. + */ + template <typename F> + void silent_async(const std::string& name, F&& func); + + // -------------------------------------------------------------------------- + // Silent Dependent Async Methods + // -------------------------------------------------------------------------- + + /** + @brief runs the given function asynchronously + when the given dependents finish + + @tparam F callable type + @tparam Tasks task types convertible to tf::AsyncTask + + @param func callable object + @param tasks asynchronous tasks on which this execution depends + + @return a tf::AsyncTask handle + + This member function is more efficient than tf::Executor::dependent_async + and is encouraged to use when you do not want a @std_future to + acquire the result or synchronize the execution. + The example below creates three asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + + @code{.cpp} + tf::AsyncTask A = executor.silent_dependent_async([](){ printf("A\n"); }); + tf::AsyncTask B = executor.silent_dependent_async([](){ printf("B\n"); }); + executor.silent_dependent_async([](){ printf("C runs after A and B\n"); }, A, B); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. + */ + template <typename F, typename... Tasks, + std::enable_if_t<all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>* = nullptr + > + tf::AsyncTask silent_dependent_async(F&& func, Tasks&&... 
tasks); + + /** + @brief names and runs the given function asynchronously + when the given dependents finish + + @tparam F callable type + @tparam Tasks task types convertible to tf::AsyncTask + + @param name assigned name to the task + @param func callable object + @param tasks asynchronous tasks on which this execution depends + + @return a tf::AsyncTask handle + + This member function is more efficient than tf::Executor::dependent_async + and is encouraged to use when you do not want a @std_future to + acquire the result or synchronize the execution. + The example below creates three asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + Assigned task names will appear in the observers of the executor. + + @code{.cpp} + tf::AsyncTask A = executor.silent_dependent_async("A", [](){ printf("A\n"); }); + tf::AsyncTask B = executor.silent_dependent_async("B", [](){ printf("B\n"); }); + executor.silent_dependent_async( + "C", [](){ printf("C runs after A and B\n"); }, A, B + ); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. + */ + template <typename F, typename... Tasks, + std::enable_if_t<all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>* = nullptr + > + tf::AsyncTask silent_dependent_async(const std::string& name, F&& func, Tasks&&... tasks); + + /** + @brief runs the given function asynchronously + when the given range of dependents finish + + @tparam F callable type + @tparam I iterator type + + @param func callable object + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + + @return a tf::AsyncTask handle + + This member function is more efficient than tf::Executor::dependent_async + and is encouraged to use when you do not want a @std_future to + acquire the result or synchronize the execution. + The example below creates three asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + + @code{.cpp} + std::array<tf::AsyncTask, 2> array { + executor.silent_dependent_async([](){ printf("A\n"); }), + executor.silent_dependent_async([](){ printf("B\n"); }) + }; + executor.silent_dependent_async( + [](){ printf("C runs after A and B\n"); }, array.begin(), array.end() + ); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. + */ + template <typename F, typename I, + std::enable_if_t<!std::is_same_v<std::decay_t<I>, AsyncTask>, void>* = nullptr + > + tf::AsyncTask silent_dependent_async(F&& func, I first, I last); + + /** + @brief names and runs the given function asynchronously + when the given range of dependents finish + + @tparam F callable type + @tparam I iterator type + + @param name assigned name to the task + @param func callable object + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + + @return a tf::AsyncTask handle + + This member function is more efficient than tf::Executor::dependent_async + and is encouraged to use when you do not want a @std_future to + acquire the result or synchronize the execution. + The example below creates three asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + Assigned task names will appear in the observers of the executor. 
+ + @code{.cpp} + std::array<tf::AsyncTask, 2> array { + executor.silent_dependent_async("A", [](){ printf("A\n"); }), + executor.silent_dependent_async("B", [](){ printf("B\n"); }) + }; + executor.silent_dependent_async( + "C", [](){ printf("C runs after A and B\n"); }, array.begin(), array.end() + ); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. + */ + template <typename F, typename I, + std::enable_if_t<!std::is_same_v<std::decay_t<I>, AsyncTask>, void>* = nullptr + > + tf::AsyncTask silent_dependent_async(const std::string& name, F&& func, I first, I last); + + // -------------------------------------------------------------------------- + // Dependent Async Methods + // -------------------------------------------------------------------------- + + /** + @brief runs the given function asynchronously + when the given dependents finish + + @tparam F callable type + @tparam Tasks task types convertible to tf::AsyncTask + + @param func callable object + @param tasks asynchronous tasks on which this execution depends + + @return a pair of a tf::AsyncTask handle and + a @std_future that holds the result of the execution + + The example below creates three asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + Task @c C returns a pair of its tf::AsyncTask handle and a std::future<int> + that eventually will hold the result of the execution. + + @code{.cpp} + tf::AsyncTask A = executor.silent_dependent_async([](){ printf("A\n"); }); + tf::AsyncTask B = executor.silent_dependent_async([](){ printf("B\n"); }); + auto [C, fuC] = executor.dependent_async( + [](){ + printf("C runs after A and B\n"); + return 1; + }, + A, B + ); + fuC.get(); // C finishes, which in turns means both A and B finish + @endcode + + You can mixed the use of tf::AsyncTask handles + returned by Executor::dependent_async and Executor::silent_dependent_async + when specifying task dependencies. + + This member function is thread-safe. + */ + template <typename F, typename... Tasks, + std::enable_if_t<all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>* = nullptr + > + auto dependent_async(F&& func, Tasks&&... tasks); + + /** + @brief names and runs the given function asynchronously + when the given dependents finish + + @tparam F callable type + @tparam Tasks task types convertible to tf::AsyncTask + + @param name assigned name to the task + @param func callable object + @param tasks asynchronous tasks on which this execution depends + + @return a pair of a tf::AsyncTask handle and + a @std_future that holds the result of the execution + + The example below creates three named asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + Task @c C returns a pair of its tf::AsyncTask handle and a std::future<int> + that eventually will hold the result of the execution. + Assigned task names will appear in the observers of the executor. + + @code{.cpp} + tf::AsyncTask A = executor.silent_dependent_async("A", [](){ printf("A\n"); }); + tf::AsyncTask B = executor.silent_dependent_async("B", [](){ printf("B\n"); }); + auto [C, fuC] = executor.dependent_async( + "C", + [](){ + printf("C runs after A and B\n"); + return 1; + }, + A, B + ); + assert(fuC.get()==1); // C finishes, which in turns means both A and B finish + @endcode + + You can mixed the use of tf::AsyncTask handles + returned by Executor::dependent_async and Executor::silent_dependent_async + when specifying task dependencies. 
+ + This member function is thread-safe. + */ + template <typename F, typename... Tasks, + std::enable_if_t<all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>* = nullptr + > + auto dependent_async(const std::string& name, F&& func, Tasks&&... tasks); + + /** + @brief runs the given function asynchronously + when the given range of dependents finish + + @tparam F callable type + @tparam I iterator type + + @param func callable object + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + + @return a pair of a tf::AsyncTask handle and + a @std_future that holds the result of the execution + + The example below creates three asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + Task @c C returns a pair of its tf::AsyncTask handle and a std::future<int> + that eventually will hold the result of the execution. + + @code{.cpp} + std::array<tf::AsyncTask, 2> array { + executor.silent_dependent_async([](){ printf("A\n"); }), + executor.silent_dependent_async([](){ printf("B\n"); }) + }; + auto [C, fuC] = executor.dependent_async( + [](){ + printf("C runs after A and B\n"); + return 1; + }, + array.begin(), array.end() + ); + assert(fuC.get()==1); // C finishes, which in turns means both A and B finish + @endcode + + You can mixed the use of tf::AsyncTask handles + returned by Executor::dependent_async and Executor::silent_dependent_async + when specifying task dependencies. + + This member function is thread-safe. + */ + template <typename F, typename I, + std::enable_if_t<!std::is_same_v<std::decay_t<I>, AsyncTask>, void>* = nullptr + > + auto dependent_async(F&& func, I first, I last); + + /** + @brief names and runs the given function asynchronously + when the given range of dependents finish + + @tparam F callable type + @tparam I iterator type + + @param name assigned name to the task + @param func callable object + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + + @return a pair of a tf::AsyncTask handle and + a @std_future that holds the result of the execution + + The example below creates three named asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + Task @c C returns a pair of its tf::AsyncTask handle and a std::future<int> + that eventually will hold the result of the execution. + Assigned task names will appear in the observers of the executor. + + @code{.cpp} + std::array<tf::AsyncTask, 2> array { + executor.silent_dependent_async("A", [](){ printf("A\n"); }), + executor.silent_dependent_async("B", [](){ printf("B\n"); }) + }; + auto [C, fuC] = executor.dependent_async( + "C", + [](){ + printf("C runs after A and B\n"); + return 1; + }, + array.begin(), array.end() + ); + assert(fuC.get()==1); // C finishes, which in turns means both A and B finish + @endcode + + You can mixed the use of tf::AsyncTask handles + returned by Executor::dependent_async and Executor::silent_dependent_async + when specifying task dependencies. + + This member function is thread-safe. 
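+
+  As a further illustration, the dependent tasks can also be collected in a
+  std::vector<tf::AsyncTask> and passed as an iterator range:
+
+  @code{.cpp}
+  std::vector<tf::AsyncTask> deps;
+  deps.push_back(executor.silent_dependent_async("A", [](){ printf("A\n"); }));
+  deps.push_back(executor.silent_dependent_async("B", [](){ printf("B\n"); }));
+  auto [C, fuC] = executor.dependent_async(
+    "C", [](){ printf("C runs after A and B\n"); return 1; },
+    deps.begin(), deps.end()
+  );
+  assert(fuC.get()==1);
+  @endcode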
+ */ + template <typename F, typename I, + std::enable_if_t<!std::is_same_v<std::decay_t<I>, AsyncTask>, void>* = nullptr + > + auto dependent_async(const std::string& name, F&& func, I first, I last); + + private: + + const size_t _MAX_STEALS; + + std::mutex _wsq_mutex; + std::mutex _taskflows_mutex; + +#ifdef __cpp_lib_atomic_wait + std::atomic<size_t> _num_topologies {0}; + std::atomic_flag _all_spawned = ATOMIC_FLAG_INIT; +#else + std::condition_variable _topology_cv; + std::mutex _topology_mutex; + size_t _num_topologies {0}; +#endif + + std::unordered_map<std::thread::id, size_t> _wids; + std::vector<std::thread> _threads; + std::vector<Worker> _workers; + std::list<Taskflow> _taskflows; + + Notifier _notifier; + + TaskQueue<Node*> _wsq; + + std::atomic<bool> _done {0}; + + std::unordered_set<std::shared_ptr<ObserverInterface>> _observers; + + Worker* _this_worker(); + + Node* _tear_down_invoke(Worker&, Node*); + + bool _wait_for_task(Worker&, Node*&); + bool _invoke_module_task_internal(Worker&, Node*); + + void _observer_prologue(Worker&, Node*); + void _observer_epilogue(Worker&, Node*); + void _spawn(size_t); + void _exploit_task(Worker&, Node*&); + void _explore_task(Worker&, Node*&); + void _schedule(Worker&, Node*); + void _schedule(Node*); + void _schedule(Worker&, const SmallVector<Node*>&); + void _schedule(const SmallVector<Node*>&); + void _set_up_topology(Worker*, Topology*); + void _set_up_graph(Graph&, Node*, Topology*, int, SmallVector<Node*>&); + void _tear_down_topology(Worker&, Topology*); + void _tear_down_async(Node*); + void _tear_down_dependent_async(Worker&, Node*); + void _increment_topology(); + void _decrement_topology(); + void _invoke(Worker&, Node*); + void _invoke_static_task(Worker&, Node*); + void _invoke_dynamic_task(Worker&, Node*); + void _consume_graph(Worker&, Node*, Graph&); + void _detach_dynamic_task(Worker&, Node*, Graph&); + void _invoke_condition_task(Worker&, Node*, SmallVector<int>&); + void _invoke_multi_condition_task(Worker&, Node*, SmallVector<int>&); + void _invoke_module_task(Worker&, Node*, bool&); + void _invoke_async_task(Worker&, Node*); + void _invoke_dependent_async_task(Worker&, Node*); + void _process_async_dependent(Node*, tf::AsyncTask&, size_t&); + void _process_exception(Worker&, Node*); + void _schedule_async_task(Node*); + + template <typename P> + void _corun_until(Worker&, P&&); +}; + +#ifdef TF_DISABLE_EXCEPTION_HANDLING + +#define TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, code_block) \ + do { code_block; } while(0) +#else + +#define TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, code_block) \ + try { \ + code_block; \ + } catch(...) 
{ \ + _process_exception(worker, node); \ + } +#endif + + +// Constructor +inline Executor::Executor(size_t N) : + _MAX_STEALS {((N+1) << 1)}, + _threads {N}, + _workers {N}, + _notifier {N} { + + if(N == 0) { + TF_THROW("no cpu workers to execute taskflows"); + } + + _spawn(N); + + // instantite the default observer if requested + if(has_env(TF_ENABLE_PROFILER)) { + TFProfManager::get()._manage(make_observer<TFProfObserver>()); + } +} + +// Destructor +inline Executor::~Executor() { + + // wait for all topologies to complete + wait_for_all(); + + // shut down the scheduler + _done = true; + + _notifier.notify(true); + + for(auto& t : _threads){ + t.join(); + } +} + +// Function: num_workers +inline size_t Executor::num_workers() const noexcept { + return _workers.size(); +} + +// Function: num_topologies +inline size_t Executor::num_topologies() const { +#ifdef __cpp_lib_atomic_wait + return _num_topologies.load(std::memory_order_relaxed); +#else + return _num_topologies; +#endif +} + +// Function: num_taskflows +inline size_t Executor::num_taskflows() const { + return _taskflows.size(); +} + +// Function: _this_worker +inline Worker* Executor::_this_worker() { + auto itr = _wids.find(std::this_thread::get_id()); + return itr == _wids.end() ? nullptr : &_workers[itr->second]; +} + +// Function: this_worker_id +inline int Executor::this_worker_id() const { + auto i = _wids.find(std::this_thread::get_id()); + return i == _wids.end() ? -1 : static_cast<int>(_workers[i->second]._id); +} + +// Procedure: _spawn +inline void Executor::_spawn(size_t N) { + +#ifdef __cpp_lib_atomic_wait +#else + std::mutex mutex; + std::condition_variable cond; + size_t n=0; +#endif + + for(size_t id=0; id<N; ++id) { + + _workers[id]._id = id; + _workers[id]._vtm = id; + _workers[id]._executor = this; + _workers[id]._waiter = &_notifier._waiters[id]; + + _threads[id] = std::thread([&, &w=_workers[id]] () { + +#ifdef __cpp_lib_atomic_wait + // wait for the caller thread to initialize the ID mapping + _all_spawned.wait(false, std::memory_order_acquire); + w._thread = &_threads[w._id]; +#else + // update the ID mapping of this thread + w._thread = &_threads[w._id]; + { + std::scoped_lock lock(mutex); + _wids[std::this_thread::get_id()] = w._id; + if(n++; n == num_workers()) { + cond.notify_one(); + } + } +#endif + + Node* t = nullptr; + + while(1) { + + // execute the tasks. + _exploit_task(w, t); + + // wait for tasks + if(_wait_for_task(w, t) == false) { + break; + } + } + + }); + + // POSIX-like system can use the following to affine threads to cores + //cpu_set_t cpuset; + //CPU_ZERO(&cpuset); + //CPU_SET(id, &cpuset); + //pthread_setaffinity_np( + // _threads[id].native_handle(), sizeof(cpu_set_t), &cpuset + //); + +#ifdef __cpp_lib_atomic_wait + //_wids[_threads[id].get_id()] = id; + _wids.emplace(std::piecewise_construct, + std::forward_as_tuple(_threads[id].get_id()), std::forward_as_tuple(id) + ); +#endif + } + +#ifdef __cpp_lib_atomic_wait + _all_spawned.test_and_set(std::memory_order_release); + _all_spawned.notify_all(); +#else + std::unique_lock<std::mutex> lock(mutex); + cond.wait(lock, [&](){ return n==N; }); +#endif +} + +// Function: _corun_until +template <typename P> +void Executor::_corun_until(Worker& w, P&& stop_predicate) { + + std::uniform_int_distribution<size_t> rdvtm(0, _workers.size()-1); + + exploit: + + while(!stop_predicate()) { + + //exploit: + + if(auto t = w._wsq.pop(); t) { + _invoke(w, t); + } + else { + size_t num_steals = 0; + + explore: + + t = (w._id == w._vtm) ? 
_wsq.steal() : _workers[w._vtm]._wsq.steal(); + + if(t) { + _invoke(w, t); + goto exploit; + } + else if(!stop_predicate()) { + if(num_steals++ > _MAX_STEALS) { + std::this_thread::yield(); + } + w._vtm = rdvtm(w._rdgen); + goto explore; + } + else { + break; + } + } + } +} + +// Function: _explore_task +inline void Executor::_explore_task(Worker& w, Node*& t) { + + //assert(_workers[w].wsq.empty()); + //assert(!t); + + size_t num_steals = 0; + size_t num_yields = 0; + + std::uniform_int_distribution<size_t> rdvtm(0, _workers.size()-1); + + // Here, we write do-while to make the worker steal at once + // from the assigned victim. + do { + t = (w._id == w._vtm) ? _wsq.steal() : _workers[w._vtm]._wsq.steal(); + + if(t) { + break; + } + + if(num_steals++ > _MAX_STEALS) { + std::this_thread::yield(); + if(num_yields++ > 100) { + break; + } + } + + w._vtm = rdvtm(w._rdgen); + } while(!_done); + +} + +// Procedure: _exploit_task +inline void Executor::_exploit_task(Worker& w, Node*& t) { + while(t) { + _invoke(w, t); + t = w._wsq.pop(); + } +} + +// Function: _wait_for_task +inline bool Executor::_wait_for_task(Worker& worker, Node*& t) { + + explore_task: + + _explore_task(worker, t); + + // The last thief who successfully stole a task will wake up + // another thief worker to avoid starvation. + if(t) { + _notifier.notify(false); + return true; + } + + // ---- 2PC guard ---- + _notifier.prepare_wait(worker._waiter); + + if(!_wsq.empty()) { + _notifier.cancel_wait(worker._waiter); + worker._vtm = worker._id; + goto explore_task; + } + + if(_done) { + _notifier.cancel_wait(worker._waiter); + _notifier.notify(true); + return false; + } + + // We need to use index-based scanning to avoid data race + // with _spawn which may initialize a worker at the same time. + for(size_t vtm=0; vtm<_workers.size(); vtm++) { + if(!_workers[vtm]._wsq.empty()) { + _notifier.cancel_wait(worker._waiter); + worker._vtm = vtm; + goto explore_task; + } + } + + // Now I really need to relinguish my self to others + _notifier.commit_wait(worker._waiter); + + goto explore_task; +} + +// Function: make_observer +template<typename Observer, typename... ArgsT> +std::shared_ptr<Observer> Executor::make_observer(ArgsT&&... args) { + + static_assert( + std::is_base_of_v<ObserverInterface, Observer>, + "Observer must be derived from ObserverInterface" + ); + + // use a local variable to mimic the constructor + auto ptr = std::make_shared<Observer>(std::forward<ArgsT>(args)...); + + ptr->set_up(_workers.size()); + + _observers.emplace(std::static_pointer_cast<ObserverInterface>(ptr)); + + return ptr; +} + +// Procedure: remove_observer +template <typename Observer> +void Executor::remove_observer(std::shared_ptr<Observer> ptr) { + + static_assert( + std::is_base_of_v<ObserverInterface, Observer>, + "Observer must be derived from ObserverInterface" + ); + + _observers.erase(std::static_pointer_cast<ObserverInterface>(ptr)); +} + +// Function: num_observers +inline size_t Executor::num_observers() const noexcept { + return _observers.size(); +} + +// Procedure: _schedule +inline void Executor::_schedule(Worker& worker, Node* node) { + + // We need to fetch p before the release such that the read + // operation is synchronized properly with other thread to + // void data race. 
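+  // The READY bit is published with release semantics; the worker that picks
+  // this node up spins on the same state with acquire semantics in _invoke,
+  // so every write made to the node before scheduling is visible to it.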
+ auto p = node->_priority; + + node->_state.fetch_or(Node::READY, std::memory_order_release); + + // caller is a worker to this pool - starting at v3.5 we do not use + // any complicated notification mechanism as the experimental result + // has shown no significant advantage. + if(worker._executor == this) { + worker._wsq.push(node, p); + _notifier.notify(false); + return; + } + + { + std::lock_guard<std::mutex> lock(_wsq_mutex); + _wsq.push(node, p); + } + + _notifier.notify(false); +} + +// Procedure: _schedule +inline void Executor::_schedule(Node* node) { + + // We need to fetch p before the release such that the read + // operation is synchronized properly with other thread to + // void data race. + auto p = node->_priority; + + node->_state.fetch_or(Node::READY, std::memory_order_release); + + { + std::lock_guard<std::mutex> lock(_wsq_mutex); + _wsq.push(node, p); + } + + _notifier.notify(false); +} + +// Procedure: _schedule +inline void Executor::_schedule(Worker& worker, const SmallVector<Node*>& nodes) { + + // We need to cacth the node count to avoid accessing the nodes + // vector while the parent topology is removed! + const auto num_nodes = nodes.size(); + + if(num_nodes == 0) { + return; + } + + // caller is a worker to this pool - starting at v3.5 we do not use + // any complicated notification mechanism as the experimental result + // has shown no significant advantage. + if(worker._executor == this) { + for(size_t i=0; i<num_nodes; ++i) { + // We need to fetch p before the release such that the read + // operation is synchronized properly with other thread to + // void data race. + auto p = nodes[i]->_priority; + nodes[i]->_state.fetch_or(Node::READY, std::memory_order_release); + worker._wsq.push(nodes[i], p); + _notifier.notify(false); + } + return; + } + + { + std::lock_guard<std::mutex> lock(_wsq_mutex); + for(size_t k=0; k<num_nodes; ++k) { + auto p = nodes[k]->_priority; + nodes[k]->_state.fetch_or(Node::READY, std::memory_order_release); + _wsq.push(nodes[k], p); + } + } + + _notifier.notify_n(num_nodes); +} + +// Procedure: _schedule +inline void Executor::_schedule(const SmallVector<Node*>& nodes) { + + // parent topology may be removed! + const auto num_nodes = nodes.size(); + + if(num_nodes == 0) { + return; + } + + // We need to fetch p before the release such that the read + // operation is synchronized properly with other thread to + // void data race. 
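+  // No caller worker is known in this overload, so every node is pushed to
+  // the shared queue under _wsq_mutex and notify_n then wakes up to
+  // num_nodes sleeping workers.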
+ { + std::lock_guard<std::mutex> lock(_wsq_mutex); + for(size_t k=0; k<num_nodes; ++k) { + auto p = nodes[k]->_priority; + nodes[k]->_state.fetch_or(Node::READY, std::memory_order_release); + _wsq.push(nodes[k], p); + } + } + + _notifier.notify_n(num_nodes); +} + +// Procedure: _invoke +inline void Executor::_invoke(Worker& worker, Node* node) { + + // synchronize all outstanding memory operations caused by reordering + while(!(node->_state.load(std::memory_order_acquire) & Node::READY)); + + begin_invoke: + + SmallVector<int> conds; + + // no need to do other things if the topology is cancelled + if(node->_is_cancelled()) { + if(node = _tear_down_invoke(worker, node); node) { + goto invoke_successors; + } + return; + } + + // if acquiring semaphore(s) exists, acquire them first + if(node->_semaphores && !node->_semaphores->to_acquire.empty()) { + SmallVector<Node*> nodes; + if(!node->_acquire_all(nodes)) { + _schedule(worker, nodes); + return; + } + node->_state.fetch_or(Node::ACQUIRED, std::memory_order_release); + } + + // condition task + //int cond = -1; + + // switch is faster than nested if-else due to jump table + switch(node->_handle.index()) { + // static task + case Node::STATIC:{ + _invoke_static_task(worker, node); + } + break; + + // dynamic task + case Node::DYNAMIC: { + _invoke_dynamic_task(worker, node); + } + break; + + // condition task + case Node::CONDITION: { + _invoke_condition_task(worker, node, conds); + } + break; + + // multi-condition task + case Node::MULTI_CONDITION: { + _invoke_multi_condition_task(worker, node, conds); + } + break; + + // module task + case Node::MODULE: { + bool spawned; + _invoke_module_task(worker, node, spawned); + if(spawned) { + return; + } + } + break; + + // async task + case Node::ASYNC: { + _invoke_async_task(worker, node); + _tear_down_async(node); + return ; + } + break; + + // dependent async task + case Node::DEPENDENT_ASYNC: { + _invoke_dependent_async_task(worker, node); + _tear_down_dependent_async(worker, node); + if(worker._cache) { + node = worker._cache; + goto begin_invoke; + } + return; + } + break; + + // monostate (placeholder) + default: + break; + } + + invoke_successors: + + // if releasing semaphores exist, release them + if(node->_semaphores && !node->_semaphores->to_release.empty()) { + _schedule(worker, node->_release_all()); + } + + // Reset the join counter to support the cyclic control flow. + // + We must do this before scheduling the successors to avoid race + // condition on _dependents. + // + We must use fetch_add instead of direct assigning + // because the user-space call on "invoke" may explicitly schedule + // this task again (e.g., pipeline) which can access the join_counter. + if((node->_state.load(std::memory_order_relaxed) & Node::CONDITIONED)) { + node->_join_counter.fetch_add(node->num_strong_dependents(), std::memory_order_relaxed); + } + else { + node->_join_counter.fetch_add(node->num_dependents(), std::memory_order_relaxed); + } + + // acquire the parent flow counter + auto& j = (node->_parent) ? 
node->_parent->_join_counter : + node->_topology->_join_counter; + + // Here, we want to cache the latest successor with the highest priority + worker._cache = nullptr; + auto max_p = static_cast<unsigned>(TaskPriority::MAX); + + // Invoke the task based on the corresponding type + switch(node->_handle.index()) { + + // condition and multi-condition tasks + case Node::CONDITION: + case Node::MULTI_CONDITION: { + for(auto cond : conds) { + if(cond >= 0 && static_cast<size_t>(cond) < node->_successors.size()) { + auto s = node->_successors[cond]; + // zeroing the join counter for invariant + s->_join_counter.store(0, std::memory_order_relaxed); + j.fetch_add(1, std::memory_order_relaxed); + if(s->_priority <= max_p) { + if(worker._cache) { + _schedule(worker, worker._cache); + } + worker._cache = s; + max_p = s->_priority; + } + else { + _schedule(worker, s); + } + } + } + } + break; + + // non-condition task + default: { + for(size_t i=0; i<node->_successors.size(); ++i) { + //if(auto s = node->_successors[i]; --(s->_join_counter) == 0) { + if(auto s = node->_successors[i]; + s->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1) { + j.fetch_add(1, std::memory_order_relaxed); + if(s->_priority <= max_p) { + if(worker._cache) { + _schedule(worker, worker._cache); + } + worker._cache = s; + max_p = s->_priority; + } + else { + _schedule(worker, s); + } + } + } + } + break; + } + + // tear_down the invoke + if(node = _tear_down_invoke(worker, node); node) { + goto invoke_successors; + } + + // perform tail recursion elimination for the right-most child to reduce + // the number of expensive pop/push operations through the task queue + if(worker._cache) { + node = worker._cache; + //node->_state.fetch_or(Node::READY, std::memory_order_release); + goto begin_invoke; + } +} + +// Proecdure: _tear_down_invoke +inline Node* Executor::_tear_down_invoke(Worker& worker, Node* node) { + // we must check parent first before substracting the join counter, + // or it can introduce data race + if(auto parent = node->_parent; parent == nullptr) { + if(node->_topology->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1) { + _tear_down_topology(worker, node->_topology); + } + } + // module task + else { + auto id = parent->_handle.index(); + if(parent->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1) { + if(id == Node::MODULE) { + return parent; + } + } + } + return nullptr; +} + +// Procedure: _observer_prologue +inline void Executor::_observer_prologue(Worker& worker, Node* node) { + for(auto& observer : _observers) { + observer->on_entry(WorkerView(worker), TaskView(*node)); + } +} + +// Procedure: _observer_epilogue +inline void Executor::_observer_epilogue(Worker& worker, Node* node) { + for(auto& observer : _observers) { + observer->on_exit(WorkerView(worker), TaskView(*node)); + } +} + +// Procedure: _process_exception +inline void Executor::_process_exception(Worker&, Node* node) { + + constexpr static auto flag = Topology::EXCEPTION | Topology::CANCELLED; + + // multiple tasks may throw, so we only take the first thrown exception + if(auto tpg = node->_topology; tpg && + ((tpg->_state.fetch_or(flag, std::memory_order_relaxed) & Topology::EXCEPTION) == 0) + ) { + tpg->_exception = std::current_exception(); + } + // TODO: skip the exception that is not associated with any taskflows +} + +// Procedure: _invoke_static_task +inline void Executor::_invoke_static_task(Worker& worker, Node* node) { + _observer_prologue(worker, node); + TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, { 
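+    // Node::Static stores its callable in a variant: index 0 is a plain
+    // callable and index 1 is a callable taking tf::Runtime&, hence the
+    // dispatch on work.index() below.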
+ auto& work = std::get_if<Node::Static>(&node->_handle)->work; + switch(work.index()) { + case 0: + std::get_if<0>(&work)->operator()(); + break; + + case 1: + Runtime rt(*this, worker, node); + std::get_if<1>(&work)->operator()(rt); + break; + } + }); + _observer_epilogue(worker, node); +} + +// Procedure: _invoke_dynamic_task +inline void Executor::_invoke_dynamic_task(Worker& w, Node* node) { + + _observer_prologue(w, node); + + auto handle = std::get_if<Node::Dynamic>(&node->_handle); + + handle->subgraph._clear(); + + Subflow sf(*this, w, node, handle->subgraph); + + TF_EXECUTOR_EXCEPTION_HANDLER(w, node, { + handle->work(sf); + }); + + if(sf._joinable) { + _consume_graph(w, node, handle->subgraph); + } + + _observer_epilogue(w, node); +} + +// Procedure: _detach_dynamic_task +inline void Executor::_detach_dynamic_task(Worker& w, Node* p, Graph& g) { + + // graph is empty and has no async tasks + if(g.empty() && p->_join_counter.load(std::memory_order_acquire) == 0) { + return; + } + + SmallVector<Node*> src; + _set_up_graph(g, nullptr, p->_topology, Node::DETACHED, src); + + { + std::lock_guard<std::mutex> lock(p->_topology->_taskflow._mutex); + p->_topology->_taskflow._graph._merge(std::move(g)); + } + + p->_topology->_join_counter.fetch_add(src.size(), std::memory_order_relaxed); + _schedule(w, src); +} + +// Procedure: _consume_graph +inline void Executor::_consume_graph(Worker& w, Node* p, Graph& g) { + + // graph is empty and has no async tasks (subflow) + if(g.empty() && p->_join_counter.load(std::memory_order_acquire) == 0) { + return; + } + + SmallVector<Node*> src; + + _set_up_graph(g, p, p->_topology, 0, src); + p->_join_counter.fetch_add(src.size(), std::memory_order_relaxed); + + _schedule(w, src); + + _corun_until(w, [p] () -> bool { + return p->_join_counter.load(std::memory_order_acquire) == 0; } + ); +} + +// Procedure: _invoke_condition_task +inline void Executor::_invoke_condition_task( + Worker& worker, Node* node, SmallVector<int>& conds +) { + _observer_prologue(worker, node); + TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, { + auto& work = std::get_if<Node::Condition>(&node->_handle)->work; + switch(work.index()) { + case 0: + conds = { std::get_if<0>(&work)->operator()() }; + break; + + case 1: + Runtime rt(*this, worker, node); + conds = { std::get_if<1>(&work)->operator()(rt) }; + break; + } + }); + _observer_epilogue(worker, node); +} + +// Procedure: _invoke_multi_condition_task +inline void Executor::_invoke_multi_condition_task( + Worker& worker, Node* node, SmallVector<int>& conds +) { + _observer_prologue(worker, node); + TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, { + auto& work = std::get_if<Node::MultiCondition>(&node->_handle)->work; + switch(work.index()) { + case 0: + conds = std::get_if<0>(&work)->operator()(); + break; + + case 1: + Runtime rt(*this, worker, node); + conds = std::get_if<1>(&work)->operator()(rt); + break; + } + }); + _observer_epilogue(worker, node); +} + +// Procedure: _invoke_module_task +inline void Executor::_invoke_module_task(Worker& w, Node* node, bool& spawned) { + _observer_prologue(w, node); + spawned = _invoke_module_task_internal(w, node); + _observer_epilogue(w, node); +} + +// Function: _invoke_module_task_internal +inline bool Executor::_invoke_module_task_internal(Worker& w, Node* p) { + + // acquire the underlying graph + auto& g = std::get_if<Node::Module>(&p->_handle)->graph; + + // no need to do anything if the graph is empty + if(g.empty()) { + return false; + } + + SmallVector<Node*> src; + _set_up_graph(g, 
p, p->_topology, 0, src); + p->_join_counter.fetch_add(src.size(), std::memory_order_relaxed); + + _schedule(w, src); + return true; +} + +// Procedure: _invoke_async_task +inline void Executor::_invoke_async_task(Worker& worker, Node* node) { + _observer_prologue(worker, node); + TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, { + auto& work = std::get_if<Node::Async>(&node->_handle)->work; + switch(work.index()) { + case 0: + std::get_if<0>(&work)->operator()(); + break; + + case 1: + Runtime rt(*this, worker, node); + std::get_if<1>(&work)->operator()(rt); + break; + } + }); + _observer_epilogue(worker, node); +} + +// Procedure: _invoke_dependent_async_task +inline void Executor::_invoke_dependent_async_task(Worker& worker, Node* node) { + _observer_prologue(worker, node); + TF_EXECUTOR_EXCEPTION_HANDLER(worker, node, { + auto& work = std::get_if<Node::DependentAsync>(&node->_handle)->work; + switch(work.index()) { + case 0: + std::get_if<0>(&work)->operator()(); + break; + + case 1: + Runtime rt(*this, worker, node); + std::get_if<1>(&work)->operator()(rt); + break; + } + }); + _observer_epilogue(worker, node); +} + +// Function: run +inline tf::Future<void> Executor::run(Taskflow& f) { + return run_n(f, 1, [](){}); +} + +// Function: run +inline tf::Future<void> Executor::run(Taskflow&& f) { + return run_n(std::move(f), 1, [](){}); +} + +// Function: run +template <typename C> +tf::Future<void> Executor::run(Taskflow& f, C&& c) { + return run_n(f, 1, std::forward<C>(c)); +} + +// Function: run +template <typename C> +tf::Future<void> Executor::run(Taskflow&& f, C&& c) { + return run_n(std::move(f), 1, std::forward<C>(c)); +} + +// Function: run_n +inline tf::Future<void> Executor::run_n(Taskflow& f, size_t repeat) { + return run_n(f, repeat, [](){}); +} + +// Function: run_n +inline tf::Future<void> Executor::run_n(Taskflow&& f, size_t repeat) { + return run_n(std::move(f), repeat, [](){}); +} + +// Function: run_n +template <typename C> +tf::Future<void> Executor::run_n(Taskflow& f, size_t repeat, C&& c) { + return run_until( + f, [repeat]() mutable { return repeat-- == 0; }, std::forward<C>(c) + ); +} + +// Function: run_n +template <typename C> +tf::Future<void> Executor::run_n(Taskflow&& f, size_t repeat, C&& c) { + return run_until( + std::move(f), [repeat]() mutable { return repeat-- == 0; }, std::forward<C>(c) + ); +} + +// Function: run_until +template<typename P> +tf::Future<void> Executor::run_until(Taskflow& f, P&& pred) { + return run_until(f, std::forward<P>(pred), [](){}); +} + +// Function: run_until +template<typename P> +tf::Future<void> Executor::run_until(Taskflow&& f, P&& pred) { + return run_until(std::move(f), std::forward<P>(pred), [](){}); +} + +// Function: run_until +template <typename P, typename C> +tf::Future<void> Executor::run_until(Taskflow& f, P&& p, C&& c) { + + _increment_topology(); + + // Need to check the empty under the lock since dynamic task may + // define detached blocks that modify the taskflow at the same time + bool empty; + { + std::lock_guard<std::mutex> lock(f._mutex); + empty = f.empty(); + } + + // No need to create a real topology but returns an dummy future + if(empty || p()) { + c(); + std::promise<void> promise; + promise.set_value(); + _decrement_topology(); + return tf::Future<void>(promise.get_future()); + } + + // create a topology for this run + auto t = std::make_shared<Topology>(f, std::forward<P>(p), std::forward<C>(c)); + + // need to create future before the topology got torn down quickly + tf::Future<void> 
future(t->_promise.get_future(), t); + + // modifying topology needs to be protected under the lock + { + std::lock_guard<std::mutex> lock(f._mutex); + f._topologies.push(t); + if(f._topologies.size() == 1) { + _set_up_topology(_this_worker(), t.get()); + } + } + + return future; +} + +// Function: run_until +template <typename P, typename C> +tf::Future<void> Executor::run_until(Taskflow&& f, P&& pred, C&& c) { + + std::list<Taskflow>::iterator itr; + + { + std::scoped_lock<std::mutex> lock(_taskflows_mutex); + itr = _taskflows.emplace(_taskflows.end(), std::move(f)); + itr->_satellite = itr; + } + + return run_until(*itr, std::forward<P>(pred), std::forward<C>(c)); +} + +// Function: corun +template <typename T> +void Executor::corun(T& target) { + + auto w = _this_worker(); + + if(w == nullptr) { + TF_THROW("corun must be called by a worker of the executor"); + } + + Node parent; // dummy parent + _consume_graph(*w, &parent, target.graph()); +} + +// Function: corun_until +template <typename P> +void Executor::corun_until(P&& predicate) { + + auto w = _this_worker(); + + if(w == nullptr) { + TF_THROW("corun_until must be called by a worker of the executor"); + } + + _corun_until(*w, std::forward<P>(predicate)); +} + +// Procedure: _increment_topology +inline void Executor::_increment_topology() { +#ifdef __cpp_lib_atomic_wait + _num_topologies.fetch_add(1, std::memory_order_relaxed); +#else + std::lock_guard<std::mutex> lock(_topology_mutex); + ++_num_topologies; +#endif +} + +// Procedure: _decrement_topology +inline void Executor::_decrement_topology() { +#ifdef __cpp_lib_atomic_wait + if(_num_topologies.fetch_sub(1, std::memory_order_acq_rel) == 1) { + _num_topologies.notify_all(); + } +#else + std::lock_guard<std::mutex> lock(_topology_mutex); + if(--_num_topologies == 0) { + _topology_cv.notify_all(); + } +#endif +} + +// Procedure: wait_for_all +inline void Executor::wait_for_all() { +#ifdef __cpp_lib_atomic_wait + size_t n = _num_topologies.load(std::memory_order_acquire); + while(n != 0) { + _num_topologies.wait(n, std::memory_order_acquire); + n = _num_topologies.load(std::memory_order_acquire); + } +#else + std::unique_lock<std::mutex> lock(_topology_mutex); + _topology_cv.wait(lock, [&](){ return _num_topologies == 0; }); +#endif +} + +// Function: _set_up_topology +inline void Executor::_set_up_topology(Worker* worker, Topology* tpg) { + + // ---- under taskflow lock ---- + + tpg->_sources.clear(); + tpg->_taskflow._graph._clear_detached(); + _set_up_graph(tpg->_taskflow._graph, nullptr, tpg, 0, tpg->_sources); + tpg->_join_counter.store(tpg->_sources.size(), std::memory_order_relaxed); + + if(worker) { + _schedule(*worker, tpg->_sources); + } + else { + _schedule(tpg->_sources); + } +} + +// Function: _set_up_graph +inline void Executor::_set_up_graph( + Graph& g, Node* parent, Topology* tpg, int state, SmallVector<Node*>& src +) { + for(auto node : g._nodes) { + node->_topology = tpg; + node->_parent = parent; + node->_state.store(state, std::memory_order_relaxed); + if(node->num_dependents() == 0) { + src.push_back(node); + } + node->_set_up_join_counter(); + } +} + +// Function: _tear_down_topology +inline void Executor::_tear_down_topology(Worker& worker, Topology* tpg) { + + auto &f = tpg->_taskflow; + + //assert(&tpg == &(f._topologies.front())); + + // case 1: we still need to run the topology again + if(!tpg->_exception && + !(tpg->_state.load(std::memory_order_relaxed) & Topology::CANCELLED) && + !tpg->_pred() + ) { + //assert(tpg->_join_counter == 0); + 
std::lock_guard<std::mutex> lock(f._mutex); + tpg->_join_counter.store(tpg->_sources.size(), std::memory_order_relaxed); + _schedule(worker, tpg->_sources); + } + // case 2: the final run of this topology + else { + + // TODO: if the topology is cancelled, need to release all semaphores + if(tpg->_call != nullptr) { + tpg->_call(); + } + + // If there is another run (interleave between lock) + if(std::unique_lock<std::mutex> lock(f._mutex); f._topologies.size()>1) { + //assert(tpg->_join_counter == 0); + + // Set the promise + tpg->_promise.set_value(); + f._topologies.pop(); + tpg = f._topologies.front().get(); + + // decrement the topology but since this is not the last we don't notify + _decrement_topology(); + + // set up topology needs to be under the lock or it can + // introduce memory order error with pop + _set_up_topology(&worker, tpg); + } + else { + //assert(f._topologies.size() == 1); + + auto fetched_tpg {std::move(f._topologies.front())}; + f._topologies.pop(); + auto satellite {f._satellite}; + + lock.unlock(); + + // Soon after we carry out the promise, there is no longer any guarantee + // for the lifetime of the associated taskflow. + fetched_tpg->_carry_out_promise(); + + _decrement_topology(); + + // remove the taskflow if it is managed by the executor + // TODO: in the future, we may need to synchronize on wait + // (which means the following code should the moved before set_value) + if(satellite) { + std::scoped_lock<std::mutex> satellite_lock(_taskflows_mutex); + _taskflows.erase(*satellite); + } + } + } +} + +// ############################################################################ +// Forward Declaration: Subflow +// ############################################################################ + +inline void Subflow::join() { + + // assert(this_worker().worker == &_worker); + + if(!_joinable) { + TF_THROW("subflow not joinable"); + } + + // only the parent worker can join the subflow + _executor._consume_graph(_worker, _parent, _graph); + _joinable = false; +} + +inline void Subflow::detach() { + + // assert(this_worker().worker == &_worker); + + if(!_joinable) { + TF_THROW("subflow already joined or detached"); + } + + // only the parent worker can detach the subflow + _executor._detach_dynamic_task(_worker, _parent, _graph); + _joinable = false; +} + +// ############################################################################ +// Forward Declaration: Runtime +// ############################################################################ + +// Procedure: schedule +inline void Runtime::schedule(Task task) { + + auto node = task._node; + // need to keep the invariant: when scheduling a task, the task must have + // zero dependency (join counter is 0) + // or we can encounter bug when inserting a nested flow (e.g., module task) + node->_join_counter.store(0, std::memory_order_relaxed); + + auto& j = node->_parent ? 
node->_parent->_join_counter : + node->_topology->_join_counter; + j.fetch_add(1, std::memory_order_relaxed); + _executor._schedule(_worker, node); +} + +// Procedure: corun +template <typename T> +void Runtime::corun(T&& target) { + + // dynamic task (subflow) + if constexpr(is_dynamic_task_v<T>) { + Graph graph; + Subflow sf(_executor, _worker, _parent, graph); + target(sf); + if(sf._joinable) { + _executor._consume_graph(_worker, _parent, graph); + } + } + // a composable graph object with `tf::Graph& T::graph()` defined + else { + _executor._consume_graph(_worker, _parent, target.graph()); + } +} + +// Procedure: corun_until +template <typename P> +void Runtime::corun_until(P&& predicate) { + _executor._corun_until(_worker, std::forward<P>(predicate)); +} + +// Function: _silent_async +template <typename F> +void Runtime::_silent_async(Worker& w, const std::string& name, F&& f) { + + _parent->_join_counter.fetch_add(1, std::memory_order_relaxed); + + auto node = node_pool.animate( + name, 0, _parent->_topology, _parent, 0, + std::in_place_type_t<Node::Async>{}, std::forward<F>(f) + ); + + _executor._schedule(w, node); +} + +// Function: silent_async +template <typename F> +void Runtime::silent_async(F&& f) { + _silent_async(*_executor._this_worker(), "", std::forward<F>(f)); +} + +// Function: silent_async +template <typename F> +void Runtime::silent_async(const std::string& name, F&& f) { + _silent_async(*_executor._this_worker(), name, std::forward<F>(f)); +} + +// Function: silent_async_unchecked +template <typename F> +void Runtime::silent_async_unchecked(const std::string& name, F&& f) { + _silent_async(_worker, name, std::forward<F>(f)); +} + +// Function: _async +template <typename F> +auto Runtime::_async(Worker& w, const std::string& name, F&& f) { + + _parent->_join_counter.fetch_add(1, std::memory_order_relaxed); + + using R = std::invoke_result_t<std::decay_t<F>>; + + std::packaged_task<R()> p(std::forward<F>(f)); + auto fu{p.get_future()}; + + auto node = node_pool.animate( + name, 0, _parent->_topology, _parent, 0, std::in_place_type_t<Node::Async>{}, + [p=make_moc(std::move(p))] () mutable { p.object(); } + ); + + _executor._schedule(w, node); + + return fu; +} + +// Function: async +template <typename F> +auto Runtime::async(F&& f) { + return _async(*_executor._this_worker(), "", std::forward<F>(f)); +} + +// Function: async +template <typename F> +auto Runtime::async(const std::string& name, F&& f) { + return _async(*_executor._this_worker(), name, std::forward<F>(f)); +} + +// Function: corun_all +inline void Runtime::corun_all() { + corun_until([this] () -> bool { + return _parent->_join_counter.load(std::memory_order_acquire) == 0; + }); +} + +// Destructor +inline Runtime::~Runtime() { + if(_parent->_join_counter.load(std::memory_order_acquire)) { + corun_all(); + } +} + +} // end of namespace tf ----------------------------------------------------- + + + + + + diff --git a/myxpcs/include/taskflow_/core/flow_builder.hpp b/myxpcs/include/taskflow_/core/flow_builder.hpp new file mode 100644 index 0000000..f4259dc --- /dev/null +++ b/myxpcs/include/taskflow_/core/flow_builder.hpp @@ -0,0 +1,1399 @@ +#pragma once + +#include "task.hpp" +#include "../algorithm/partitioner.hpp" + +/** +@file flow_builder.hpp +@brief flow builder include file +*/ + +namespace tf { + +/** +@class FlowBuilder + +@brief class to build a task dependency graph + +The class provides essential methods to construct a task dependency graph +from which tf::Taskflow and tf::Subflow are derived. 
+ +*/ +class FlowBuilder { + + friend class Executor; + + public: + + /** + @brief constructs a flow builder with a graph + */ + FlowBuilder(Graph& graph); + + /** + @brief creates a static task + + @tparam C callable type constructible from std::function<void()> + + @param callable callable to construct a static task + + @return a tf::Task handle + + The following example creates a static task. + + @code{.cpp} + tf::Task static_task = taskflow.emplace([](){}); + @endcode + + Please refer to @ref StaticTasking for details. + */ + template <typename C, + std::enable_if_t<is_static_task_v<C>, void>* = nullptr + > + Task emplace(C&& callable); + + /** + @brief creates a dynamic task + + @tparam C callable type constructible from std::function<void(tf::Subflow&)> + + @param callable callable to construct a dynamic task + + @return a tf::Task handle + + The following example creates a dynamic task (tf::Subflow) + that spawns two static tasks. + + @code{.cpp} + tf::Task dynamic_task = taskflow.emplace([](tf::Subflow& sf){ + tf::Task static_task1 = sf.emplace([](){}); + tf::Task static_task2 = sf.emplace([](){}); + }); + @endcode + + Please refer to @ref DynamicTasking for details. + */ + template <typename C, + std::enable_if_t<is_dynamic_task_v<C>, void>* = nullptr + > + Task emplace(C&& callable); + + /** + @brief creates a condition task + + @tparam C callable type constructible from std::function<int()> + + @param callable callable to construct a condition task + + @return a tf::Task handle + + The following example creates an if-else block using one condition task + and three static tasks. + + @code{.cpp} + tf::Taskflow taskflow; + + auto [init, cond, yes, no] = taskflow.emplace( + [] () { }, + [] () { return 0; }, + [] () { std::cout << "yes\n"; }, + [] () { std::cout << "no\n"; } + ); + + // executes yes if cond returns 0, or no if cond returns 1 + cond.precede(yes, no); + cond.succeed(init); + @endcode + + Please refer to @ref ConditionalTasking for details. + */ + template <typename C, + std::enable_if_t<is_condition_task_v<C>, void>* = nullptr + > + Task emplace(C&& callable); + + /** + @brief creates a multi-condition task + + @tparam C callable type constructible from + std::function<tf::SmallVector<int>()> + + @param callable callable to construct a multi-condition task + + @return a tf::Task handle + + The following example creates a multi-condition task that selectively + jumps to two successor tasks. + + @code{.cpp} + tf::Taskflow taskflow; + + auto [init, cond, branch1, branch2, branch3] = taskflow.emplace( + [] () { }, + [] () { return tf::SmallVector{0, 2}; }, + [] () { std::cout << "branch1\n"; }, + [] () { std::cout << "branch2\n"; }, + [] () { std::cout << "branch3\n"; } + ); + + // executes branch1 and branch3 when cond returns 0 and 2 + cond.precede(branch1, branch2, branch3); + cond.succeed(init); + @endcode + + Please refer to @ref ConditionalTasking for details. + */ + template <typename C, + std::enable_if_t<is_multi_condition_task_v<C>, void>* = nullptr + > + Task emplace(C&& callable); + + /** + @brief creates multiple tasks from a list of callable objects + + @tparam C callable types + + @param callables one or multiple callable objects constructible from each task category + + @return a tf::Task handle + + The method returns a tuple of tasks each corresponding to the given + callable target. You can use structured binding to get the return tasks + one by one. 
+ The following example creates four static tasks and assign them to + @c A, @c B, @c C, and @c D using structured binding. + + @code{.cpp} + auto [A, B, C, D] = taskflow.emplace( + [] () { std::cout << "A"; }, + [] () { std::cout << "B"; }, + [] () { std::cout << "C"; }, + [] () { std::cout << "D"; } + ); + @endcode + */ + template <typename... C, std::enable_if_t<(sizeof...(C)>1), void>* = nullptr> + auto emplace(C&&... callables); + + /** + @brief removes a task from a taskflow + + @param task task to remove + + Removes a task and its input and output dependencies from the graph + associated with the flow builder. + If the task does not belong to the graph, nothing will happen. + + @code{.cpp} + tf::Task A = taskflow.emplace([](){ std::cout << "A"; }); + tf::Task B = taskflow.emplace([](){ std::cout << "B"; }); + tf::Task C = taskflow.emplace([](){ std::cout << "C"; }); + tf::Task D = taskflow.emplace([](){ std::cout << "D"; }); + A.precede(B, C, D); + + // erase A from the taskflow and its dependencies to B, C, and D + taskflow.erase(A); + @endcode + */ + void erase(Task task); + + /** + @brief creates a module task for the target object + + @tparam T target object type + @param object a custom object that defines the method @c T::graph() + + @return a tf::Task handle + + The example below demonstrates a taskflow composition using + the @c composed_of method. + + @code{.cpp} + tf::Taskflow t1, t2; + t1.emplace([](){ std::cout << "t1"; }); + + // t2 is partially composed of t1 + tf::Task comp = t2.composed_of(t1); + tf::Task init = t2.emplace([](){ std::cout << "t2"; }); + init.precede(comp); + @endcode + + The taskflow object @c t2 is composed of another taskflow object @c t1, + preceded by another static task @c init. + When taskflow @c t2 is submitted to an executor, + @c init will run first and then @c comp which spwans its definition + in taskflow @c t1. + + The target @c object being composed must define the method + <tt>T::graph()</tt> that returns a reference to a graph object of + type tf::Graph such that it can interact with the executor. + For example: + + @code{.cpp} + // custom struct + struct MyObj { + tf::Graph graph; + MyObj() { + tf::FlowBuilder builder(graph); + tf::Task task = builder.emplace([](){ + std::cout << "a task\n"; // static task + }); + } + Graph& graph() { return graph; } + }; + + MyObj obj; + tf::Task comp = taskflow.composed_of(obj); + @endcode + + Please refer to @ref ComposableTasking for details. + */ + template <typename T> + Task composed_of(T& object); + + /** + @brief creates a placeholder task + + @return a tf::Task handle + + A placeholder task maps to a node in the taskflow graph, but + it does not have any callable work assigned yet. + A placeholder task is different from an empty task handle that + does not point to any node in a graph. + + @code{.cpp} + // create a placeholder task with no callable target assigned + tf::Task placeholder = taskflow.placeholder(); + assert(placeholder.empty() == false && placeholder.has_work() == false); + + // create an empty task handle + tf::Task task; + assert(task.empty() == true); + + // assign the task handle to the placeholder task + task = placeholder; + assert(task.empty() == false && task.has_work() == false); + @endcode + */ + Task placeholder(); + + /** + @brief adds adjacent dependency links to a linear list of tasks + + @param tasks a vector of tasks + + This member function creates linear dependencies over a vector of tasks. 
+ + @code{.cpp} + tf::Task A = taskflow.emplace([](){ std::cout << "A"; }); + tf::Task B = taskflow.emplace([](){ std::cout << "B"; }); + tf::Task C = taskflow.emplace([](){ std::cout << "C"; }); + tf::Task D = taskflow.emplace([](){ std::cout << "D"; }); + std::vector<tf::Task> tasks {A, B, C, D} + taskflow.linearize(tasks); // A->B->C->D + @endcode + + */ + void linearize(std::vector<Task>& tasks); + + /** + @brief adds adjacent dependency links to a linear list of tasks + + @param tasks an initializer list of tasks + + This member function creates linear dependencies over a list of tasks. + + @code{.cpp} + tf::Task A = taskflow.emplace([](){ std::cout << "A"; }); + tf::Task B = taskflow.emplace([](){ std::cout << "B"; }); + tf::Task C = taskflow.emplace([](){ std::cout << "C"; }); + tf::Task D = taskflow.emplace([](){ std::cout << "D"; }); + taskflow.linearize({A, B, C, D}); // A->B->C->D + @endcode + */ + void linearize(std::initializer_list<Task> tasks); + + // ------------------------------------------------------------------------ + // parallel iterations + // ------------------------------------------------------------------------ + + /** + @brief constructs an STL-styled parallel-for task + + @tparam B beginning iterator type + @tparam E ending iterator type + @tparam C callable type + @tparam P partitioner type (default tf::GuidedPartitioner) + + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + @param callable callable object to apply to the dereferenced iterator + @param part partitioning algorithm to schedule parallel iterations + + @return a tf::Task handle + + The task spawns asynchronous tasks that applies the callable object to each object + obtained by dereferencing every iterator in the range <tt>[first, last)</tt>. + This method is equivalent to the parallel execution of the following loop: + + @code{.cpp} + for(auto itr=first; itr!=last; itr++) { + callable(*itr); + } + @endcode + + Iterators are templated to enable stateful range using std::reference_wrapper. + The callable needs to take a single argument of + the dereferenced iterator type. + + Please refer to @ref ParallelIterations for details. + */ + template <typename B, typename E, typename C, typename P = GuidedPartitioner> + Task for_each(B first, E last, C callable, P&& part = P()); + + /** + @brief constructs an STL-styled index-based parallel-for task + + @tparam B beginning index type (must be integral) + @tparam E ending index type (must be integral) + @tparam S step type (must be integral) + @tparam C callable type + @tparam P partitioner type (default tf::GuidedPartitioner) + + @param first index of the beginning (inclusive) + @param last index of the end (exclusive) + @param step step size + @param callable callable object to apply to each valid index + @param part partitioning algorithm to schedule parallel iterations + + @return a tf::Task handle + + The task spawns asynchronous tasks that applies the callable object to each index + in the range <tt>[first, last)</tt> with the step size. + This method is equivalent to the parallel execution of the following loop: + + @code{.cpp} + // case 1: step size is positive + for(auto i=first; i<last; i+=step) { + callable(i); + } + + // case 2: step size is negative + for(auto i=first, i>last; i+=step) { + callable(i); + } + @endcode + + Iterators are templated to enable stateful range using std::reference_wrapper. + The callable needs to take a single argument of the integral index type. 
+ + Please refer to @ref ParallelIterations for details. + */ + template <typename B, typename E, typename S, typename C, typename P = GuidedPartitioner> + Task for_each_index( + B first, E last, S step, C callable, P&& part = P() + ); + + // ------------------------------------------------------------------------ + // transform + // ------------------------------------------------------------------------ + + /** + @brief constructs a parallel-transform task + + @tparam B beginning input iterator type + @tparam E ending input iterator type + @tparam O output iterator type + @tparam C callable type + @tparam P partitioner type (default tf::GuidedPartitioner) + + @param first1 iterator to the beginning of the first range + @param last1 iterator to the end of the first range + @param d_first iterator to the beginning of the output range + @param c an unary callable to apply to dereferenced input elements + @param part partitioning algorithm to schedule parallel iterations + + @return a tf::Task handle + + The task spawns asynchronous tasks that applies the callable object to an + input range and stores the result in another output range. + This method is equivalent to the parallel execution of the following loop: + + @code{.cpp} + while (first1 != last1) { + *d_first++ = c(*first1++); + } + @endcode + + Iterators are templated to enable stateful range using std::reference_wrapper. + The callable needs to take a single argument of the dereferenced + iterator type. + + Please refer to @ref ParallelTransforms for details. + */ + template < + typename B, typename E, typename O, typename C, typename P = GuidedPartitioner, + std::enable_if_t<is_partitioner_v<std::decay_t<P>>, void>* = nullptr + > + Task transform(B first1, E last1, O d_first, C c, P&& part = P()); + + /** + @brief constructs a parallel-transform task + + @tparam B1 beginning input iterator type for the first input range + @tparam E1 ending input iterator type for the first input range + @tparam B2 beginning input iterator type for the first second range + @tparam O output iterator type + @tparam C callable type + @tparam P partitioner type (default tf::GuidedPartitioner) + + @param first1 iterator to the beginning of the first input range + @param last1 iterator to the end of the first input range + @param first2 iterator to the beginning of the second input range + @param d_first iterator to the beginning of the output range + @param c a binary operator to apply to dereferenced input elements + @param part partitioning algorithm to schedule parallel iterations + + @return a tf::Task handle + + The task spawns asynchronous tasks that applies the callable object to two + input ranges and stores the result in another output range. + This method is equivalent to the parallel execution of the following loop: + + @code{.cpp} + while (first1 != last1) { + *d_first++ = c(*first1++, *first2++); + } + @endcode + + Iterators are templated to enable stateful range using std::reference_wrapper. + The callable needs to take two arguments of dereferenced elements + from the two input ranges. + + Please refer to @ref ParallelTransforms for details. 
+ */ + template < + typename B1, typename E1, typename B2, typename O, typename C, typename P=GuidedPartitioner, + std::enable_if_t<!is_partitioner_v<std::decay_t<C>>, void>* = nullptr + > + Task transform(B1 first1, E1 last1, B2 first2, O d_first, C c, P&& part = P()); + + // ------------------------------------------------------------------------ + // reduction + // ------------------------------------------------------------------------ + + /** + @brief constructs an STL-styled parallel-reduce task + + @tparam B beginning iterator type + @tparam E ending iterator type + @tparam T result type + @tparam O binary reducer type + @tparam P partitioner type (default tf::GuidedPartitioner) + + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + @param init initial value of the reduction and the storage for the reduced result + @param bop binary operator that will be applied + @param part partitioning algorithm to schedule parallel iterations + + @return a tf::Task handle + + The task spawns asynchronous tasks to perform parallel reduction over @c init + and the elements in the range <tt>[first, last)</tt>. + The reduced result is store in @c init. + This method is equivalent to the parallel execution of the following loop: + + @code{.cpp} + for(auto itr=first; itr!=last; itr++) { + init = bop(init, *itr); + } + @endcode + + Iterators are templated to enable stateful range using std::reference_wrapper. + + Please refer to @ref ParallelReduction for details. + */ + template <typename B, typename E, typename T, typename O, typename P = GuidedPartitioner> + Task reduce(B first, E last, T& init, O bop, P&& part = P()); + + // ------------------------------------------------------------------------ + // transfrom and reduction + // ------------------------------------------------------------------------ + + /** + @brief constructs an STL-styled parallel transform-reduce task + + @tparam B beginning iterator type + @tparam E ending iterator type + @tparam T result type + @tparam BOP binary reducer type + @tparam UOP unary transformion type + @tparam P partitioner type (default tf::GuidedPartitioner) + + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + @param init initial value of the reduction and the storage for the reduced result + @param bop binary operator that will be applied in unspecified order to the results of @c uop + @param uop unary operator that will be applied to transform each element in the range to the result type + @param part partitioning algorithm to schedule parallel iterations + + @return a tf::Task handle + + The task spawns asynchronous tasks to perform parallel reduction over @c init and + the transformed elements in the range <tt>[first, last)</tt>. + The reduced result is store in @c init. + This method is equivalent to the parallel execution of the following loop: + + @code{.cpp} + for(auto itr=first; itr!=last; itr++) { + init = bop(init, uop(*itr)); + } + @endcode + + Iterators are templated to enable stateful range using std::reference_wrapper. + + Please refer to @ref ParallelReduction for details. 
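+
+  For example, the sketch below (an illustrative addition, assuming
+  @c taskflow and @c executor) sums the squares of a range:
+
+  @code{.cpp}
+  std::vector<int> input = {1, 2, 3, 4, 5};
+  int sum = 0;
+  taskflow.transform_reduce(
+    input.begin(), input.end(), sum,
+    std::plus<int>{},                  // bop: reduce the transformed values
+    [](int x){ return x * x; }         // uop: transform each element
+  );
+  executor.run(taskflow).wait();       // sum is 55
+  @endcode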
+ */ + template < + typename B, typename E, typename T, typename BOP, typename UOP, typename P = GuidedPartitioner, + std::enable_if_t<is_partitioner_v<std::decay_t<P>>, void>* = nullptr + > + Task transform_reduce(B first, E last, T& init, BOP bop, UOP uop, P&& part = P()); + + /** + @brief constructs an STL-styled parallel transform-reduce task + @tparam B1 first beginning iterator type + @tparam E1 first ending iterator type + @tparam B2 second beginning iterator type + @tparam T result type + @tparam BOP_R binary reducer type + @tparam BOP_T binary transformion type + @tparam P partitioner type (default tf::GuidedPartitioner) + + @param first1 iterator to the beginning of the first range (inclusive) + @param last1 iterator to the end of the first range (exclusive) + @param first2 iterator to the beginning of the second range + @param init initial value of the reduction and the storage for the reduced result + @param bop_r binary operator that will be applied in unspecified order to the results of @c bop_t + @param bop_t binary operator that will be applied to transform each element in the range to the result type + @param part partitioning algorithm to schedule parallel iterations + + @return a tf::Task handle + + The task spawns asynchronous tasks to perform parallel reduction over @c init and + the transformed elements in the range <tt>[first, last)</tt>. + The reduced result is store in @c init. + This method is equivalent to the parallel execution of the following loop: + + @code{.cpp} + for(auto itr1=first1, itr2=first2; itr1!=last1; itr1++, itr2++) { + init = bop_r(init, bop_t(*itr1, *itr2)); + } + @endcode + + Iterators are templated to enable stateful range using std::reference_wrapper. + + Please refer to @ref ParallelReduction for details. + */ + + template < + typename B1, typename E1, typename B2, typename T, typename BOP_R, typename BOP_T, + typename P = GuidedPartitioner, + std::enable_if_t<!is_partitioner_v<std::decay_t<BOP_T>>, void>* = nullptr + > + Task transform_reduce( + B1 first1, E1 last1, B2 first2, T& init, BOP_R bop_r, BOP_T bop_t, P&& part = P() + ); + + // ------------------------------------------------------------------------ + // scan + // ------------------------------------------------------------------------ + + /** + @brief creates an STL-styled parallel inclusive-scan task + + @tparam B beginning iterator type + @tparam E ending iterator type + @tparam D destination iterator type + @tparam BOP summation operator type + + @param first start of input range + @param last end of input range + @param d_first start of output range (may be the same as input range) + @param bop function to perform summation + + Performs the cumulative sum (aka prefix sum, aka scan) of the input range + and writes the result to the output range. + Each element of the output range contains the + running total of all earlier elements using the given binary operator + for summation. + + This function generates an @em inclusive scan, meaning that the N-th element + of the output range is the sum of the first N input elements, + so the N-th input element is included. + + @code{.cpp} + std::vector<int> input = {1, 2, 3, 4, 5}; + taskflow.inclusive_scan( + input.begin(), input.end(), input.begin(), std::plus<int>{} + ); + executor.run(taskflow).wait(); + + // input is {1, 3, 6, 10, 15} + @endcode + + Iterators are templated to enable stateful range using std::reference_wrapper. + + Please refer to @ref ParallelScan for details. 
+ */ + template <typename B, typename E, typename D, typename BOP> + Task inclusive_scan(B first, E last, D d_first, BOP bop); + + /** + @brief creates an STL-styled parallel inclusive-scan task with an initial value + + @tparam B beginning iterator type + @tparam E ending iterator type + @tparam D destination iterator type + @tparam BOP summation operator type + @tparam T initial value type + + @param first start of input range + @param last end of input range + @param d_first start of output range (may be the same as input range) + @param bop function to perform summation + @param init initial value + + Performs the cumulative sum (aka prefix sum, aka scan) of the input range + and writes the result to the output range. + Each element of the output range contains the + running total of all earlier elements (and the initial value) + using the given binary operator for summation. + + This function generates an @em inclusive scan, meaning the N-th element + of the output range is the sum of the first N input elements, + so the N-th input element is included. + + @code{.cpp} + std::vector<int> input = {1, 2, 3, 4, 5}; + taskflow.inclusive_scan( + input.begin(), input.end(), input.begin(), std::plus<int>{}, -1 + ); + executor.run(taskflow).wait(); + + // input is {0, 2, 5, 9, 14} + @endcode + + Iterators are templated to enable stateful range using std::reference_wrapper. + + Please refer to @ref ParallelScan for details. + + */ + template <typename B, typename E, typename D, typename BOP, typename T> + Task inclusive_scan(B first, E last, D d_first, BOP bop, T init); + + /** + @brief creates an STL-styled parallel exclusive-scan task + + @tparam B beginning iterator type + @tparam E ending iterator type + @tparam D destination iterator type + @tparam T initial value type + @tparam BOP summation operator type + + @param first start of input range + @param last end of input range + @param d_first start of output range (may be the same as input range) + @param init initial value + @param bop function to perform summation + + Performs the cumulative sum (aka prefix sum, aka scan) of the input range + and writes the result to the output range. + Each element of the output range contains the + running total of all earlier elements (and the initial value) + using the given binary operator for summation. + + This function generates an @em exclusive scan, meaning the N-th element + of the output range is the sum of the first N-1 input elements, + so the N-th input element is not included. + + @code{.cpp} + std::vector<int> input = {1, 2, 3, 4, 5}; + taskflow.exclusive_scan( + input.begin(), input.end(), input.begin(), -1, std::plus<int>{} + ); + executor.run(taskflow).wait(); + + // input is {-1, 0, 2, 5, 9} + @endcode + + Iterators are templated to enable stateful range using std::reference_wrapper. + + Please refer to @ref ParallelScan for details. 
+ */ + template <typename B, typename E, typename D, typename T, typename BOP> + Task exclusive_scan(B first, E last, D d_first, T init, BOP bop); + + // ------------------------------------------------------------------------ + // transform scan + // ------------------------------------------------------------------------ + + /** + @brief creates an STL-styled parallel transform-inclusive scan task + + @tparam B beginning iterator type + @tparam E ending iterator type + @tparam D destination iterator type + @tparam BOP summation operator type + @tparam UOP transform operator type + + @param first start of input range + @param last end of input range + @param d_first start of output range (may be the same as input range) + @param bop function to perform summation + @param uop function to transform elements of the input range + + Write the cumulative sum (aka prefix sum, aka scan) of the input range + to the output range. Each element of the output range contains the + running total of all earlier elements + using @c uop to transform the input elements + and using @c bop for summation. + + This function generates an @em inclusive scan, meaning the Nth element + of the output range is the sum of the first N input elements, + so the Nth input element is included. + + @code{.cpp} + std::vector<int> input = {1, 2, 3, 4, 5}; + taskflow.transform_inclusive_scan( + input.begin(), input.end(), input.begin(), std::plus<int>{}, + [] (int item) { return -item; } + ); + executor.run(taskflow).wait(); + + // input is {-1, -3, -6, -10, -15} + @endcode + + Iterators are templated to enable stateful range using std::reference_wrapper. + + Please refer to @ref ParallelScan for details. + */ + template <typename B, typename E, typename D, typename BOP, typename UOP> + Task transform_inclusive_scan(B first, E last, D d_first, BOP bop, UOP uop); + + /** + @brief creates an STL-styled parallel transform-inclusive scan task + + @tparam B beginning iterator type + @tparam E ending iterator type + @tparam D destination iterator type + @tparam BOP summation operator type + @tparam UOP transform operator type + @tparam T initial value type + + @param first start of input range + @param last end of input range + @param d_first start of output range (may be the same as input range) + @param bop function to perform summation + @param uop function to transform elements of the input range + @param init initial value + + Write the cumulative sum (aka prefix sum, aka scan) of the input range + to the output range. Each element of the output range contains the + running total of all earlier elements (including an initial value) + using @c uop to transform the input elements + and using @c bop for summation. + + This function generates an @em inclusive scan, meaning the Nth element + of the output range is the sum of the first N input elements, + so the Nth input element is included. + + @code{.cpp} + std::vector<int> input = {1, 2, 3, 4, 5}; + taskflow.transform_inclusive_scan( + input.begin(), input.end(), input.begin(), std::plus<int>{}, + [] (int item) { return -item; }, + -1 + ); + executor.run(taskflow).wait(); + + // input is {-2, -4, -7, -11, -16} + @endcode + + Iterators are templated to enable stateful range using std::reference_wrapper. + + Please refer to @ref ParallelScan for details. 
+ */ + template <typename B, typename E, typename D, typename BOP, typename UOP, typename T> + Task transform_inclusive_scan(B first, E last, D d_first, BOP bop, UOP uop, T init); + + /** + @brief creates an STL-styled parallel transform-exclusive scan task + + @tparam B beginning iterator type + @tparam E ending iterator type + @tparam D destination iterator type + @tparam BOP summation operator type + @tparam UOP transform operator type + @tparam T initial value type + + @param first start of input range + @param last end of input range + @param d_first start of output range (may be the same as input range) + @param bop function to perform summation + @param uop function to transform elements of the input range + @param init initial value + + Write the cumulative sum (aka prefix sum, aka scan) of the input range + to the output range. Each element of the output range contains the + running total of all earlier elements (including an initial value) + using @c uop to transform the input elements + and using @c bop for summation. + + This function generates an @em exclusive scan, meaning the Nth element + of the output range is the sum of the first N-1 input elements, + so the Nth input element is not included. + + @code{.cpp} + std::vector<int> input = {1, 2, 3, 4, 5}; + taskflow.transform_exclusive_scan( + input.begin(), input.end(), input.begin(), -1, std::plus<int>{}, + [](int item) { return -item; } + ); + executor.run(taskflow).wait(); + + // input is {-1, -2, -4, -7, -11} + @endcode + + Iterators are templated to enable stateful range using std::reference_wrapper. + + Please refer to @ref ParallelScan for details. + */ + template <typename B, typename E, typename D, typename T, typename BOP, typename UOP> + Task transform_exclusive_scan(B first, E last, D d_first, T init, BOP bop, UOP uop); + + // ------------------------------------------------------------------------ + // find + // ------------------------------------------------------------------------ + + /** + @brief constructs a task to perform STL-styled find-if algorithm + + @tparam B beginning iterator type + @tparam E ending iterator type + @tparam T resulting iterator type + @tparam UOP unary predicate type + @tparam P partitioner type + + @param first start of the input range + @param last end of the input range + @param result resulting iterator to the found element in the input range + @param predicate unary predicate which returns @c true for the required element + @param part partitioning algorithm (default tf::GuidedPartitioner) + + Returns an iterator to the first element in the range <tt>[first, last)</tt> + that satisfies the given criteria (or last if there is no such iterator). + This method is equivalent to the parallel execution of the following loop: + + @code{.cpp} + auto find_if(InputIt first, InputIt last, UnaryPredicate p) { + for (; first != last; ++first) { + if (predicate(*first)){ + return first; + } + } + return last; + } + @endcode + + For example, the code below find the element that satisfies the given + criteria (value plus one is equal to 23) from an input range of 10 elements: + + @code{.cpp} + std::vector<int> input = {1, 6, 9, 10, 22, 5, 7, 8, 9, 11}; + std::vector<int>::iterator result; + taskflow.find_if( + input.begin(), input.end(), [](int i){ return i+1 = 23; }, result + ); + executor.run(taskflow).wait(); + assert(*result == 22); + @endcode + + Iterators are templated to enable stateful range using std::reference_wrapper. 
+ */ + template <typename B, typename E, typename T, typename UOP, typename P = GuidedPartitioner> + Task find_if(B first, E last, T& result, UOP predicate, P&& part = P()); + + /** + @brief constructs a task to perform STL-styled find-if-not algorithm + + @tparam B beginning iterator type + @tparam E ending iterator type + @tparam T resulting iterator type + @tparam UOP unary predicate type + @tparam P partitioner type + + @param first start of the input range + @param last end of the input range + @param result resulting iterator to the found element in the input range + @param predicate unary predicate which returns @c false for the required element + @param part partitioning algorithm (default tf::GuidedPartitioner) + + Returns an iterator to the first element in the range <tt>[first, last)</tt> + that satisfies the given criteria (or last if there is no such iterator). + This method is equivalent to the parallel execution of the following loop: + + @code{.cpp} + auto find_if(InputIt first, InputIt last, UnaryPredicate p) { + for (; first != last; ++first) { + if (!predicate(*first)){ + return first; + } + } + return last; + } + @endcode + + For example, the code below find the element that satisfies the given + criteria (value is not equal to 1) from an input range of 10 elements: + + @code{.cpp} + std::vector<int> input = {1, 1, 1, 1, 22, 1, 1, 1, 1, 1}; + std::vector<int>::iterator result; + taskflow.find_if_not( + input.begin(), input.end(), [](int i){ return i == 1; }, result + ); + executor.run(taskflow).wait(); + assert(*result == 22); + @endcode + + Iterators are templated to enable stateful range using std::reference_wrapper. + */ + template <typename B, typename E, typename T, typename UOP,typename P = GuidedPartitioner> + Task find_if_not(B first, E last, T& result, UOP predicate, P&& part = P()); + + /** + @brief constructs a task to perform STL-styled min-element algorithm + + @tparam B beginning iterator type + @tparam E ending iterator type + @tparam T resulting iterator type + @tparam C comparator type + @tparam P partitioner type + + @param first start of the input range + @param last end of the input range + @param result resulting iterator to the found element in the input range + @param comp comparison function object + @param part partitioning algorithm (default tf::GuidedPartitioner) + + Finds the smallest element in the <tt>[first, last)</tt> + using the given comparison function object. + The iterator to that smallest element is stored in @c result. + This method is equivalent to the parallel execution of the following loop: + + @code{.cpp} + if (first == last) { + return last; + } + auto smallest = first; + ++first; + for (; first != last; ++first) { + if (comp(*first, *smallest)) { + smallest = first; + } + } + return smallest; + @endcode + + For example, the code below find the smallest element from an input + range of 10 elements. + + @code{.cpp} + std::vector<int> input = {1, 1, 1, 1, 1, -1, 1, 1, 1, 1}; + std::vector<int>::iterator result; + taskflow.min_element( + input.begin(), input.end(), std::less<int>(), result + ); + executor.run(taskflow).wait(); + assert(*result == -1); + @endcode + + Iterators are templated to enable stateful range using std::reference_wrapper. 
+ */ + template <typename B, typename E, typename T, typename C, typename P> + Task min_element(B first, E last, T& result, C comp, P&& part); + + /** + @brief constructs a task to perform STL-styled max-element algorithm + + @tparam B beginning iterator type + @tparam E ending iterator type + @tparam T resulting iterator type + @tparam C comparator type + @tparam P partitioner type + + @param first start of the input range + @param last end of the input range + @param result resulting iterator to the found element in the input range + @param comp comparison function object + @param part partitioning algorithm (default tf::GuidedPartitioner) + + Finds the largest element in the <tt>[first, last)</tt> + using the given comparison function object. + The iterator to that largest element is stored in @c result. + This method is equivalent to the parallel execution of the following loop: + + @code{.cpp} + if (first == last){ + return last; + } + auto largest = first; + ++first; + for (; first != last; ++first) { + if (comp(*largest, *first)) { + largest = first; + } + } + return largest; + @endcode + + For example, the code below find the largest element from an input + range of 10 elements. + + @code{.cpp} + std::vector<int> input = {1, 1, 1, 1, 1, 2, 1, 1, 1, 1}; + std::vector<int>::iterator result; + taskflow.max_element( + input.begin(), input.end(), std::less<int>(), result + ); + executor.run(taskflow).wait(); + assert(*result == 2); + @endcode + + Iterators are templated to enable stateful range using std::reference_wrapper. + */ + template <typename B, typename E, typename T, typename C, typename P> + Task max_element(B first, E last, T& result, C comp, P&& part); + + // ------------------------------------------------------------------------ + // sort + // ------------------------------------------------------------------------ + + /** + @brief constructs a dynamic task to perform STL-styled parallel sort + + @tparam B beginning iterator type (random-accessible) + @tparam E ending iterator type (random-accessible) + @tparam C comparator type + + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + @param cmp comparison operator + + The task spawns asynchronous tasks to sort elements in the range + <tt>[first, last)</tt> in parallel. + + Iterators are templated to enable stateful range using std::reference_wrapper. + + Please refer to @ref ParallelSort for details. + */ + template <typename B, typename E, typename C> + Task sort(B first, E last, C cmp); + + /** + @brief constructs a dynamic task to perform STL-styled parallel sort using + the @c std::less<T> comparator, where @c T is the element type + + @tparam B beginning iterator type (random-accessible) + @tparam E ending iterator type (random-accessible) + + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + + The task spawns asynchronous tasks to parallelly sort elements in the range + <tt>[first, last)</tt> using the @c std::less<T> comparator, + where @c T is the dereferenced iterator type. + + Iterators are templated to enable stateful range using std::reference_wrapper. + + Please refer to @ref ParallelSort for details. 
+ */ + template <typename B, typename E> + Task sort(B first, E last); + + protected: + + /** + @brief associated graph object + */ + Graph& _graph; + + private: + + template <typename L> + void _linearize(L&); +}; + +// Constructor +inline FlowBuilder::FlowBuilder(Graph& graph) : + _graph {graph} { +} + +// Function: emplace +template <typename C, std::enable_if_t<is_static_task_v<C>, void>*> +Task FlowBuilder::emplace(C&& c) { + return Task(_graph._emplace_back("", 0, nullptr, nullptr, 0, + std::in_place_type_t<Node::Static>{}, std::forward<C>(c) + )); +} + +// Function: emplace +template <typename C, std::enable_if_t<is_dynamic_task_v<C>, void>*> +Task FlowBuilder::emplace(C&& c) { + return Task(_graph._emplace_back("", 0, nullptr, nullptr, 0, + std::in_place_type_t<Node::Dynamic>{}, std::forward<C>(c) + )); +} + +// Function: emplace +template <typename C, std::enable_if_t<is_condition_task_v<C>, void>*> +Task FlowBuilder::emplace(C&& c) { + return Task(_graph._emplace_back("", 0, nullptr, nullptr, 0, + std::in_place_type_t<Node::Condition>{}, std::forward<C>(c) + )); +} + +// Function: emplace +template <typename C, std::enable_if_t<is_multi_condition_task_v<C>, void>*> +Task FlowBuilder::emplace(C&& c) { + return Task(_graph._emplace_back("", 0, nullptr, nullptr, 0, + std::in_place_type_t<Node::MultiCondition>{}, std::forward<C>(c) + )); +} + +// Function: emplace +template <typename... C, std::enable_if_t<(sizeof...(C)>1), void>*> +auto FlowBuilder::emplace(C&&... cs) { + return std::make_tuple(emplace(std::forward<C>(cs))...); +} + +// Function: erase +inline void FlowBuilder::erase(Task task) { + + if (!task._node) { + return; + } + + task.for_each_dependent([&] (Task dependent) { + auto& S = dependent._node->_successors; + if(auto I = std::find(S.begin(), S.end(), task._node); I != S.end()) { + S.erase(I); + } + }); + + task.for_each_successor([&] (Task dependent) { + auto& D = dependent._node->_dependents; + if(auto I = std::find(D.begin(), D.end(), task._node); I != D.end()) { + D.erase(I); + } + }); + + _graph._erase(task._node); +} + +// Function: composed_of +template <typename T> +Task FlowBuilder::composed_of(T& object) { + auto node = _graph._emplace_back("", 0, nullptr, nullptr, 0, + std::in_place_type_t<Node::Module>{}, object + ); + return Task(node); +} + +// Function: placeholder +inline Task FlowBuilder::placeholder() { + auto node = _graph._emplace_back("", 0, nullptr, nullptr, 0, + std::in_place_type_t<Node::Placeholder>{} + ); + return Task(node); +} + +// Procedure: _linearize +template <typename L> +void FlowBuilder::_linearize(L& keys) { + + auto itr = keys.begin(); + auto end = keys.end(); + + if(itr == end) { + return; + } + + auto nxt = itr; + + for(++nxt; nxt != end; ++nxt, ++itr) { + itr->_node->_precede(nxt->_node); + } +} + +// Procedure: linearize +inline void FlowBuilder::linearize(std::vector<Task>& keys) { + _linearize(keys); +} + +// Procedure: linearize +inline void FlowBuilder::linearize(std::initializer_list<Task> keys) { + _linearize(keys); +} + +// ---------------------------------------------------------------------------- + +/** +@class Subflow + +@brief class to construct a subflow graph from the execution of a dynamic task + +tf::Subflow is a derived class from tf::Runtime with a specialized mechanism +to manage the execution of a child graph. +By default, a subflow automatically @em joins its parent node. +You may explicitly join or detach a subflow by calling tf::Subflow::join +or tf::Subflow::detach, respectively. 
+The following example creates a taskflow graph that spawns a subflow from +the execution of task @c B, and the subflow contains three tasks, @c B1, +@c B2, and @c B3, where @c B3 runs after @c B1 and @c B2. + +@code{.cpp} +// create three static tasks +tf::Task A = taskflow.emplace([](){}).name("A"); +tf::Task C = taskflow.emplace([](){}).name("C"); +tf::Task D = taskflow.emplace([](){}).name("D"); + +// create a subflow graph (dynamic tasking) +tf::Task B = taskflow.emplace([] (tf::Subflow& subflow) { + tf::Task B1 = subflow.emplace([](){}).name("B1"); + tf::Task B2 = subflow.emplace([](){}).name("B2"); + tf::Task B3 = subflow.emplace([](){}).name("B3"); + B1.precede(B3); + B2.precede(B3); +}).name("B"); + +A.precede(B); // B runs after A +A.precede(C); // C runs after A +B.precede(D); // D runs after B +C.precede(D); // D runs after C +@endcode + +*/ +class Subflow : public FlowBuilder, + public Runtime { + + friend class Executor; + friend class FlowBuilder; + friend class Runtime; + + public: + + /** + @brief enables the subflow to join its parent task + + Performs an immediate action to join the subflow. Once the subflow is joined, + it is considered finished and you may not modify the subflow anymore. + + @code{.cpp} + taskflow.emplace([](tf::Subflow& sf){ + sf.emplace([](){}); + sf.join(); // join the subflow of one task + }); + @endcode + + Only the worker that spawns this subflow can join it. + */ + void join(); + + /** + @brief enables the subflow to detach from its parent task + + Performs an immediate action to detach the subflow. Once the subflow is detached, + it is considered finished and you may not modify the subflow anymore. + + @code{.cpp} + taskflow.emplace([](tf::Subflow& sf){ + sf.emplace([](){}); + sf.detach(); + }); + @endcode + + Only the worker that spawns this subflow can detach it. + */ + void detach(); + + /** + @brief resets the subflow to a joinable state + + @param clear_graph specifies whether to clear the associated graph (default @c true) + + Clears the underlying task graph depending on the + given variable @c clear_graph (default @c true) and then + updates the subflow to a joinable state. + */ + void reset(bool clear_graph = true); + + /** + @brief queries if the subflow is joinable + + This member function queries if the subflow is joinable. + When a subflow is joined or detached, it becomes not joinable. + + @code{.cpp} + taskflow.emplace([](tf::Subflow& sf){ + sf.emplace([](){}); + std::cout << sf.joinable() << '\n'; // true + sf.join(); + std::cout << sf.joinable() << '\n'; // false + }); + @endcode + */ + bool joinable() const noexcept; + + private: + + bool _joinable {true}; + + Subflow(Executor&, Worker&, Node*, Graph&); +}; + +// Constructor +inline Subflow::Subflow( + Executor& executor, Worker& worker, Node* parent, Graph& graph +) : + FlowBuilder {graph}, + Runtime {executor, worker, parent} { + // assert(_parent != nullptr); +} + +// Function: joined +inline bool Subflow::joinable() const noexcept { + return _joinable; +} + +// Procedure: reset +inline void Subflow::reset(bool clear_graph) { + if(clear_graph) { + _graph._clear(); + } + _joinable = true; +} + +} // end of namespace tf. 
--------------------------------------------------- + + + + + + + + + + diff --git a/myxpcs/include/taskflow_/core/graph.hpp b/myxpcs/include/taskflow_/core/graph.hpp new file mode 100644 index 0000000..f7af3e9 --- /dev/null +++ b/myxpcs/include/taskflow_/core/graph.hpp @@ -0,0 +1,1017 @@ +#pragma once + +#include "../utility/traits.hpp" +#include "../utility/iterator.hpp" +#include "../utility/object_pool.hpp" +#include "../utility/os.hpp" +#include "../utility/math.hpp" +#include "../utility/small_vector.hpp" +#include "../utility/serializer.hpp" +#include "error.hpp" +#include "declarations.hpp" +#include "semaphore.hpp" +#include "environment.hpp" +#include "topology.hpp" +#include "tsq.hpp" + +/** +@file graph.hpp +@brief graph include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// Class: Graph +// ---------------------------------------------------------------------------- + +/** +@class Graph + +@brief class to create a graph object + +A graph is the ultimate storage for a task dependency graph and is the main +gateway to interact with an executor. +A graph manages a set of nodes in a global object pool that animates and +recycles node objects efficiently without going through repetitive and +expensive memory allocations and deallocations. +This class is mainly used for creating an opaque graph object in a custom +class to interact with the executor through taskflow composition. + +A graph object is move-only. +*/ +class Graph { + + friend class Node; + friend class FlowBuilder; + friend class Subflow; + friend class Taskflow; + friend class Executor; + + public: + + /** + @brief constructs a graph object + */ + Graph() = default; + + /** + @brief disabled copy constructor + */ + Graph(const Graph&) = delete; + + /** + @brief constructs a graph using move semantics + */ + Graph(Graph&&); + + /** + @brief destructs the graph object + */ + ~Graph(); + + /** + @brief disabled copy assignment operator + */ + Graph& operator = (const Graph&) = delete; + + /** + @brief assigns a graph using move semantics + */ + Graph& operator = (Graph&&); + + /** + @brief queries if the graph is empty + */ + bool empty() const; + + /** + @brief queries the number of nodes in the graph + */ + size_t size() const; + + /** + @brief clears the graph + */ + void clear(); + + private: + + std::vector<Node*> _nodes; + + void _clear(); + void _clear_detached(); + void _merge(Graph&&); + void _erase(Node*); + + /** + @private + */ + template <typename ...ArgsT> + Node* _emplace_back(ArgsT&&...); +}; + +// ---------------------------------------------------------------------------- + +/** +@class Runtime + +@brief class to include a runtime object in a task + +A runtime object allows users to interact with the +scheduling runtime inside a task, such as scheduling an active task, +spawning a subflow, and so on. + +@code{.cpp} +tf::Task A, B, C, D; +std::tie(A, B, C, D) = taskflow.emplace( + [] () { return 0; }, + [&C] (tf::Runtime& rt) { // C must be captured by reference + std::cout << "B\n"; + rt.schedule(C); + }, + [] () { std::cout << "C\n"; }, + [] () { std::cout << "D\n"; } +); +A.precede(B, C, D); +executor.run(taskflow).wait(); +@endcode + +A runtime object is associated with the worker and the executor +that runs the task. 
+ +*/ +class Runtime { + + friend class Executor; + friend class FlowBuilder; + + public: + + /** + @brief destroys the runtime object + + Issues a tf::Runtime::corun_all to finish all spawned asynchronous tasks + and then destroys the runtime object. + */ + ~Runtime(); + + /** + @brief obtains the running executor + + The running executor of a runtime task is the executor that runs + the parent taskflow of that runtime task. + + @code{.cpp} + tf::Executor executor; + tf::Taskflow taskflow; + taskflow.emplace([&](tf::Runtime& rt){ + assert(&(rt.executor()) == &executor); + }); + executor.run(taskflow).wait(); + @endcode + */ + Executor& executor(); + + /** + @brief schedules an active task immediately to the worker's queue + + @param task the given active task to schedule immediately + + This member function immediately schedules an active task to the + task queue of the associated worker in the runtime task. + An active task is a task in a running taskflow. + The task may or may not be running, and scheduling that task + will immediately put the task into the task queue of the worker + that is running the runtime task. + Consider the following example: + + @code{.cpp} + tf::Task A, B, C, D; + std::tie(A, B, C, D) = taskflow.emplace( + [] () { return 0; }, + [&C] (tf::Runtime& rt) { // C must be captured by reference + std::cout << "B\n"; + rt.schedule(C); + }, + [] () { std::cout << "C\n"; }, + [] () { std::cout << "D\n"; } + ); + A.precede(B, C, D); + executor.run(taskflow).wait(); + @endcode + + The executor will first run the condition task @c A which returns @c 0 + to inform the scheduler to go to the runtime task @c B. + During the execution of @c B, it directly schedules task @c C without + going through the normal taskflow graph scheduling process. + At this moment, task @c C is active because its parent taskflow is running. + When the taskflow finishes, we will see both @c B and @c C in the output. + */ + void schedule(Task task); + + /** + @brief runs the given callable asynchronously + + @tparam F callable type + @param f callable object + + The method creates an asynchronous task to launch the given + function on the given arguments. + The difference to tf::Executor::async is that the created asynchronous task + pertains to the runtime object. + Applications can explicitly issue tf::Runtime::corun_all + to wait for all spawned asynchronous tasks to finish. + For example: + + @code{.cpp} + std::atomic<int> counter(0); + taskflow.emplace([&](tf::Runtime& rt){ + auto fu1 = rt.async([&](){ counter++; }); + auto fu2 = rt.async([&](){ counter++; }); + fu1.get(); + fu2.get(); + assert(counter == 2); + + // spawn 100 asynchronous tasks from the worker of the runtime + for(int i=0; i<100; i++) { + rt.async([&](){ counter++; }); + } + + // wait for the 100 asynchronous tasks to finish + rt.corun_all(); + assert(counter == 102); + }); + @endcode + + This method is thread-safe and can be called by multiple workers + that hold the reference to the runtime. + For example, the code below spawns 100 tasks from the worker of + a runtime, and each of the 100 tasks spawns another task + that will be run by another worker. 
+ + @code{.cpp} + std::atomic<int> counter(0); + taskflow.emplace([&](tf::Runtime& rt){ + // worker of the runtime spawns 100 tasks each spawning another task + // that will be run by another worker + for(int i=0; i<100; i++) { + rt.async([&](){ + counter++; + rt.async([](){ counter++; }); + }); + } + + // wait for the 200 asynchronous tasks to finish + rt.corun_all(); + assert(counter == 200); + }); + @endcode + */ + template <typename F> + auto async(F&& f); + + /** + @brief similar to tf::Runtime::async but assigns the task a name + + @tparam F callable type + + @param name assigned name to the task + @param f callable + + @code{.cpp} + taskflow.emplace([&](tf::Runtime& rt){ + auto future = rt.async("my task", [](){}); + future.get(); + }); + @endcode + + */ + template <typename F> + auto async(const std::string& name, F&& f); + + /** + @brief runs the given function asynchronously without returning any future object + + @tparam F callable type + @param f callable + + This member function is more efficient than tf::Runtime::async + and is encouraged to use when there is no data returned. + + @code{.cpp} + std::atomic<int> counter(0); + taskflow.emplace([&](tf::Runtime& rt){ + for(int i=0; i<100; i++) { + rt.silent_async([&](){ counter++; }); + } + rt.corun_all(); + assert(counter == 100); + }); + @endcode + + This member function is thread-safe. + */ + template <typename F> + void silent_async(F&& f); + + /** + @brief similar to tf::Runtime::silent_async but assigns the task a name + + @tparam F callable type + @param name assigned name to the task + @param f callable + + @code{.cpp} + taskflow.emplace([&](tf::Runtime& rt){ + rt.silent_async("my task", [](){}); + rt.corun_all(); + }); + @endcode + */ + template <typename F> + void silent_async(const std::string& name, F&& f); + + /** + @brief similar to tf::Runtime::silent_async but the caller must be the worker of the runtime + + @tparam F callable type + + @param name assigned name to the task + @param f callable + + The method bypass the check of the caller worker from the executor + and thus can only called by the worker of this runtime. + + @code{.cpp} + taskflow.emplace([&](tf::Runtime& rt){ + // running by the worker of this runtime + rt.silent_async_unchecked("my task", [](){}); + rt.corun_all(); + }); + @endcode + */ + template <typename F> + void silent_async_unchecked(const std::string& name, F&& f); + + /** + @brief co-runs the given target and waits until it completes + + A target can be one of the following forms: + + a dynamic task to spawn a subflow or + + a composable graph object with `tf::Graph& T::graph()` defined + + @code{.cpp} + // co-run a subflow and wait until all tasks complete + taskflow.emplace([](tf::Runtime& rt){ + rt.corun([](tf::Subflow& sf){ + tf::Task A = sf.emplace([](){}); + tf::Task B = sf.emplace([](){}); + }); + }); + + // co-run a taskflow and wait until all tasks complete + tf::Taskflow taskflow1, taskflow2; + taskflow1.emplace([](){ std::cout << "running taskflow1\n"; }); + taskflow2.emplace([&](tf::Runtime& rt){ + std::cout << "running taskflow2\n"; + rt.corun(taskflow1); + }); + executor.run(taskflow2).wait(); + @endcode + + Although tf::Runtime::corun blocks until the operation completes, + the caller thread (worker) is not blocked (e.g., sleeping or holding any lock). + Instead, the caller thread joins the work-stealing loop of the executor + and returns when all tasks in the target completes. + + @attention + Only the worker of this tf::Runtime can issue corun. 
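+
+  As an added illustration (a sketch, not from the original documentation,
+  assuming tf::Taskflow exposes its underlying graph through a public
+  graph() accessor), any user-defined type with `tf::Graph& graph()` can be
+  co-run the same way; the hypothetical wrapper below simply forwards to the
+  graph of a taskflow it owns:
+
+  @code{.cpp}
+  struct MyModule {                 // hypothetical composable object
+    tf::Taskflow flow;
+    tf::Graph& graph() { return flow.graph(); }
+  };
+
+  MyModule mod;
+  mod.flow.emplace([](){ std::cout << "module task\n"; });
+
+  taskflow.emplace([&](tf::Runtime& rt){
+    rt.corun(mod);                  // waits until all tasks in mod complete
+  });
+  @endcode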
+ */ + template <typename T> + void corun(T&& target); + + /** + @brief keeps running the work-stealing loop until the predicate becomes true + + @tparam P predicate type + @param predicate a boolean predicate to indicate when to stop the loop + + The method keeps the caller worker running in the work-stealing loop + until the stop predicate becomes true. + + @attention + Only the worker of this tf::Runtime can issue corun. + */ + template <typename P> + void corun_until(P&& predicate); + + /** + @brief corun all asynchronous tasks spawned by this runtime with other workers + + Coruns all asynchronous tasks (tf::Runtime::async, + tf::Runtime::silent_async) with other workers until all those + asynchronous tasks finish. + + @code{.cpp} + std::atomic<size_t> counter{0}; + taskflow.emplace([&](tf::Runtime& rt){ + // spawn 100 async tasks and wait + for(int i=0; i<100; i++) { + rt.silent_async([&](){ counter++; }); + } + rt.corun_all(); + assert(counter == 100); + + // spawn another 100 async tasks and wait + for(int i=0; i<100; i++) { + rt.silent_async([&](){ counter++; }); + } + rt.corun_all(); + assert(counter == 200); + }); + @endcode + + @attention + Only the worker of this tf::Runtime can issue tf::Runtime::corun_all. + */ + inline void corun_all(); + + /** + @brief acquire a reference to the underlying worker + */ + inline Worker& worker(); + + protected: + + /** + @private + */ + explicit Runtime(Executor&, Worker&, Node*); + + /** + @private + */ + Executor& _executor; + + /** + @private + */ + Worker& _worker; + + /** + @private + */ + Node* _parent; + + /** + @private + */ + template <typename F> + auto _async(Worker& w, const std::string& name, F&& f); + + /** + @private + */ + template <typename F> + void _silent_async(Worker& w, const std::string& name, F&& f); +}; + +// constructor +inline Runtime::Runtime(Executor& e, Worker& w, Node* p) : + _executor{e}, + _worker {w}, + _parent {p}{ +} + +// Function: executor +inline Executor& Runtime::executor() { + return _executor; +} + +// Function: worker +inline Worker& Runtime::worker() { + return _worker; +} + +// ---------------------------------------------------------------------------- +// Node +// ---------------------------------------------------------------------------- + +/** +@private +*/ +class Node { + + friend class Graph; + friend class Task; + friend class AsyncTask; + friend class TaskView; + friend class Taskflow; + friend class Executor; + friend class FlowBuilder; + friend class Subflow; + friend class Runtime; + + enum class AsyncState : int { + UNFINISHED = 0, + LOCKED = 1, + FINISHED = 2 + }; + + TF_ENABLE_POOLABLE_ON_THIS; + + // state bit flag + constexpr static int CONDITIONED = 1; + constexpr static int DETACHED = 2; + constexpr static int ACQUIRED = 4; + constexpr static int READY = 8; + + using Placeholder = std::monostate; + + // static work handle + struct Static { + + template <typename C> + Static(C&&); + + std::variant< + std::function<void()>, std::function<void(Runtime&)> + > work; + }; + + // dynamic work handle + struct Dynamic { + + template <typename C> + Dynamic(C&&); + + std::function<void(Subflow&)> work; + Graph subgraph; + }; + + // condition work handle + struct Condition { + + template <typename C> + Condition(C&&); + + std::variant< + std::function<int()>, std::function<int(Runtime&)> + > work; + }; + + // multi-condition work handle + struct MultiCondition { + + template <typename C> + MultiCondition(C&&); + + std::variant< + std::function<SmallVector<int>()>, 
std::function<SmallVector<int>(Runtime&)> + > work; + }; + + // module work handle + struct Module { + + template <typename T> + Module(T&); + + Graph& graph; + }; + + // Async work + struct Async { + + template <typename T> + Async(T&&); + + std::variant< + std::function<void()>, std::function<void(Runtime&)> + > work; + }; + + // silent dependent async + struct DependentAsync { + + template <typename C> + DependentAsync(C&&); + + std::variant< + std::function<void()>, std::function<void(Runtime&)> + > work; + + std::atomic<size_t> use_count {1}; + std::atomic<AsyncState> state {AsyncState::UNFINISHED}; + }; + + using handle_t = std::variant< + Placeholder, // placeholder + Static, // static tasking + Dynamic, // dynamic tasking + Condition, // conditional tasking + MultiCondition, // multi-conditional tasking + Module, // composable tasking + Async, // async tasking + DependentAsync // dependent async tasking + >; + + struct Semaphores { + SmallVector<Semaphore*> to_acquire; + SmallVector<Semaphore*> to_release; + }; + + public: + + // variant index + constexpr static auto PLACEHOLDER = get_index_v<Placeholder, handle_t>; + constexpr static auto STATIC = get_index_v<Static, handle_t>; + constexpr static auto DYNAMIC = get_index_v<Dynamic, handle_t>; + constexpr static auto CONDITION = get_index_v<Condition, handle_t>; + constexpr static auto MULTI_CONDITION = get_index_v<MultiCondition, handle_t>; + constexpr static auto MODULE = get_index_v<Module, handle_t>; + constexpr static auto ASYNC = get_index_v<Async, handle_t>; + constexpr static auto DEPENDENT_ASYNC = get_index_v<DependentAsync, handle_t>; + + Node() = default; + + template <typename... Args> + Node(const std::string&, unsigned, Topology*, Node*, size_t, Args&&... args); + + ~Node(); + + size_t num_successors() const; + size_t num_dependents() const; + size_t num_strong_dependents() const; + size_t num_weak_dependents() const; + + const std::string& name() const; + + private: + + std::string _name; + + unsigned _priority {0}; + + Topology* _topology {nullptr}; + Node* _parent {nullptr}; + + void* _data {nullptr}; + + SmallVector<Node*> _successors; + SmallVector<Node*> _dependents; + + std::atomic<int> _state {0}; + std::atomic<size_t> _join_counter {0}; + + std::unique_ptr<Semaphores> _semaphores; + + handle_t _handle; + + void _precede(Node*); + void _set_up_join_counter(); + + bool _is_cancelled() const; + bool _is_conditioner() const; + bool _acquire_all(SmallVector<Node*>&); + + SmallVector<Node*> _release_all(); +}; + +// ---------------------------------------------------------------------------- +// Node Object Pool +// ---------------------------------------------------------------------------- + +/** +@private +*/ +inline ObjectPool<Node> node_pool; + +// ---------------------------------------------------------------------------- +// Definition for Node::Static +// ---------------------------------------------------------------------------- + +// Constructor +template <typename C> +Node::Static::Static(C&& c) : work {std::forward<C>(c)} { +} + +// ---------------------------------------------------------------------------- +// Definition for Node::Dynamic +// ---------------------------------------------------------------------------- + +// Constructor +template <typename C> +Node::Dynamic::Dynamic(C&& c) : work {std::forward<C>(c)} { +} + +// ---------------------------------------------------------------------------- +// Definition for Node::Condition +// 
---------------------------------------------------------------------------- + +// Constructor +template <typename C> +Node::Condition::Condition(C&& c) : work {std::forward<C>(c)} { +} + +// ---------------------------------------------------------------------------- +// Definition for Node::MultiCondition +// ---------------------------------------------------------------------------- + +// Constructor +template <typename C> +Node::MultiCondition::MultiCondition(C&& c) : work {std::forward<C>(c)} { +} + +// ---------------------------------------------------------------------------- +// Definition for Node::Module +// ---------------------------------------------------------------------------- + +// Constructor +template <typename T> +inline Node::Module::Module(T& obj) : graph{ obj.graph() } { +} + +// ---------------------------------------------------------------------------- +// Definition for Node::Async +// ---------------------------------------------------------------------------- + +// Constructor +template <typename C> +Node::Async::Async(C&& c) : work {std::forward<C>(c)} { +} + +// ---------------------------------------------------------------------------- +// Definition for Node::DependentAsync +// ---------------------------------------------------------------------------- + +// Constructor +template <typename C> +Node::DependentAsync::DependentAsync(C&& c) : work {std::forward<C>(c)} { +} + +// ---------------------------------------------------------------------------- +// Definition for Node +// ---------------------------------------------------------------------------- + +// Constructor +template <typename... Args> +Node::Node( + const std::string& name, + unsigned priority, + Topology* topology, + Node* parent, + size_t join_counter, + Args&&... 
args +) : + _name {name}, + _priority {priority}, + _topology {topology}, + _parent {parent}, + _join_counter {join_counter}, + _handle {std::forward<Args>(args)...} { +} + +// Destructor +inline Node::~Node() { + // this is to avoid stack overflow + + if(_handle.index() == DYNAMIC) { + // using std::get_if instead of std::get makes this compatible + // with older macOS versions + // the result of std::get_if is guaranteed to be non-null + // due to the index check above + auto& subgraph = std::get_if<Dynamic>(&_handle)->subgraph; + std::vector<Node*> nodes; + nodes.reserve(subgraph.size()); + + std::move( + subgraph._nodes.begin(), subgraph._nodes.end(), std::back_inserter(nodes) + ); + subgraph._nodes.clear(); + + size_t i = 0; + + while(i < nodes.size()) { + + if(nodes[i]->_handle.index() == DYNAMIC) { + auto& sbg = std::get_if<Dynamic>(&(nodes[i]->_handle))->subgraph; + std::move( + sbg._nodes.begin(), sbg._nodes.end(), std::back_inserter(nodes) + ); + sbg._nodes.clear(); + } + + ++i; + } + + //auto& np = Graph::_node_pool(); + for(i=0; i<nodes.size(); ++i) { + node_pool.recycle(nodes[i]); + } + } +} + +// Procedure: _precede +inline void Node::_precede(Node* v) { + _successors.push_back(v); + v->_dependents.push_back(this); +} + +// Function: num_successors +inline size_t Node::num_successors() const { + return _successors.size(); +} + +// Function: dependents +inline size_t Node::num_dependents() const { + return _dependents.size(); +} + +// Function: num_weak_dependents +inline size_t Node::num_weak_dependents() const { + size_t n = 0; + for(size_t i=0; i<_dependents.size(); i++) { + //if(_dependents[i]->_handle.index() == Node::CONDITION) { + if(_dependents[i]->_is_conditioner()) { + n++; + } + } + return n; +} + +// Function: num_strong_dependents +inline size_t Node::num_strong_dependents() const { + size_t n = 0; + for(size_t i=0; i<_dependents.size(); i++) { + //if(_dependents[i]->_handle.index() != Node::CONDITION) { + if(!_dependents[i]->_is_conditioner()) { + n++; + } + } + return n; +} + +// Function: name +inline const std::string& Node::name() const { + return _name; +} + +// Function: _is_conditioner +inline bool Node::_is_conditioner() const { + return _handle.index() == Node::CONDITION || + _handle.index() == Node::MULTI_CONDITION; +} + +// Function: _is_cancelled +// we currently only support cancellation of taskflow (no async task) +inline bool Node::_is_cancelled() const { + //return _topology && _topology->_is_cancelled.load(std::memory_order_relaxed); + return _topology && + (_topology->_state.load(std::memory_order_relaxed) & Topology::CANCELLED); +} + +// Procedure: _set_up_join_counter +inline void Node::_set_up_join_counter() { + size_t c = 0; + for(auto p : _dependents) { + //if(p->_handle.index() == Node::CONDITION) { + if(p->_is_conditioner()) { + _state.fetch_or(Node::CONDITIONED, std::memory_order_relaxed); + } + else { + c++; + } + } + _join_counter.store(c, std::memory_order_relaxed); +} + + +// Function: _acquire_all +inline bool Node::_acquire_all(SmallVector<Node*>& nodes) { + + auto& to_acquire = _semaphores->to_acquire; + + for(size_t i = 0; i < to_acquire.size(); ++i) { + if(!to_acquire[i]->_try_acquire_or_wait(this)) { + for(size_t j = 1; j <= i; ++j) { + auto r = to_acquire[i-j]->_release(); + nodes.insert(std::end(nodes), std::begin(r), std::end(r)); + } + return false; + } + } + return true; +} + +// Function: _release_all +inline SmallVector<Node*> Node::_release_all() { + + auto& to_release = _semaphores->to_release; + + SmallVector<Node*> 
nodes; + for(const auto& sem : to_release) { + auto r = sem->_release(); + nodes.insert(std::end(nodes), std::begin(r), std::end(r)); + } + + return nodes; +} + +// ---------------------------------------------------------------------------- +// Node Deleter +// ---------------------------------------------------------------------------- + +/** +@private +*/ +struct NodeDeleter { + void operator ()(Node* ptr) { + node_pool.recycle(ptr); + } +}; + +// ---------------------------------------------------------------------------- +// Graph definition +// ---------------------------------------------------------------------------- + +// Destructor +inline Graph::~Graph() { + _clear(); +} + +// Move constructor +inline Graph::Graph(Graph&& other) : + _nodes {std::move(other._nodes)} { +} + +// Move assignment +inline Graph& Graph::operator = (Graph&& other) { + _clear(); + _nodes = std::move(other._nodes); + return *this; +} + +// Procedure: clear +inline void Graph::clear() { + _clear(); +} + +// Procedure: clear +inline void Graph::_clear() { + for(auto node : _nodes) { + node_pool.recycle(node); + } + _nodes.clear(); +} + +// Procedure: clear_detached +inline void Graph::_clear_detached() { + + auto mid = std::partition(_nodes.begin(), _nodes.end(), [] (Node* node) { + return !(node->_state.load(std::memory_order_relaxed) & Node::DETACHED); + }); + + for(auto itr = mid; itr != _nodes.end(); ++itr) { + node_pool.recycle(*itr); + } + _nodes.resize(std::distance(_nodes.begin(), mid)); +} + +// Procedure: merge +inline void Graph::_merge(Graph&& g) { + for(auto n : g._nodes) { + _nodes.push_back(n); + } + g._nodes.clear(); +} + +// Function: erase +inline void Graph::_erase(Node* node) { + if(auto I = std::find(_nodes.begin(), _nodes.end(), node); I != _nodes.end()) { + _nodes.erase(I); + node_pool.recycle(node); + } +} + +// Function: size +inline size_t Graph::size() const { + return _nodes.size(); +} + +// Function: empty +inline bool Graph::empty() const { + return _nodes.empty(); +} + +/** +@private +*/ +template <typename ...ArgsT> +Node* Graph::_emplace_back(ArgsT&&... args) { + _nodes.push_back(node_pool.animate(std::forward<ArgsT>(args)...)); + return _nodes.back(); +} + +} // end of namespace tf. --------------------------------------------------- diff --git a/myxpcs/include/taskflow_/core/notifier.hpp b/myxpcs/include/taskflow_/core/notifier.hpp new file mode 100644 index 0000000..6bec325 --- /dev/null +++ b/myxpcs/include/taskflow_/core/notifier.hpp @@ -0,0 +1,295 @@ +// 2019/02/09 - created by Tsung-Wei Huang +// - modified the event count from Eigen + +#pragma once + +#include <iostream> +#include <vector> +#include <cstdlib> +#include <cstdio> +#include <atomic> +#include <memory> +#include <deque> +#include <mutex> +#include <condition_variable> +#include <thread> +#include <algorithm> +#include <numeric> +#include <cassert> + +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +namespace tf { + +// Notifier allows to wait for arbitrary predicates in non-blocking +// algorithms. Think of condition variable, but wait predicate does not need to +// be protected by a mutex. 
Usage: +// Waiting thread does: +// +// if (predicate) +// return act(); +// Notifier::Waiter& w = waiters[my_index]; +// ec.prepare_wait(&w); +// if (predicate) { +// ec.cancel_wait(&w); +// return act(); +// } +// ec.commit_wait(&w); +// +// Notifying thread does: +// +// predicate = true; +// ec.notify(true); +// +// notify is cheap if there are no waiting threads. prepare_wait/commit_wait are not +// cheap, but they are executed only if the preceeding predicate check has +// failed. +// +// Algorihtm outline: +// There are two main variables: predicate (managed by user) and _state. +// Operation closely resembles Dekker mutual algorithm: +// https://en.wikipedia.org/wiki/Dekker%27s_algorithm +// Waiting thread sets _state then checks predicate, Notifying thread sets +// predicate then checks _state. Due to seq_cst fences in between these +// operations it is guaranteed than either waiter will see predicate change +// and won't block, or notifying thread will see _state change and will unblock +// the waiter, or both. But it can't happen that both threads don't see each +// other changes, which would lead to deadlock. +class Notifier { + + friend class Executor; + + public: + + struct Waiter { + std::atomic<Waiter*> next; + uint64_t epoch; + enum : unsigned { + kNotSignaled = 0, + kWaiting, + kSignaled, + }; + +#ifdef __cpp_lib_atomic_wait + std::atomic<unsigned> state {0}; +#else + std::mutex mu; + std::condition_variable cv; + unsigned state; +#endif + }; + + explicit Notifier(size_t N) : _waiters{N} { + assert(_waiters.size() < (1 << kWaiterBits) - 1); + // Initialize epoch to something close to overflow to test overflow. + _state = kStackMask | (kEpochMask - kEpochInc * _waiters.size() * 2); + } + + ~Notifier() { + // Ensure there are no waiters. + assert((_state.load() & (kStackMask | kWaiterMask)) == kStackMask); + } + + // prepare_wait prepares for waiting. + // After calling this function the thread must re-check the wait predicate + // and call either cancel_wait or commit_wait passing the same Waiter object. + void prepare_wait(Waiter* w) { + w->epoch = _state.fetch_add(kWaiterInc, std::memory_order_relaxed); + std::atomic_thread_fence(std::memory_order_seq_cst); + } + + // commit_wait commits waiting. + void commit_wait(Waiter* w) { +#ifdef __cpp_lib_atomic_wait + w->state.store(Waiter::kNotSignaled, std::memory_order_relaxed); +#else + w->state = Waiter::kNotSignaled; +#endif + // Modification epoch of this waiter. + uint64_t epoch = + (w->epoch & kEpochMask) + + (((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift); + uint64_t state = _state.load(std::memory_order_seq_cst); + for (;;) { + if (int64_t((state & kEpochMask) - epoch) < 0) { + // The preceeding waiter has not decided on its fate. Wait until it + // calls either cancel_wait or commit_wait, or is notified. + std::this_thread::yield(); + state = _state.load(std::memory_order_seq_cst); + continue; + } + // We've already been notified. + if (int64_t((state & kEpochMask) - epoch) > 0) return; + // Remove this thread from prewait counter and add it to the waiter list. 
+ assert((state & kWaiterMask) != 0); + uint64_t newstate = state - kWaiterInc + kEpochInc; + //newstate = (newstate & ~kStackMask) | (w - &_waiters[0]); + newstate = static_cast<uint64_t>((newstate & ~kStackMask) | static_cast<uint64_t>(w - &_waiters[0])); + if ((state & kStackMask) == kStackMask) + w->next.store(nullptr, std::memory_order_relaxed); + else + w->next.store(&_waiters[state & kStackMask], std::memory_order_relaxed); + if (_state.compare_exchange_weak(state, newstate, + std::memory_order_release)) + break; + } + _park(w); + } + + // cancel_wait cancels effects of the previous prepare_wait call. + void cancel_wait(Waiter* w) { + uint64_t epoch = + (w->epoch & kEpochMask) + + (((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift); + uint64_t state = _state.load(std::memory_order_relaxed); + for (;;) { + if (int64_t((state & kEpochMask) - epoch) < 0) { + // The preceeding waiter has not decided on its fate. Wait until it + // calls either cancel_wait or commit_wait, or is notified. + std::this_thread::yield(); + state = _state.load(std::memory_order_relaxed); + continue; + } + // We've already been notified. + if (int64_t((state & kEpochMask) - epoch) > 0) return; + // Remove this thread from prewait counter. + assert((state & kWaiterMask) != 0); + if (_state.compare_exchange_weak(state, state - kWaiterInc + kEpochInc, + std::memory_order_relaxed)) + return; + } + } + + // notify wakes one or all waiting threads. + // Must be called after changing the associated wait predicate. + void notify(bool all) { + std::atomic_thread_fence(std::memory_order_seq_cst); + uint64_t state = _state.load(std::memory_order_acquire); + for (;;) { + // Easy case: no waiters. + if ((state & kStackMask) == kStackMask && (state & kWaiterMask) == 0) + return; + uint64_t waiters = (state & kWaiterMask) >> kWaiterShift; + uint64_t newstate; + if (all) { + // Reset prewait counter and empty wait list. + newstate = (state & kEpochMask) + (kEpochInc * waiters) + kStackMask; + } else if (waiters) { + // There is a thread in pre-wait state, unblock it. + newstate = state + kEpochInc - kWaiterInc; + } else { + // Pop a waiter from list and unpark it. + Waiter* w = &_waiters[state & kStackMask]; + Waiter* wnext = w->next.load(std::memory_order_relaxed); + uint64_t next = kStackMask; + //if (wnext != nullptr) next = wnext - &_waiters[0]; + if (wnext != nullptr) next = static_cast<uint64_t>(wnext - &_waiters[0]); + // Note: we don't add kEpochInc here. ABA problem on the lock-free stack + // can't happen because a waiter is re-pushed onto the stack only after + // it was in the pre-wait state which inevitably leads to epoch + // increment. + newstate = (state & kEpochMask) + next; + } + if (_state.compare_exchange_weak(state, newstate, + std::memory_order_acquire)) { + if (!all && waiters) return; // unblocked pre-wait thread + if ((state & kStackMask) == kStackMask) return; + Waiter* w = &_waiters[state & kStackMask]; + if (!all) w->next.store(nullptr, std::memory_order_relaxed); + _unpark(w); + return; + } + } + } + + // notify n workers + void notify_n(size_t n) { + if(n >= _waiters.size()) { + notify(true); + } + else { + for(size_t k=0; k<n; ++k) { + notify(false); + } + } + } + + size_t size() const { + return _waiters.size(); + } + + private: + + // State_ layout: + // - low kStackBits is a stack of waiters committed wait. + // - next kWaiterBits is count of waiters in prewait state. + // - next kEpochBits is modification counter. 
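+  //
+  // (added illustration) With the constants below, the 64-bit _state packs
+  // three fields, least-significant bits on the right:
+  //
+  //   bits 63..32          bits 31..16           bits 15..0
+  //   [ epoch counter ]  [ prewait counter ]  [ waiter stack index ]
+  //
+  // kStackMask (0xffff) doubles as the "empty stack" sentinel, which is why
+  // the constructor asserts that fewer than 2^16 - 1 waiters exist.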
+ static const uint64_t kStackBits = 16; + static const uint64_t kStackMask = (1ull << kStackBits) - 1; + static const uint64_t kWaiterBits = 16; + static const uint64_t kWaiterShift = 16; + static const uint64_t kWaiterMask = ((1ull << kWaiterBits) - 1) + << kWaiterShift; + static const uint64_t kWaiterInc = 1ull << kWaiterBits; + static const uint64_t kEpochBits = 32; + static const uint64_t kEpochShift = 32; + static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift; + static const uint64_t kEpochInc = 1ull << kEpochShift; + std::atomic<uint64_t> _state; + std::vector<Waiter> _waiters; + + void _park(Waiter* w) { +#ifdef __cpp_lib_atomic_wait + unsigned target = Waiter::kNotSignaled; + if(w->state.compare_exchange_strong(target, Waiter::kWaiting, + std::memory_order_relaxed, + std::memory_order_relaxed)) { + w->state.wait(Waiter::kWaiting, std::memory_order_relaxed); + } +#else + std::unique_lock<std::mutex> lock(w->mu); + while (w->state != Waiter::kSignaled) { + w->state = Waiter::kWaiting; + w->cv.wait(lock); + } +#endif + } + + void _unpark(Waiter* waiters) { + Waiter* next = nullptr; + for (Waiter* w = waiters; w; w = next) { + next = w->next.load(std::memory_order_relaxed); +#ifdef __cpp_lib_atomic_wait + // We only notify if the other is waiting - this is why we use tri-state + // variable instead of binary-state variable (i.e., atomic_flag) + // Performance is about 0.1% faster + if(w->state.exchange(Waiter::kSignaled, std::memory_order_relaxed) == + Waiter::kWaiting) { + w->state.notify_one(); + } +#else + unsigned state; + { + std::unique_lock<std::mutex> lock(w->mu); + state = w->state; + w->state = Waiter::kSignaled; + } + // Avoid notifying if it wasn't waiting. + if (state == Waiter::kWaiting) w->cv.notify_one(); +#endif + } + } + +}; + + + +} // namespace tf ------------------------------------------------------------ + diff --git a/myxpcs/include/taskflow_/core/observer.hpp b/myxpcs/include/taskflow_/core/observer.hpp new file mode 100644 index 0000000..3c1873e --- /dev/null +++ b/myxpcs/include/taskflow_/core/observer.hpp @@ -0,0 +1,1046 @@ +#pragma once + +#include "task.hpp" +#include "worker.hpp" + +/** +@file observer.hpp +@brief observer include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// timeline data structure +// ---------------------------------------------------------------------------- + +/** +@brief default time point type of observers +*/ +using observer_stamp_t = std::chrono::time_point<std::chrono::steady_clock>; + +/** +@private +*/ +struct Segment { + + std::string name; + TaskType type; + + observer_stamp_t beg; + observer_stamp_t end; + + template <typename Archiver> + auto save(Archiver& ar) const { + return ar(name, type, beg, end); + } + + template <typename Archiver> + auto load(Archiver& ar) { + return ar(name, type, beg, end); + } + + Segment() = default; + + Segment( + const std::string& n, TaskType t, observer_stamp_t b, observer_stamp_t e + ) : name {n}, type {t}, beg {b}, end {e} { + } + + auto span() const { + return end-beg; + } +}; + +/** +@private +*/ +struct Timeline { + + size_t uid; + + observer_stamp_t origin; + std::vector<std::vector<std::vector<Segment>>> segments; + + Timeline() = default; + + Timeline(const Timeline& rhs) = delete; + Timeline(Timeline&& rhs) = default; + + Timeline& operator = (const Timeline& rhs) = delete; + Timeline& operator = (Timeline&& rhs) = default; + + template <typename Archiver> + auto save(Archiver& ar) const { 
+ return ar(uid, origin, segments); + } + + template <typename Archiver> + auto load(Archiver& ar) { + return ar(uid, origin, segments); + } +}; + +/** +@private + */ +struct ProfileData { + + std::vector<Timeline> timelines; + + ProfileData() = default; + + ProfileData(const ProfileData& rhs) = delete; + ProfileData(ProfileData&& rhs) = default; + + ProfileData& operator = (const ProfileData& rhs) = delete; + ProfileData& operator = (ProfileData&&) = default; + + template <typename Archiver> + auto save(Archiver& ar) const { + return ar(timelines); + } + + template <typename Archiver> + auto load(Archiver& ar) { + return ar(timelines); + } +}; + +// ---------------------------------------------------------------------------- +// observer interface +// ---------------------------------------------------------------------------- + +/** +@class: ObserverInterface + +@brief class to derive an executor observer + +The tf::ObserverInterface class allows users to define custom methods to monitor +the behaviors of an executor. This is particularly useful when you want to +inspect the performance of an executor and visualize when each thread +participates in the execution of a task. +To prevent users from direct access to the internal threads and tasks, +tf::ObserverInterface provides immutable wrappers, +tf::WorkerView and tf::TaskView, over workers and tasks. + +Please refer to tf::WorkerView and tf::TaskView for details. + +Example usage: + +@code{.cpp} + +struct MyObserver : public tf::ObserverInterface { + + MyObserver(const std::string& name) { + std::cout << "constructing observer " << name << '\n'; + } + + void set_up(size_t num_workers) override final { + std::cout << "setting up observer with " << num_workers << " workers\n"; + } + + void on_entry(WorkerView w, tf::TaskView tv) override final { + std::ostringstream oss; + oss << "worker " << w.id() << " ready to run " << tv.name() << '\n'; + std::cout << oss.str(); + } + + void on_exit(WorkerView w, tf::TaskView tv) override final { + std::ostringstream oss; + oss << "worker " << w.id() << " finished running " << tv.name() << '\n'; + std::cout << oss.str(); + } +}; + +tf::Taskflow taskflow; +tf::Executor executor; + +// insert tasks into taskflow +// ... 
+ +// create a custom observer +std::shared_ptr<MyObserver> observer = executor.make_observer<MyObserver>("MyObserver"); + +// run the taskflow +executor.run(taskflow).wait(); +@endcode +*/ +class ObserverInterface { + + public: + + /** + @brief virtual destructor + */ + virtual ~ObserverInterface() = default; + + /** + @brief constructor-like method to call when the executor observer is fully created + @param num_workers the number of the worker threads in the executor + */ + virtual void set_up(size_t num_workers) = 0; + + /** + @brief method to call before a worker thread executes a closure + @param wv an immutable view of this worker thread + @param task_view a constant wrapper object to the task + */ + virtual void on_entry(WorkerView wv, TaskView task_view) = 0; + + /** + @brief method to call after a worker thread executed a closure + @param wv an immutable view of this worker thread + @param task_view a constant wrapper object to the task + */ + virtual void on_exit(WorkerView wv, TaskView task_view) = 0; +}; + +// ---------------------------------------------------------------------------- +// ChromeObserver definition +// ---------------------------------------------------------------------------- + +/** +@class: ChromeObserver + +@brief class to create an observer based on Chrome tracing format + +A tf::ChromeObserver inherits tf::ObserverInterface and defines methods to dump +the observed thread activities into a format that can be visualized through +@ChromeTracing. + +@code{.cpp} +tf::Taskflow taskflow; +tf::Executor executor; + +// insert tasks into taskflow +// ... + +// create a custom observer +std::shared_ptr<tf::ChromeObserver> observer = executor.make_observer<tf::ChromeObserver>(); + +// run the taskflow +executor.run(taskflow).wait(); + +// dump the thread activities to a chrome-tracing format. 
+observer->dump(std::cout); +@endcode +*/ +class ChromeObserver : public ObserverInterface { + + friend class Executor; + + // data structure to record each task execution + struct Segment { + + std::string name; + + observer_stamp_t beg; + observer_stamp_t end; + + Segment( + const std::string& n, + observer_stamp_t b, + observer_stamp_t e + ); + }; + + // data structure to store the entire execution timeline + struct Timeline { + observer_stamp_t origin; + std::vector<std::vector<Segment>> segments; + std::vector<std::stack<observer_stamp_t>> stacks; + }; + + public: + + /** + @brief dumps the timelines into a @ChromeTracing format through + an output stream + */ + void dump(std::ostream& ostream) const; + + /** + @brief dumps the timelines into a @ChromeTracing format + */ + inline std::string dump() const; + + /** + @brief clears the timeline data + */ + inline void clear(); + + /** + @brief queries the number of tasks observed + */ + inline size_t num_tasks() const; + + private: + + inline void set_up(size_t num_workers) override final; + inline void on_entry(WorkerView w, TaskView task_view) override final; + inline void on_exit(WorkerView w, TaskView task_view) override final; + + Timeline _timeline; +}; + +// constructor +inline ChromeObserver::Segment::Segment( + const std::string& n, observer_stamp_t b, observer_stamp_t e +) : + name {n}, beg {b}, end {e} { +} + +// Procedure: set_up +inline void ChromeObserver::set_up(size_t num_workers) { + _timeline.segments.resize(num_workers); + _timeline.stacks.resize(num_workers); + + for(size_t w=0; w<num_workers; ++w) { + _timeline.segments[w].reserve(32); + } + + _timeline.origin = observer_stamp_t::clock::now(); +} + +// Procedure: on_entry +inline void ChromeObserver::on_entry(WorkerView wv, TaskView) { + _timeline.stacks[wv.id()].push(observer_stamp_t::clock::now()); +} + +// Procedure: on_exit +inline void ChromeObserver::on_exit(WorkerView wv, TaskView tv) { + + size_t w = wv.id(); + + assert(!_timeline.stacks[w].empty()); + + auto beg = _timeline.stacks[w].top(); + _timeline.stacks[w].pop(); + + _timeline.segments[w].emplace_back( + tv.name(), beg, observer_stamp_t::clock::now() + ); +} + +// Function: clear +inline void ChromeObserver::clear() { + for(size_t w=0; w<_timeline.segments.size(); ++w) { + _timeline.segments[w].clear(); + while(!_timeline.stacks[w].empty()) { + _timeline.stacks[w].pop(); + } + } +} + +// Procedure: dump +inline void ChromeObserver::dump(std::ostream& os) const { + + using namespace std::chrono; + + size_t first; + + for(first = 0; first<_timeline.segments.size(); ++first) { + if(_timeline.segments[first].size() > 0) { + break; + } + } + + os << '['; + + for(size_t w=first; w<_timeline.segments.size(); w++) { + + if(w != first && _timeline.segments[w].size() > 0) { + os << ','; + } + + for(size_t i=0; i<_timeline.segments[w].size(); i++) { + + os << '{'<< "\"cat\":\"ChromeObserver\","; + + // name field + os << "\"name\":\""; + if(_timeline.segments[w][i].name.empty()) { + os << w << '_' << i; + } + else { + os << _timeline.segments[w][i].name; + } + os << "\","; + + // segment field + os << "\"ph\":\"X\"," + << "\"pid\":1," + << "\"tid\":" << w << ',' + << "\"ts\":" << duration_cast<microseconds>( + _timeline.segments[w][i].beg - _timeline.origin + ).count() << ',' + << "\"dur\":" << duration_cast<microseconds>( + _timeline.segments[w][i].end - _timeline.segments[w][i].beg + ).count(); + + if(i != _timeline.segments[w].size() - 1) { + os << "},"; + } + else { + os << '}'; + } + } + } + os << "]\n"; +} 
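+
+// (added note) For reference, ChromeObserver::dump emits a flat array of
+// Chrome-tracing "complete" events; one observed task renders roughly as the
+// line below (timestamps are microseconds relative to the observer origin,
+// values illustrative only), and the output can be loaded into the
+// chrome://tracing viewer:
+//
+//   [{"cat":"ChromeObserver","name":"my_task","ph":"X","pid":1,"tid":0,"ts":12,"dur":34}]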
+ +// Function: dump +inline std::string ChromeObserver::dump() const { + std::ostringstream oss; + dump(oss); + return oss.str(); +} + +// Function: num_tasks +inline size_t ChromeObserver::num_tasks() const { + return std::accumulate( + _timeline.segments.begin(), _timeline.segments.end(), size_t{0}, + [](size_t sum, const auto& exe){ + return sum + exe.size(); + } + ); +} + +// ---------------------------------------------------------------------------- +// TFProfObserver definition +// ---------------------------------------------------------------------------- + +/** +@class TFProfObserver + +@brief class to create an observer based on the built-in taskflow profiler format + +A tf::TFProfObserver inherits tf::ObserverInterface and defines methods to dump +the observed thread activities into a format that can be visualized through +@TFProf. + +@code{.cpp} +tf::Taskflow taskflow; +tf::Executor executor; + +// insert tasks into taskflow +// ... + +// create a custom observer +std::shared_ptr<tf::TFProfObserver> observer = executor.make_observer<tf::TFProfObserver>(); + +// run the taskflow +executor.run(taskflow).wait(); + +// dump the thread activities to Taskflow Profiler format. +observer->dump(std::cout); +@endcode + +*/ +class TFProfObserver : public ObserverInterface { + + friend class Executor; + friend class TFProfManager; + + /** @private overall task summary */ + struct TaskSummary { + size_t count {0}; + size_t total_span {0}; + size_t min_span; + size_t max_span; + + float avg_span() const { return total_span * 1.0f / count; } + }; + + /** @private worker summary at a level */ + struct WorkerSummary { + + size_t id; + size_t level; + size_t count {0}; + size_t total_span {0}; + size_t min_span{0}; + size_t max_span{0}; + + std::array<TaskSummary, TASK_TYPES.size()> tsum; + + float avg_span() const { return total_span * 1.0f / count; } + //return count < 2 ? 
0.0f : total_delay * 1.0f / (count-1); + }; + + /** @private */ + struct Summary { + std::array<TaskSummary, TASK_TYPES.size()> tsum; + std::vector<WorkerSummary> wsum; + + void dump_tsum(std::ostream&) const; + void dump_wsum(std::ostream&) const; + void dump(std::ostream&) const; + }; + + public: + + /** + @brief dumps the timelines into a @TFProf format through + an output stream + */ + void dump(std::ostream& ostream) const; + + /** + @brief dumps the timelines into a JSON string + */ + std::string dump() const; + + /** + @brief shows the summary report through an output stream + */ + void summary(std::ostream& ostream) const; + + /** + @brief returns the summary report in a string + */ + std::string summary() const; + + /** + @brief clears the timeline data + */ + void clear(); + + /** + @brief queries the number of tasks observed + */ + size_t num_tasks() const; + + /** + @brief queries the number of observed workers + */ + size_t num_workers() const; + + private: + + Timeline _timeline; + + std::vector<std::stack<observer_stamp_t>> _stacks; + + inline void set_up(size_t num_workers) override final; + inline void on_entry(WorkerView, TaskView) override final; + inline void on_exit(WorkerView, TaskView) override final; +}; + + +// dump the task summary +inline void TFProfObserver::Summary::dump_tsum(std::ostream& os) const { + + // task summary + size_t type_w{10}, count_w{5}, time_w{9}, avg_w{8}, min_w{8}, max_w{8}; + + std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){ + if(i.count == 0) return; + count_w = std::max(count_w, std::to_string(i.count).size()); + }); + + std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){ + if(i.count == 0) return; + time_w = std::max(time_w, std::to_string(i.total_span).size()); + }); + + std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){ + if(i.count == 0) return; + avg_w = std::max(time_w, std::to_string(i.avg_span()).size()); + }); + + std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){ + if(i.count == 0) return; + min_w = std::max(min_w, std::to_string(i.min_span).size()); + }); + + std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){ + if(i.count == 0) return; + max_w = std::max(max_w, std::to_string(i.max_span).size()); + }); + + os << std::setw(type_w) << "-Task-" + << std::setw(count_w+2) << "Count" + << std::setw(time_w+2) << "Time (us)" + << std::setw(avg_w+2) << "Avg (us)" + << std::setw(min_w+2) << "Min (us)" + << std::setw(max_w+2) << "Max (us)" + << '\n'; + + for(size_t i=0; i<TASK_TYPES.size(); i++) { + if(tsum[i].count == 0) { + continue; + } + os << std::setw(type_w) << to_string(TASK_TYPES[i]) + << std::setw(count_w+2) << tsum[i].count + << std::setw(time_w+2) << tsum[i].total_span + << std::setw(avg_w+2) << std::to_string(tsum[i].avg_span()) + << std::setw(min_w+2) << tsum[i].min_span + << std::setw(max_w+2) << tsum[i].max_span + << '\n'; + } +} + +// dump the worker summary +inline void TFProfObserver::Summary::dump_wsum(std::ostream& os) const { + + // task summary + size_t w_w{10}, t_w{10}, l_w{5}, c_w{5}, d_w{9}, avg_w{8}, min_w{8}, max_w{8}; + + std::for_each(wsum.begin(), wsum.end(), [&](const auto& i){ + if(i.count == 0) return; + l_w = std::max(l_w, std::to_string(i.level).size()); + }); + + std::for_each(wsum.begin(), wsum.end(), [&](const auto& i){ + if(i.count == 0) return; + c_w = std::max(c_w, std::to_string(i.count).size()); + }); + + std::for_each(wsum.begin(), wsum.end(), [&](const auto& i){ + if(i.count == 0) return; + d_w = std::max(d_w, std::to_string(i.total_span).size()); 
+ }); + + std::for_each(wsum.begin(), wsum.end(), [&](const auto& i){ + if(i.count == 0) return; + avg_w = std::max(avg_w, std::to_string(i.avg_span()).size()); + }); + + std::for_each(wsum.begin(), wsum.end(), [&](const auto& i){ + if(i.count == 0) return; + min_w = std::max(min_w, std::to_string(i.min_span).size()); + }); + + std::for_each(wsum.begin(), wsum.end(), [&](const auto& i){ + if(i.count == 0) return; + max_w = std::max(max_w, std::to_string(i.max_span).size()); + }); + + os << std::setw(w_w) << "-Worker-" + << std::setw(l_w+2) << "Level" + << std::setw(t_w) << "Task" + << std::setw(c_w+2) << "Count" + << std::setw(d_w+2) << "Time (us)" + << std::setw(avg_w+2) << "Avg (us)" + << std::setw(min_w+2) << "Min (us)" + << std::setw(max_w+2) << "Max (us)" + << '\n'; + + for(const auto& ws : wsum) { + + if(ws.count == 0) { + continue; + } + + os << std::setw(w_w) << ws.id + << std::setw(l_w+2) << ws.level; + + bool first = true; + for(size_t i=0; i<TASK_TYPES.size(); i++) { + + if(ws.tsum[i].count == 0) { + continue; + } + + os << (first ? std::setw(t_w) : std::setw(w_w + l_w + 2 + t_w)); + first = false; + + os << to_string(TASK_TYPES[i]) + << std::setw(c_w+2) << ws.tsum[i].count + << std::setw(d_w+2) << ws.tsum[i].total_span + << std::setw(avg_w+2) << std::to_string(ws.tsum[i].avg_span()) + << std::setw(min_w+2) << ws.tsum[i].min_span + << std::setw(max_w+2) << ws.tsum[i].max_span + << '\n'; + } + + // per-worker summary + os << std::setw(w_w + l_w + t_w + c_w + 4) << ws.count + << std::setw(d_w+2) << ws.total_span + << std::setw(avg_w+2) << std::to_string(ws.avg_span()) + << std::setw(min_w+2) << ws.min_span + << std::setw(max_w+2) << ws.max_span + << '\n'; + + //for(size_t j=0; j<w_w+l_w+t_w+4; j++) os << ' '; + //for(size_t j=0; j<c_w+d_w+avg_w+min_w+max_w+8; j++) os << '-'; + //os <<'\n'; + } +} + +// dump the summary report through an ostream +inline void TFProfObserver::Summary::dump(std::ostream& os) const { + dump_tsum(os); + os << '\n'; + dump_wsum(os); +} + +// Procedure: set_up +inline void TFProfObserver::set_up(size_t num_workers) { + _timeline.uid = unique_id<size_t>(); + _timeline.origin = observer_stamp_t::clock::now(); + _timeline.segments.resize(num_workers); + _stacks.resize(num_workers); +} + +// Procedure: on_entry +inline void TFProfObserver::on_entry(WorkerView wv, TaskView) { + _stacks[wv.id()].push(observer_stamp_t::clock::now()); +} + +// Procedure: on_exit +inline void TFProfObserver::on_exit(WorkerView wv, TaskView tv) { + + size_t w = wv.id(); + + assert(!_stacks[w].empty()); + + if(_stacks[w].size() > _timeline.segments[w].size()) { + _timeline.segments[w].resize(_stacks[w].size()); + } + + auto beg = _stacks[w].top(); + _stacks[w].pop(); + + _timeline.segments[w][_stacks[w].size()].emplace_back( + tv.name(), tv.type(), beg, observer_stamp_t::clock::now() + ); +} + +// Function: clear +inline void TFProfObserver::clear() { + for(size_t w=0; w<_timeline.segments.size(); ++w) { + for(size_t l=0; l<_timeline.segments[w].size(); ++l) { + _timeline.segments[w][l].clear(); + } + while(!_stacks[w].empty()) { + _stacks[w].pop(); + } + } +} + +// Procedure: dump +inline void TFProfObserver::dump(std::ostream& os) const { + + using namespace std::chrono; + + size_t first; + + for(first = 0; first<_timeline.segments.size(); ++first) { + if(_timeline.segments[first].size() > 0) { + break; + } + } + + // not timeline data to dump + if(first == _timeline.segments.size()) { + os << "{}\n"; + return; + } + + os << "{\"executor\":\"" << _timeline.uid << "\",\"data\":["; 
+ + bool comma = false; + + for(size_t w=first; w<_timeline.segments.size(); w++) { + for(size_t l=0; l<_timeline.segments[w].size(); l++) { + + if(_timeline.segments[w][l].empty()) { + continue; + } + + if(comma) { + os << ','; + } + else { + comma = true; + } + + os << "{\"worker\":" << w << ",\"level\":" << l << ",\"data\":["; + for(size_t i=0; i<_timeline.segments[w][l].size(); ++i) { + + const auto& s = _timeline.segments[w][l][i]; + + if(i) os << ','; + + // span + os << "{\"span\":[" + << duration_cast<microseconds>(s.beg - _timeline.origin).count() + << "," + << duration_cast<microseconds>(s.end - _timeline.origin).count() + << "],"; + + // name + os << "\"name\":\""; + if(s.name.empty()) { + os << w << '_' << i; + } + else { + os << s.name; + } + os << "\","; + + // e.g., category "type": "Condition Task" + os << "\"type\":\"" << to_string(s.type) << "\""; + + os << "}"; + } + os << "]}"; + } + } + + os << "]}\n"; +} + +// Function: dump +inline std::string TFProfObserver::dump() const { + std::ostringstream oss; + dump(oss); + return oss.str(); +} + +// Procedure: summary +inline void TFProfObserver::summary(std::ostream& os) const { + + using namespace std::chrono; + + Summary summary; + std::optional<observer_stamp_t> view_beg, view_end; + + // find the first non-empty worker + size_t first; + for(first = 0; first<_timeline.segments.size(); ++first) { + if(_timeline.segments[first].size() > 0) { + break; + } + } + + // not timeline data to dump + if(first == _timeline.segments.size()) { + goto end_of_summary; + } + + for(size_t w=first; w<_timeline.segments.size(); w++) { + for(size_t l=0; l<_timeline.segments[w].size(); l++) { + + if(_timeline.segments[w][l].empty()) { + continue; + } + + // worker w at level l + WorkerSummary ws; + ws.id = w; + ws.level = l; + ws.count = _timeline.segments[w][l].size(); + + // scan all tasks at level l + for(size_t i=0; i<_timeline.segments[w][l].size(); ++i) { + + // update the entire span + auto& s = _timeline.segments[w][l][i]; + view_beg = view_beg ? std::min(*view_beg, s.beg) : s.beg; + view_end = view_end ? std::max(*view_end, s.end) : s.end; + + // update the task summary + size_t t = duration_cast<microseconds>(s.end - s.beg).count(); + + auto& x = summary.tsum[static_cast<int>(s.type)]; + x.count += 1; + x.total_span += t; + x.min_span = (x.count == 1) ? t : std::min(t, x.min_span); + x.max_span = (x.count == 1) ? t : std::max(t, x.max_span); + + // update the worker summary + ws.total_span += t; + ws.min_span = (i == 0) ? t : std::min(t, ws.min_span); + ws.max_span = (i == 0) ? t : std::max(t, ws.max_span); + + auto&y = ws.tsum[static_cast<int>(s.type)]; + y.count += 1; + y.total_span += t; + y.min_span = (y.count == 1) ? t : std::min(t, y.min_span); + y.max_span = (y.count == 1) ? t : std::max(t, y.max_span); + + // update the delay + //if(i) { + // size_t d = duration_cast<nanoseconds>( + // s.beg - _timeline.segments[w][l][i-1].end + // ).count(); + // ws.total_delay += d; + // ws.min_delay = (i == 1) ? d : std::min(ws.min_delay, d); + // ws.max_delay = (i == 1) ? 
d : std::max(ws.max_delay, d); + //} + } + summary.wsum.push_back(ws); + } + } + + end_of_summary: + + size_t view = 0; + if(view_beg && view_end) { + view = duration_cast<microseconds>(*view_end - *view_beg).count(); + } + + os << "==Observer " << _timeline.uid << ": " + << num_workers() << " workers completed " + << num_tasks() << " tasks in " + << view << " us\n"; + + summary.dump(os); +} + +// Procedure: summary +inline std::string TFProfObserver::summary() const { + std::ostringstream oss; + summary(oss); + return oss.str(); +} + +// Function: num_tasks +inline size_t TFProfObserver::num_tasks() const { + size_t s = 0; + for(size_t w=0; w<_timeline.segments.size(); ++w) { + for(size_t l=0; l<_timeline.segments[w].size(); ++l) { + s += _timeline.segments[w][l].size(); + } + } + return s; +} + +// Function: num_workers +inline size_t TFProfObserver::num_workers() const { + size_t w = 0; + for(size_t i=0; i<_timeline.segments.size(); ++i) { + w += (!_timeline.segments[i].empty()); + } + return w; +} + + +// ---------------------------------------------------------------------------- +// TFProfManager +// ---------------------------------------------------------------------------- + +/** +@private +*/ +class TFProfManager { + + friend class Executor; + + public: + + ~TFProfManager(); + + TFProfManager(const TFProfManager&) = delete; + TFProfManager& operator=(const TFProfManager&) = delete; + + static TFProfManager& get(); + + void dump(std::ostream& ostream) const; + + private: + + const std::string _fpath; + + std::mutex _mutex; + std::vector<std::shared_ptr<TFProfObserver>> _observers; + + TFProfManager(); + + void _manage(std::shared_ptr<TFProfObserver> observer); +}; + +// constructor +inline TFProfManager::TFProfManager() : + _fpath {get_env(TF_ENABLE_PROFILER)} { + +} + +// Procedure: manage +inline void TFProfManager::_manage(std::shared_ptr<TFProfObserver> observer) { + std::lock_guard lock(_mutex); + _observers.push_back(std::move(observer)); +} + +// Procedure: dump +inline void TFProfManager::dump(std::ostream& os) const { + for(size_t i=0; i<_observers.size(); ++i) { + if(i) os << ','; + _observers[i]->dump(os); + } +} + +// Destructor +inline TFProfManager::~TFProfManager() { + std::ofstream ofs(_fpath); + if(ofs) { + // .tfp + if(_fpath.rfind(".tfp") != std::string::npos) { + ProfileData data; + data.timelines.reserve(_observers.size()); + for(size_t i=0; i<_observers.size(); ++i) { + data.timelines.push_back(std::move(_observers[i]->_timeline)); + } + Serializer<std::ofstream> serializer(ofs); + serializer(data); + } + // .json + else { // if(_fpath.rfind(".json") != std::string::npos) { + ofs << "[\n"; + for(size_t i=0; i<_observers.size(); ++i) { + if(i) ofs << ','; + _observers[i]->dump(ofs); + } + ofs << "]\n"; + } + } + // do a summary report in stderr for each observer + else { + std::ostringstream oss; + for(size_t i=0; i<_observers.size(); ++i) { + _observers[i]->summary(oss); + } + fprintf(stderr, "%s", oss.str().c_str()); + } +} + +// Function: get +inline TFProfManager& TFProfManager::get() { + static TFProfManager mgr; + return mgr; +} + +// ---------------------------------------------------------------------------- +// Identifier for Each Built-in Observer +// ---------------------------------------------------------------------------- + +/** @enum ObserverType + +@brief enumeration of all observer types + +*/ +enum class ObserverType : int { + TFPROF = 0, + CHROME, + UNDEFINED +}; + +/** +@brief convert an observer type to a human-readable string +*/ +inline 
const char* to_string(ObserverType type) { + switch(type) { + case ObserverType::TFPROF: return "tfprof"; + case ObserverType::CHROME: return "chrome"; + default: return "undefined"; + } +} + + +} // end of namespace tf ----------------------------------------------------- + + diff --git a/myxpcs/include/taskflow_/core/semaphore.hpp b/myxpcs/include/taskflow_/core/semaphore.hpp new file mode 100644 index 0000000..12d6069 --- /dev/null +++ b/myxpcs/include/taskflow_/core/semaphore.hpp @@ -0,0 +1,132 @@ +#pragma once + +#include <vector> +#include <mutex> + +#include "declarations.hpp" + +/** +@file semaphore.hpp +@brief semaphore include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// Semaphore +// ---------------------------------------------------------------------------- + +/** +@class Semaphore + +@brief class to create a semophore object for building a concurrency constraint + +A semaphore creates a constraint that limits the maximum concurrency, +i.e., the number of workers, in a set of tasks. +You can let a task acquire/release one or multiple semaphores before/after +executing its work. +A task can acquire and release a semaphore, +or just acquire or just release it. +A tf::Semaphore object starts with an initial count. +As long as that count is above 0, tasks can acquire the semaphore and do +their work. +If the count is 0 or less, a task trying to acquire the semaphore will not run +but goes to a waiting list of that semaphore. +When the semaphore is released by another task, +it reschedules all tasks on that waiting list. + +@code{.cpp} +tf::Executor executor(8); // create an executor of 8 workers +tf::Taskflow taskflow; + +tf::Semaphore semaphore(1); // create a semaphore with initial count 1 + +std::vector<tf::Task> tasks { + taskflow.emplace([](){ std::cout << "A" << std::endl; }), + taskflow.emplace([](){ std::cout << "B" << std::endl; }), + taskflow.emplace([](){ std::cout << "C" << std::endl; }), + taskflow.emplace([](){ std::cout << "D" << std::endl; }), + taskflow.emplace([](){ std::cout << "E" << std::endl; }) +}; + +for(auto & task : tasks) { // each task acquires and release the semaphore + task.acquire(semaphore); + task.release(semaphore); +} + +executor.run(taskflow).wait(); +@endcode + +The above example creates five tasks with no dependencies between them. +Under normal circumstances, the five tasks would be executed concurrently. +However, this example has a semaphore with initial count 1, +and all tasks need to acquire that semaphore before running and release that +semaphore after they are done. +This arrangement limits the number of concurrently running tasks to only one. + +*/ +class Semaphore { + + friend class Node; + + public: + + /** + @brief constructs a semaphore with the given counter + + A semaphore creates a constraint that limits the maximum concurrency, + i.e., the number of workers, in a set of tasks. 
+ + @code{.cpp} + tf::Semaphore semaphore(4); // concurrency constraint of 4 workers + @endcode + */ + explicit Semaphore(size_t max_workers); + + /** + @brief queries the counter value (not thread-safe during the run) + */ + size_t count() const; + + private: + + std::mutex _mtx; + + size_t _counter; + + std::vector<Node*> _waiters; + + bool _try_acquire_or_wait(Node*); + + std::vector<Node*> _release(); +}; + +inline Semaphore::Semaphore(size_t max_workers) : + _counter(max_workers) { +} + +inline bool Semaphore::_try_acquire_or_wait(Node* me) { + std::lock_guard<std::mutex> lock(_mtx); + if(_counter > 0) { + --_counter; + return true; + } + else { + _waiters.push_back(me); + return false; + } +} + +inline std::vector<Node*> Semaphore::_release() { + std::lock_guard<std::mutex> lock(_mtx); + ++_counter; + std::vector<Node*> r{std::move(_waiters)}; + return r; +} + +inline size_t Semaphore::count() const { + return _counter; +} + +} // end of namespace tf. --------------------------------------------------- + diff --git a/myxpcs/include/taskflow_/core/task.hpp b/myxpcs/include/taskflow_/core/task.hpp new file mode 100644 index 0000000..f69d9a6 --- /dev/null +++ b/myxpcs/include/taskflow_/core/task.hpp @@ -0,0 +1,776 @@ +#pragma once + +#include "graph.hpp" + +/** +@file task.hpp +@brief task include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// Task Types +// ---------------------------------------------------------------------------- + +/** +@enum TaskType + +@brief enumeration of all task types +*/ +enum class TaskType : int { + /** @brief placeholder task type */ + PLACEHOLDER = 0, + /** @brief static task type */ + STATIC, + /** @brief dynamic (subflow) task type */ + DYNAMIC, + /** @brief condition task type */ + CONDITION, + /** @brief module task type */ + MODULE, + /** @brief asynchronous task type */ + ASYNC, + /** @brief undefined task type (for internal use only) */ + UNDEFINED +}; + +/** +@private +@brief array of all task types (used for iterating task types) +*/ +inline constexpr std::array<TaskType, 6> TASK_TYPES = { + TaskType::PLACEHOLDER, + TaskType::STATIC, + TaskType::DYNAMIC, + TaskType::CONDITION, + TaskType::MODULE, + TaskType::ASYNC, +}; + +/** +@brief convert a task type to a human-readable string + +The name of each task type is the litte-case string of its characters. + +@code{.cpp} +TaskType::PLACEHOLDER -> "placeholder" +TaskType::STATIC -> "static" +TaskType::DYNAMIC -> "subflow" +TaskType::CONDITION -> "condition" +TaskType::MODULE -> "module" +TaskType::ASYNC -> "async" +@endcode +*/ +inline const char* to_string(TaskType type) { + + const char* val; + + switch(type) { + case TaskType::PLACEHOLDER: val = "placeholder"; break; + case TaskType::STATIC: val = "static"; break; + case TaskType::DYNAMIC: val = "subflow"; break; + case TaskType::CONDITION: val = "condition"; break; + case TaskType::MODULE: val = "module"; break; + case TaskType::ASYNC: val = "async"; break; + default: val = "undefined"; break; + } + + return val; +} + +// ---------------------------------------------------------------------------- +// Task Traits +// ---------------------------------------------------------------------------- + +/** +@brief determines if a callable is a dynamic task + +A dynamic task is a callable object constructible from std::function<void(Subflow&)>. 
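+
+As an illustrative sketch (the taskflow, the lambda, and the printed strings
+below are assumptions made for this example, not part of this header), a
+callable taking a tf::Subflow& is treated as a dynamic task and may spawn
+child tasks at runtime:
+
+@code{.cpp}
+tf::Taskflow taskflow;
+taskflow.emplace([](tf::Subflow& sf){            // dynamic (subflow) task
+  tf::Task a = sf.emplace([](){ std::cout << "child a\n"; });
+  tf::Task b = sf.emplace([](){ std::cout << "child b\n"; });
+  a.precede(b);                                  // a runs before b in the subflow
+});
+@endcode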
+*/ +template <typename C> +constexpr bool is_dynamic_task_v = + std::is_invocable_r_v<void, C, Subflow&> && + !std::is_invocable_r_v<void, C, Runtime&>; + +/** +@brief determines if a callable is a condition task + +A condition task is a callable object constructible from std::function<int()> +or std::function<int(tf::Runtime&)>. +*/ +template <typename C> +constexpr bool is_condition_task_v = + (std::is_invocable_r_v<int, C> || std::is_invocable_r_v<int, C, Runtime&>) && + !is_dynamic_task_v<C>; + +/** +@brief determines if a callable is a multi-condition task + +A multi-condition task is a callable object constructible from +std::function<tf::SmallVector<int>()> or +std::function<tf::SmallVector<int>(tf::Runtime&)>. +*/ +template <typename C> +constexpr bool is_multi_condition_task_v = + (std::is_invocable_r_v<SmallVector<int>, C> || + std::is_invocable_r_v<SmallVector<int>, C, Runtime&>) && + !is_dynamic_task_v<C>; + +/** +@brief determines if a callable is a static task + +A static task is a callable object constructible from std::function<void()> +or std::function<void(tf::Runtime&)>. +*/ +template <typename C> +constexpr bool is_static_task_v = + (std::is_invocable_r_v<void, C> || std::is_invocable_r_v<void, C, Runtime&>) && + !is_condition_task_v<C> && + !is_multi_condition_task_v<C> && + !is_dynamic_task_v<C>; + +// ---------------------------------------------------------------------------- +// Task +// ---------------------------------------------------------------------------- + +/** +@class Task + +@brief class to create a task handle over a node in a taskflow graph + +A task is a wrapper over a node in a taskflow graph. +It provides a set of methods for users to access and modify the attributes of +the associated node in the taskflow graph. +A task is very lightweight object (i.e., only storing a node pointer) that +can be trivially copied around, +and it does not own the lifetime of the associated node. 
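+
+A minimal sketch of typical handle usage (the task names and callables below
+are assumptions made for this example):
+
+@code{.cpp}
+tf::Taskflow taskflow;
+tf::Task A = taskflow.emplace([](){ std::cout << "A\n"; }).name("A");
+tf::Task B = taskflow.emplace([](){ std::cout << "B\n"; }).name("B");
+A.precede(B);      // add a dependency: A runs before B
+tf::Task C = A;    // copying a handle only copies the underlying node pointer
+assert(C == A);    // both handles refer to the same graph node
+@endcode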
+*/ +class Task { + + friend class FlowBuilder; + friend class Runtime; + friend class Taskflow; + friend class TaskView; + friend class Executor; + + public: + + /** + @brief constructs an empty task + */ + Task() = default; + + /** + @brief constructs the task with the copy of the other task + */ + Task(const Task& other); + + /** + @brief replaces the contents with a copy of the other task + */ + Task& operator = (const Task&); + + /** + @brief replaces the contents with a null pointer + */ + Task& operator = (std::nullptr_t); + + /** + @brief compares if two tasks are associated with the same graph node + */ + bool operator == (const Task& rhs) const; + + /** + @brief compares if two tasks are not associated with the same graph node + */ + bool operator != (const Task& rhs) const; + + /** + @brief queries the name of the task + */ + const std::string& name() const; + + /** + @brief queries the number of successors of the task + */ + size_t num_successors() const; + + /** + @brief queries the number of predecessors of the task + */ + size_t num_dependents() const; + + /** + @brief queries the number of strong dependents of the task + */ + size_t num_strong_dependents() const; + + /** + @brief queries the number of weak dependents of the task + */ + size_t num_weak_dependents() const; + + /** + @brief assigns a name to the task + + @param name a @std_string acceptable string + + @return @c *this + */ + Task& name(const std::string& name); + + /** + @brief assigns a callable + + @tparam C callable type + + @param callable callable to construct a task + + @return @c *this + */ + template <typename C> + Task& work(C&& callable); + + /** + @brief creates a module task from a taskflow + + @tparam T object type + @param object a custom object that defines @c T::graph() method + + @return @c *this + */ + template <typename T> + Task& composed_of(T& object); + + /** + @brief adds precedence links from this to other tasks + + @tparam Ts parameter pack + + @param tasks one or multiple tasks + + @return @c *this + */ + template <typename... Ts> + Task& precede(Ts&&... tasks); + + /** + @brief adds precedence links from other tasks to this + + @tparam Ts parameter pack + + @param tasks one or multiple tasks + + @return @c *this + */ + template <typename... Ts> + Task& succeed(Ts&&... tasks); + + /** + @brief makes the task release this semaphore + */ + Task& release(Semaphore& semaphore); + + /** + @brief makes the task acquire this semaphore + */ + Task& acquire(Semaphore& semaphore); + + /** + @brief assigns pointer to user data + + @param data pointer to user data + + The following example shows how to attach user data to a task and + run the task iteratively while changing the data value: + + @code{.cpp} + tf::Executor executor; + tf::Taskflow taskflow("attach data to a task"); + + int data; + + // create a task and attach it the data + auto A = taskflow.placeholder(); + A.data(&data).work([A](){ + auto d = *static_cast<int*>(A.data()); + std::cout << "data is " << d << std::endl; + }); + + // run the taskflow iteratively with changing data + for(data = 0; data<10; data++){ + executor.run(taskflow).wait(); + } + @endcode + + @return @c *this + */ + Task& data(void* data); + + /** + @brief assigns a priority value to the task + + A priority value can be one of the following three levels, + tf::TaskPriority::HIGH (numerically equivalent to 0), + tf::TaskPriority::NORMAL (numerically equivalent to 1), and + tf::TaskPriority::LOW (numerically equivalent to 2). 
+ The smaller the priority value, the higher the priority. + */ + Task& priority(TaskPriority p); + + /** + @brief queries the priority value of the task + */ + TaskPriority priority() const; + + /** + @brief resets the task handle to null + */ + void reset(); + + /** + @brief resets the associated work to a placeholder + */ + void reset_work(); + + /** + @brief queries if the task handle points to a task node + */ + bool empty() const; + + /** + @brief queries if the task has a work assigned + */ + bool has_work() const; + + /** + @brief applies an visitor callable to each successor of the task + */ + template <typename V> + void for_each_successor(V&& visitor) const; + + /** + @brief applies an visitor callable to each dependents of the task + */ + template <typename V> + void for_each_dependent(V&& visitor) const; + + /** + @brief obtains a hash value of the underlying node + */ + size_t hash_value() const; + + /** + @brief returns the task type + */ + TaskType type() const; + + /** + @brief dumps the task through an output stream + */ + void dump(std::ostream& ostream) const; + + /** + @brief queries pointer to user data + */ + void* data() const; + + + private: + + Task(Node*); + + Node* _node {nullptr}; +}; + +// Constructor +inline Task::Task(Node* node) : _node {node} { +} + +// Constructor +inline Task::Task(const Task& rhs) : _node {rhs._node} { +} + +// Function: precede +template <typename... Ts> +Task& Task::precede(Ts&&... tasks) { + (_node->_precede(tasks._node), ...); + //_precede(std::forward<Ts>(tasks)...); + return *this; +} + +// Function: succeed +template <typename... Ts> +Task& Task::succeed(Ts&&... tasks) { + (tasks._node->_precede(_node), ...); + //_succeed(std::forward<Ts>(tasks)...); + return *this; +} + +// Function: composed_of +template <typename T> +Task& Task::composed_of(T& object) { + _node->_handle.emplace<Node::Module>(object); + return *this; +} + +// Operator = +inline Task& Task::operator = (const Task& rhs) { + _node = rhs._node; + return *this; +} + +// Operator = +inline Task& Task::operator = (std::nullptr_t ptr) { + _node = ptr; + return *this; +} + +// Operator == +inline bool Task::operator == (const Task& rhs) const { + return _node == rhs._node; +} + +// Operator != +inline bool Task::operator != (const Task& rhs) const { + return _node != rhs._node; +} + +// Function: name +inline Task& Task::name(const std::string& name) { + _node->_name = name; + return *this; +} + +// Function: acquire +inline Task& Task::acquire(Semaphore& s) { + if(!_node->_semaphores) { + _node->_semaphores = std::make_unique<Node::Semaphores>(); + } + _node->_semaphores->to_acquire.push_back(&s); + return *this; +} + +// Function: release +inline Task& Task::release(Semaphore& s) { + if(!_node->_semaphores) { + //_node->_semaphores.emplace(); + _node->_semaphores = std::make_unique<Node::Semaphores>(); + } + _node->_semaphores->to_release.push_back(&s); + return *this; +} + +// Procedure: reset +inline void Task::reset() { + _node = nullptr; +} + +// Procedure: reset_work +inline void Task::reset_work() { + _node->_handle.emplace<std::monostate>(); +} + +// Function: name +inline const std::string& Task::name() const { + return _node->_name; +} + +// Function: num_dependents +inline size_t Task::num_dependents() const { + return _node->num_dependents(); +} + +// Function: num_strong_dependents +inline size_t Task::num_strong_dependents() const { + return _node->num_strong_dependents(); +} + +// Function: num_weak_dependents +inline size_t Task::num_weak_dependents() 
const { + return _node->num_weak_dependents(); +} + +// Function: num_successors +inline size_t Task::num_successors() const { + return _node->num_successors(); +} + +// Function: empty +inline bool Task::empty() const { + return _node == nullptr; +} + +// Function: has_work +inline bool Task::has_work() const { + return _node ? _node->_handle.index() != 0 : false; +} + +// Function: task_type +inline TaskType Task::type() const { + switch(_node->_handle.index()) { + case Node::PLACEHOLDER: return TaskType::PLACEHOLDER; + case Node::STATIC: return TaskType::STATIC; + case Node::DYNAMIC: return TaskType::DYNAMIC; + case Node::CONDITION: return TaskType::CONDITION; + case Node::MULTI_CONDITION: return TaskType::CONDITION; + case Node::MODULE: return TaskType::MODULE; + case Node::ASYNC: return TaskType::ASYNC; + case Node::DEPENDENT_ASYNC: return TaskType::ASYNC; + default: return TaskType::UNDEFINED; + } +} + +// Function: for_each_successor +template <typename V> +void Task::for_each_successor(V&& visitor) const { + for(size_t i=0; i<_node->_successors.size(); ++i) { + visitor(Task(_node->_successors[i])); + } +} + +// Function: for_each_dependent +template <typename V> +void Task::for_each_dependent(V&& visitor) const { + for(size_t i=0; i<_node->_dependents.size(); ++i) { + visitor(Task(_node->_dependents[i])); + } +} + +// Function: hash_value +inline size_t Task::hash_value() const { + return std::hash<Node*>{}(_node); +} + +// Procedure: dump +inline void Task::dump(std::ostream& os) const { + os << "task "; + if(name().empty()) os << _node; + else os << name(); + os << " [type=" << to_string(type()) << ']'; +} + +// Function: work +template <typename C> +Task& Task::work(C&& c) { + + if constexpr(is_static_task_v<C>) { + _node->_handle.emplace<Node::Static>(std::forward<C>(c)); + } + else if constexpr(is_dynamic_task_v<C>) { + _node->_handle.emplace<Node::Dynamic>(std::forward<C>(c)); + } + else if constexpr(is_condition_task_v<C>) { + _node->_handle.emplace<Node::Condition>(std::forward<C>(c)); + } + else if constexpr(is_multi_condition_task_v<C>) { + _node->_handle.emplace<Node::MultiCondition>(std::forward<C>(c)); + } + else { + static_assert(dependent_false_v<C>, "invalid task callable"); + } + return *this; +} + +// Function: data +inline void* Task::data() const { + return _node->_data; +} + +// Function: data +inline Task& Task::data(void* data) { + _node->_data = data; + return *this; +} + +// Function: priority +inline Task& Task::priority(TaskPriority p) { + _node->_priority = static_cast<unsigned>(p); + return *this; +} + +// Function: priority +inline TaskPriority Task::priority() const { + return static_cast<TaskPriority>(_node->_priority); +} + +// ---------------------------------------------------------------------------- +// global ostream +// ---------------------------------------------------------------------------- + +/** +@brief overload of ostream inserter operator for Task +*/ +inline std::ostream& operator << (std::ostream& os, const Task& task) { + task.dump(os); + return os; +} + +// ---------------------------------------------------------------------------- +// Task View +// ---------------------------------------------------------------------------- + +/** +@class TaskView + +@brief class to access task information from the observer interface +*/ +class TaskView { + + friend class Executor; + + public: + + /** + @brief queries the name of the task + */ + const std::string& name() const; + + /** + @brief queries the number of successors of the task + */ + 
size_t num_successors() const; + + /** + @brief queries the number of predecessors of the task + */ + size_t num_dependents() const; + + /** + @brief queries the number of strong dependents of the task + */ + size_t num_strong_dependents() const; + + /** + @brief queries the number of weak dependents of the task + */ + size_t num_weak_dependents() const; + + /** + @brief applies an visitor callable to each successor of the task + */ + template <typename V> + void for_each_successor(V&& visitor) const; + + /** + @brief applies an visitor callable to each dependents of the task + */ + template <typename V> + void for_each_dependent(V&& visitor) const; + + /** + @brief queries the task type + */ + TaskType type() const; + + /** + @brief obtains a hash value of the underlying node + */ + size_t hash_value() const; + + private: + + TaskView(const Node&); + TaskView(const TaskView&) = default; + + const Node& _node; +}; + +// Constructor +inline TaskView::TaskView(const Node& node) : _node {node} { +} + +// Function: name +inline const std::string& TaskView::name() const { + return _node._name; +} + +// Function: num_dependents +inline size_t TaskView::num_dependents() const { + return _node.num_dependents(); +} + +// Function: num_strong_dependents +inline size_t TaskView::num_strong_dependents() const { + return _node.num_strong_dependents(); +} + +// Function: num_weak_dependents +inline size_t TaskView::num_weak_dependents() const { + return _node.num_weak_dependents(); +} + +// Function: num_successors +inline size_t TaskView::num_successors() const { + return _node.num_successors(); +} + +// Function: type +inline TaskType TaskView::type() const { + switch(_node._handle.index()) { + case Node::PLACEHOLDER: return TaskType::PLACEHOLDER; + case Node::STATIC: return TaskType::STATIC; + case Node::DYNAMIC: return TaskType::DYNAMIC; + case Node::CONDITION: return TaskType::CONDITION; + case Node::MULTI_CONDITION: return TaskType::CONDITION; + case Node::MODULE: return TaskType::MODULE; + case Node::ASYNC: return TaskType::ASYNC; + case Node::DEPENDENT_ASYNC: return TaskType::ASYNC; + default: return TaskType::UNDEFINED; + } +} + +// Function: hash_value +inline size_t TaskView::hash_value() const { + return std::hash<const Node*>{}(&_node); +} + +// Function: for_each_successor +template <typename V> +void TaskView::for_each_successor(V&& visitor) const { + for(size_t i=0; i<_node._successors.size(); ++i) { + visitor(TaskView(*_node._successors[i])); + } +} + +// Function: for_each_dependent +template <typename V> +void TaskView::for_each_dependent(V&& visitor) const { + for(size_t i=0; i<_node._dependents.size(); ++i) { + visitor(TaskView(*_node._dependents[i])); + } +} + +} // end of namespace tf. 
--------------------------------------------------- + +namespace std { + +/** +@struct hash + +@brief hash specialization for std::hash<tf::Task> +*/ +template <> +struct hash<tf::Task> { + auto operator() (const tf::Task& task) const noexcept { + return task.hash_value(); + } +}; + +/** +@struct hash + +@brief hash specialization for std::hash<tf::TaskView> +*/ +template <> +struct hash<tf::TaskView> { + auto operator() (const tf::TaskView& task_view) const noexcept { + return task_view.hash_value(); + } +}; + +} // end of namespace std ---------------------------------------------------- + + + diff --git a/myxpcs/include/taskflow_/core/taskflow.hpp b/myxpcs/include/taskflow_/core/taskflow.hpp new file mode 100644 index 0000000..b34381d --- /dev/null +++ b/myxpcs/include/taskflow_/core/taskflow.hpp @@ -0,0 +1,643 @@ +#pragma once + +#include "flow_builder.hpp" + +/** +@file taskflow/core/taskflow.hpp +@brief taskflow include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- + +/** +@class Taskflow + +@brief class to create a taskflow object + +A %taskflow manages a task dependency graph where each task represents a +callable object (e.g., @std_lambda, @std_function) and an edge represents a +dependency between two tasks. A task is one of the following types: + + 1. static task : the callable constructible from + @c std::function<void()> + 2. dynamic task : the callable constructible from + @c std::function<void(tf::Subflow&)> + 3. condition task : the callable constructible from + @c std::function<int()> + 4. multi-condition task: the callable constructible from + @c %std::function<tf::SmallVector<int>()> + 5. module task : the task constructed from tf::Taskflow::composed_of + @c std::function<void(tf::Runtime&)> + +Each task is a basic computation unit and is run by one worker thread +from an executor. +The following example creates a simple taskflow graph of four static tasks, +@c A, @c B, @c C, and @c D, where +@c A runs before @c B and @c C and +@c D runs after @c B and @c C. + +@code{.cpp} +tf::Executor executor; +tf::Taskflow taskflow("simple"); + +tf::Task A = taskflow.emplace([](){ std::cout << "TaskA\n"; }); +tf::Task B = taskflow.emplace([](){ std::cout << "TaskB\n"; }); +tf::Task C = taskflow.emplace([](){ std::cout << "TaskC\n"; }); +tf::Task D = taskflow.emplace([](){ std::cout << "TaskD\n"; }); + +A.precede(B, C); // A runs before B and C +D.succeed(B, C); // D runs after B and C + +executor.run(taskflow).wait(); +@endcode + +The taskflow object itself is NOT thread-safe. You should not +modifying the graph while it is running, +such as adding new tasks, adding new dependencies, and moving +the taskflow to another. +To minimize the overhead of task creation, +our runtime leverages a global object pool to recycle +tasks in a thread-safe manner. + +Please refer to @ref Cookbook to learn more about each task type +and how to submit a taskflow to an executor. 
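+
+As a further illustration of the condition task listed above, a condition task
+returns the index of the successor to execute next (the tasks, names, and
+printed strings below are assumptions made for this sketch):
+
+@code{.cpp}
+tf::Executor executor;
+tf::Taskflow taskflow("conditional");
+
+tf::Task init = taskflow.emplace([](){ std::cout << "init\n"; });
+tf::Task cond = taskflow.emplace([](){ return std::rand() % 2; });  // condition task
+tf::Task yes  = taskflow.emplace([](){ std::cout << "branch 0\n"; });
+tf::Task no   = taskflow.emplace([](){ std::cout << "branch 1\n"; });
+
+init.precede(cond);
+cond.precede(yes, no);   // returning 0 runs yes; returning 1 runs no
+
+executor.run(taskflow).wait();
+@endcode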
+*/ +class Taskflow : public FlowBuilder { + + friend class Topology; + friend class Executor; + friend class FlowBuilder; + + struct Dumper { + size_t id; + std::stack<std::pair<const Node*, const Graph*>> stack; + std::unordered_map<const Graph*, size_t> visited; + }; + + public: + + /** + @brief constructs a taskflow with the given name + + @code{.cpp} + tf::Taskflow taskflow("My Taskflow"); + std::cout << taskflow.name(); // "My Taskflow" + @endcode + */ + Taskflow(const std::string& name); + + /** + @brief constructs a taskflow + */ + Taskflow(); + + /** + @brief constructs a taskflow from a moved taskflow + + Constructing a taskflow @c taskflow1 from a moved taskflow @c taskflow2 will + migrate the graph of @c taskflow2 to @c taskflow1. + After the move, @c taskflow2 will become empty. + + @code{.cpp} + tf::Taskflow taskflow1(std::move(taskflow2)); + assert(taskflow2.empty()); + @endcode + + Notice that @c taskflow2 should not be running in an executor + during the move operation, or the behavior is undefined. + */ + Taskflow(Taskflow&& rhs); + + /** + @brief move assignment operator + + Moving a taskflow @c taskflow2 to another taskflow @c taskflow1 will destroy + the existing graph of @c taskflow1 and assign it the graph of @c taskflow2. + After the move, @c taskflow2 will become empty. + + @code{.cpp} + taskflow1 = std::move(taskflow2); + assert(taskflow2.empty()); + @endcode + + Notice that both @c taskflow1 and @c taskflow2 should not be running + in an executor during the move operation, or the behavior is undefined. + */ + Taskflow& operator = (Taskflow&& rhs); + + /** + @brief default destructor + + When the destructor is called, all tasks and their associated data + (e.g., captured data) will be destroyed. + It is your responsibility to ensure all submitted execution of this + taskflow have completed before destroying it. + For instance, the following code results in undefined behavior + since the executor may still be running the taskflow while + it is destroyed after the block. + + @code{.cpp} + { + tf::Taskflow taskflow; + executor.run(taskflow); + } + @endcode + + To fix the problem, we must wait for the execution to complete + before destroying the taskflow. + + @code{.cpp} + { + tf::Taskflow taskflow; + executor.run(taskflow).wait(); + } + @endcode + */ + ~Taskflow() = default; + + /** + @brief dumps the taskflow to a DOT format through a std::ostream target + + @code{.cpp} + taskflow.dump(std::cout); // dump the graph to the standard output + + std::ofstream ofs("output.dot"); + taskflow.dump(ofs); // dump the graph to the file output.dot + @endcode + + For dynamically spawned tasks, such as module tasks, subflow tasks, + and GPU tasks, you need to run the taskflow first before you can + dump the entire graph. + + @code{.cpp} + tf::Task parent = taskflow.emplace([](tf::Subflow sf){ + sf.emplace([](){ std::cout << "child\n"; }); + }); + taskflow.dump(std::cout); // this dumps only the parent tasks + executor.run(taskflow).wait(); + taskflow.dump(std::cout); // this dumps both parent and child tasks + @endcode + */ + void dump(std::ostream& ostream) const; + + /** + @brief dumps the taskflow to a std::string of DOT format + + This method is similar to tf::Taskflow::dump(std::ostream& ostream), + but returning a string of the graph in DOT format. + */ + std::string dump() const; + + /** + @brief queries the number of tasks + */ + size_t num_tasks() const; + + /** + @brief queries the emptiness of the taskflow + + An empty taskflow has no tasks. 
That is the return of + tf::Taskflow::num_tasks is zero. + */ + bool empty() const; + + /** + @brief assigns a name to the taskflow + + @code{.cpp} + taskflow.name("assign another name"); + @endcode + */ + void name(const std::string&); + + /** + @brief queries the name of the taskflow + + @code{.cpp} + std::cout << "my name is: " << taskflow.name(); + @endcode + */ + const std::string& name() const; + + /** + @brief clears the associated task dependency graph + + When you clear a taskflow, all tasks and their associated data + (e.g., captured data in task callables) will be destroyed. + The behavior of clearing a running taskflow is undefined. + */ + void clear(); + + /** + @brief applies a visitor to each task in the taskflow + + A visitor is a callable that takes an argument of type tf::Task + and returns nothing. The following example iterates each task in a + taskflow and prints its name: + + @code{.cpp} + taskflow.for_each_task([](tf::Task task){ + std::cout << task.name() << '\n'; + }); + @endcode + */ + template <typename V> + void for_each_task(V&& visitor) const; + + /** + @brief removes dependencies that go from task @c from to task @c to + + @param from from task (dependent) + @param to to task (successor) + + @code{.cpp} + tf::Taskflow taskflow; + auto a = taskflow.placeholder().name("a"); + auto b = taskflow.placeholder().name("b"); + auto c = taskflow.placeholder().name("c"); + auto d = taskflow.placeholder().name("d"); + + a.precede(b, c, d); + assert(a.num_successors() == 3); + assert(b.num_dependents() == 1); + assert(c.num_dependents() == 1); + assert(d.num_dependents() == 1); + + taskflow.remove_dependency(a, b); + assert(a.num_successors() == 2); + assert(b.num_dependents() == 0); + @endcode + */ + inline void remove_dependency(Task from, Task to); + + /** + @brief returns a reference to the underlying graph object + + A graph object (of type tf::Graph) is the ultimate storage for the + task dependency graph and should only be used as an opaque + data structure to interact with the executor (e.g., composition). 
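+
+For example, composition accesses a composed taskflow through this graph object
+(a sketch; the taskflows, names, and printed strings are assumptions made for
+this example):
+
+@code{.cpp}
+tf::Taskflow f1, f2;
+f1.emplace([](){ std::cout << "task of f1\n"; });
+
+// f2 keeps a module task whose work is the graph owned by f1
+tf::Task module = f2.composed_of(f1).name("module_of_f1");
+tf::Task after  = f2.emplace([](){ std::cout << "after f1\n"; }).name("after");
+module.precede(after);
+@endcode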
+ */ + Graph& graph(); + + private: + + mutable std::mutex _mutex; + + std::string _name; + + Graph _graph; + + std::queue<std::shared_ptr<Topology>> _topologies; + std::optional<std::list<Taskflow>::iterator> _satellite; + + void _dump(std::ostream&, const Graph*) const; + void _dump(std::ostream&, const Node*, Dumper&) const; + void _dump(std::ostream&, const Graph*, Dumper&) const; +}; + +// Constructor +inline Taskflow::Taskflow(const std::string& name) : + FlowBuilder {_graph}, + _name {name} { +} + +// Constructor +inline Taskflow::Taskflow() : FlowBuilder{_graph} { +} + +// Move constructor +inline Taskflow::Taskflow(Taskflow&& rhs) : FlowBuilder{_graph} { + + std::scoped_lock<std::mutex> lock(rhs._mutex); + + _name = std::move(rhs._name); + _graph = std::move(rhs._graph); + _topologies = std::move(rhs._topologies); + _satellite = rhs._satellite; + + rhs._satellite.reset(); +} + +// Move assignment +inline Taskflow& Taskflow::operator = (Taskflow&& rhs) { + if(this != &rhs) { + std::scoped_lock<std::mutex, std::mutex> lock(_mutex, rhs._mutex); + _name = std::move(rhs._name); + _graph = std::move(rhs._graph); + _topologies = std::move(rhs._topologies); + _satellite = rhs._satellite; + rhs._satellite.reset(); + } + return *this; +} + +// Procedure: +inline void Taskflow::clear() { + _graph._clear(); +} + +// Function: num_tasks +inline size_t Taskflow::num_tasks() const { + return _graph.size(); +} + +// Function: empty +inline bool Taskflow::empty() const { + return _graph.empty(); +} + +// Function: name +inline void Taskflow::name(const std::string &name) { + _name = name; +} + +// Function: name +inline const std::string& Taskflow::name() const { + return _name; +} + +// Function: graph +inline Graph& Taskflow::graph() { + return _graph; +} + +// Function: for_each_task +template <typename V> +void Taskflow::for_each_task(V&& visitor) const { + for(size_t i=0; i<_graph._nodes.size(); ++i) { + visitor(Task(_graph._nodes[i])); + } +} + +// Procedure: remove_dependency +inline void Taskflow::remove_dependency(Task from, Task to) { + from._node->_successors.erase(std::remove_if( + from._node->_successors.begin(), from._node->_successors.end(), [&](Node* i){ + return i == to._node; + } + ), from._node->_successors.end()); + + to._node->_dependents.erase(std::remove_if( + to._node->_dependents.begin(), to._node->_dependents.end(), [&](Node* i){ + return i == from._node; + } + ), to._node->_dependents.end()); +} + +// Procedure: dump +inline std::string Taskflow::dump() const { + std::ostringstream oss; + dump(oss); + return oss.str(); +} + +// Function: dump +inline void Taskflow::dump(std::ostream& os) const { + os << "digraph Taskflow {\n"; + _dump(os, &_graph); + os << "}\n"; +} + +// Procedure: _dump +inline void Taskflow::_dump(std::ostream& os, const Graph* top) const { + + Dumper dumper; + + dumper.id = 0; + dumper.stack.push({nullptr, top}); + dumper.visited[top] = dumper.id++; + + while(!dumper.stack.empty()) { + + auto [p, f] = dumper.stack.top(); + dumper.stack.pop(); + + os << "subgraph cluster_p" << f << " {\nlabel=\""; + + // n-level module + if(p) { + os << 'm' << dumper.visited[f]; + } + // top-level taskflow graph + else { + os << "Taskflow: "; + if(_name.empty()) os << 'p' << this; + else os << _name; + } + + os << "\";\n"; + + _dump(os, f, dumper); + os << "}\n"; + } +} + +// Procedure: _dump +inline void Taskflow::_dump( + std::ostream& os, const Node* node, Dumper& dumper +) const { + + os << 'p' << node << "[label=\""; + if(node->_name.empty()) os << 'p' << node; + 
else os << node->_name; + os << "\" "; + + // shape for node + switch(node->_handle.index()) { + + case Node::CONDITION: + case Node::MULTI_CONDITION: + os << "shape=diamond color=black fillcolor=aquamarine style=filled"; + break; + + default: + break; + } + + os << "];\n"; + + for(size_t s=0; s<node->_successors.size(); ++s) { + if(node->_is_conditioner()) { + // case edge is dashed + os << 'p' << node << " -> p" << node->_successors[s] + << " [style=dashed label=\"" << s << "\"];\n"; + } else { + os << 'p' << node << " -> p" << node->_successors[s] << ";\n"; + } + } + + // subflow join node + if(node->_parent && node->_parent->_handle.index() == Node::DYNAMIC && + node->_successors.size() == 0 + ) { + os << 'p' << node << " -> p" << node->_parent << ";\n"; + } + + // node info + switch(node->_handle.index()) { + + case Node::DYNAMIC: { + auto& sbg = std::get_if<Node::Dynamic>(&node->_handle)->subgraph; + if(!sbg.empty()) { + os << "subgraph cluster_p" << node << " {\nlabel=\"Subflow: "; + if(node->_name.empty()) os << 'p' << node; + else os << node->_name; + + os << "\";\n" << "color=blue\n"; + _dump(os, &sbg, dumper); + os << "}\n"; + } + } + break; + + default: + break; + } +} + +// Procedure: _dump +inline void Taskflow::_dump( + std::ostream& os, const Graph* graph, Dumper& dumper +) const { + + for(const auto& n : graph->_nodes) { + + // regular task + if(n->_handle.index() != Node::MODULE) { + _dump(os, n, dumper); + } + // module task + else { + //auto module = &(std::get_if<Node::Module>(&n->_handle)->module); + auto module = &(std::get_if<Node::Module>(&n->_handle)->graph); + + os << 'p' << n << "[shape=box3d, color=blue, label=\""; + if(n->_name.empty()) os << 'p' << n; + else os << n->_name; + + if(dumper.visited.find(module) == dumper.visited.end()) { + dumper.visited[module] = dumper.id++; + dumper.stack.push({n, module}); + } + + os << " [m" << dumper.visited[module] << "]\"];\n"; + + for(const auto s : n->_successors) { + os << 'p' << n << "->" << 'p' << s << ";\n"; + } + } + } +} + +// ---------------------------------------------------------------------------- +// class definition: Future +// ---------------------------------------------------------------------------- + +/** +@class Future + +@brief class to access the result of an execution + +tf::Future is a derived class from std::future that will eventually hold the +execution result of a submitted taskflow (tf::Executor::run) +In addition to the base methods inherited from std::future, +you can call tf::Future::cancel to cancel the execution of the running taskflow +associated with this future object. +The following example cancels a submission of a taskflow that contains +1000 tasks each running one second. 
+ +@code{.cpp} +tf::Executor executor; +tf::Taskflow taskflow; + +for(int i=0; i<1000; i++) { + taskflow.emplace([](){ + std::this_thread::sleep_for(std::chrono::seconds(1)); + }); +} + +// submit the taskflow +tf::Future fu = executor.run(taskflow); + +// request to cancel the submitted execution above +fu.cancel(); + +// wait until the cancellation finishes +fu.get(); +@endcode +*/ +template <typename T> +class Future : public std::future<T> { + + friend class Executor; + friend class Subflow; + friend class Runtime; + + public: + + /** + @brief default constructor + */ + Future() = default; + + /** + @brief disabled copy constructor + */ + Future(const Future&) = delete; + + /** + @brief default move constructor + */ + Future(Future&&) = default; + + /** + @brief disabled copy assignment + */ + Future& operator = (const Future&) = delete; + + /** + @brief default move assignment + */ + Future& operator = (Future&&) = default; + + /** + @brief cancels the execution of the running taskflow associated with + this future object + + @return @c true if the execution can be cancelled or + @c false if the execution has already completed + + When you request a cancellation, the executor will stop scheduling + any tasks onwards. Tasks that are already running will continue to finish + (non-preemptive). + You can call tf::Future::wait to wait for the cancellation to complete. + */ + bool cancel(); + + private: + + std::weak_ptr<Topology> _topology; + + Future(std::future<T>&&, std::weak_ptr<Topology> = std::weak_ptr<Topology>()); +}; + +template <typename T> +Future<T>::Future(std::future<T>&& f, std::weak_ptr<Topology> p) : + std::future<T> {std::move(f)}, + _topology {std::move(p)} { +} + +// Function: cancel +template <typename T> +bool Future<T>::cancel() { + if(auto ptr = _topology.lock(); ptr) { + ptr->_state.fetch_or(Topology::CANCELLED, std::memory_order_relaxed); + return true; + } + return false; +} + + +} // end of namespace tf. --------------------------------------------------- diff --git a/myxpcs/include/taskflow_/core/topology.hpp b/myxpcs/include/taskflow_/core/topology.hpp new file mode 100644 index 0000000..068499d --- /dev/null +++ b/myxpcs/include/taskflow_/core/topology.hpp @@ -0,0 +1,62 @@ +#pragma once + +namespace tf { + +// ---------------------------------------------------------------------------- + +class TopologyBase { + +}; + +// class: Topology +class Topology { + + friend class Executor; + friend class Runtime; + friend class Node; + + template <typename T> + friend class Future; + + constexpr static int CLEAN = 0; + constexpr static int CANCELLED = 1; + constexpr static int EXCEPTION = 2; + + public: + + template <typename P, typename C> + Topology(Taskflow&, P&&, C&&); + + private: + + Taskflow& _taskflow; + + std::promise<void> _promise; + + SmallVector<Node*> _sources; + + std::function<bool()> _pred; + std::function<void()> _call; + + std::atomic<size_t> _join_counter {0}; + std::atomic<int> _state {CLEAN}; + + std::exception_ptr _exception {nullptr}; + + void _carry_out_promise(); +}; + +// Constructor +template <typename P, typename C> +Topology::Topology(Taskflow& tf, P&& p, C&& c): + _taskflow(tf), + _pred {std::forward<P>(p)}, + _call {std::forward<C>(c)} { +} + +// Procedure +inline void Topology::_carry_out_promise() { + _exception ? _promise.set_exception(_exception) : _promise.set_value(); +} + +} // end of namespace tf. 
---------------------------------------------------- diff --git a/myxpcs/include/taskflow_/core/tsq.hpp b/myxpcs/include/taskflow_/core/tsq.hpp new file mode 100644 index 0000000..e4ea76c --- /dev/null +++ b/myxpcs/include/taskflow_/core/tsq.hpp @@ -0,0 +1,441 @@ +#pragma once + +#include "../utility/macros.hpp" +#include "../utility/traits.hpp" + +/** +@file tsq.hpp +@brief task queue include file +*/ + +namespace tf { + + +// ---------------------------------------------------------------------------- +// Task Types +// ---------------------------------------------------------------------------- + +/** +@enum TaskPriority + +@brief enumeration of all task priority values + +A priority is an enumerated value of type @c unsigned. +Currently, %Taskflow defines three priority levels, +@c HIGH, @c NORMAL, and @c LOW, starting from 0, 1, to 2. +That is, the lower the value, the higher the priority. + +*/ +enum class TaskPriority : unsigned { + /** @brief value of the highest priority (i.e., 0) */ + HIGH = 0, + /** @brief value of the normal priority (i.e., 1) */ + NORMAL = 1, + /** @brief value of the lowest priority (i.e., 2) */ + LOW = 2, + /** @brief conventional value for iterating priority values */ + MAX = 3 +}; + + + +// ---------------------------------------------------------------------------- +// Task Queue +// ---------------------------------------------------------------------------- + + +/** +@class: TaskQueue + +@tparam T data type (must be a pointer type) +@tparam TF_MAX_PRIORITY maximum level of the priority + +@brief class to create a lock-free unbounded single-producer multiple-consumer queue + +This class implements the work-stealing queue described in the paper, +<a href="https://www.di.ens.fr/~zappa/readings/ppopp13.pdf">Correct and Efficient Work-Stealing for Weak Memory Models</a>, +and extends it to include priority. + +Only the queue owner can perform pop and push operations, +while others can steal data from the queue simultaneously. +Priority starts from zero (highest priority) to the template value +`TF_MAX_PRIORITY-1` (lowest priority). +All operations are associated with priority values to indicate +the corresponding queues to which an operation is applied. + +The default template value, `TF_MAX_PRIORITY`, is `TaskPriority::MAX` +which applies only three priority levels to the task queue. + +@code{.cpp} +auto [A, B, C, D, E] = taskflow.emplace( + [] () { }, + [&] () { + std::cout << "Task B: " << counter++ << '\n'; // 0 + }, + [&] () { + std::cout << "Task C: " << counter++ << '\n'; // 2 + }, + [&] () { + std::cout << "Task D: " << counter++ << '\n'; // 1 + }, + [] () { } +); + +A.precede(B, C, D); +E.succeed(B, C, D); + +B.priority(tf::TaskPriority::HIGH); +C.priority(tf::TaskPriority::LOW); +D.priority(tf::TaskPriority::NORMAL); + +executor.run(taskflow).wait(); +@endcode + +In the above example, we have a task graph of five tasks, +@c A, @c B, @c C, @c D, and @c E, in which @c B, @c C, and @c D +can run in simultaneously when @c A finishes. +Since we only uses one worker thread in the executor, +we can deterministically run @c B first, then @c D, and @c C +in order of their priority values. 
+The output is as follows: + +@code{.shell-session} +Task B: 0 +Task D: 1 +Task C: 2 +@endcode + +*/ +template <typename T, unsigned TF_MAX_PRIORITY = static_cast<unsigned>(TaskPriority::MAX)> +class TaskQueue { + + static_assert(TF_MAX_PRIORITY > 0, "TF_MAX_PRIORITY must be at least one"); + static_assert(std::is_pointer_v<T>, "T must be a pointer type"); + + struct Array { + + int64_t C; + int64_t M; + std::atomic<T>* S; + + explicit Array(int64_t c) : + C {c}, + M {c-1}, + S {new std::atomic<T>[static_cast<size_t>(C)]} { + } + + ~Array() { + delete [] S; + } + + int64_t capacity() const noexcept { + return C; + } + + void push(int64_t i, T o) noexcept { + S[i & M].store(o, std::memory_order_relaxed); + } + + T pop(int64_t i) noexcept { + return S[i & M].load(std::memory_order_relaxed); + } + + Array* resize(int64_t b, int64_t t) { + Array* ptr = new Array {2*C}; + for(int64_t i=t; i!=b; ++i) { + ptr->push(i, pop(i)); + } + return ptr; + } + + }; + + // Doubling the alignment by 2 seems to generate the most + // decent performance. + CachelineAligned<std::atomic<int64_t>> _top[TF_MAX_PRIORITY]; + CachelineAligned<std::atomic<int64_t>> _bottom[TF_MAX_PRIORITY]; + std::atomic<Array*> _array[TF_MAX_PRIORITY]; + std::vector<Array*> _garbage[TF_MAX_PRIORITY]; + + //std::atomic<T> _cache {nullptr}; + + public: + + /** + @brief constructs the queue with a given capacity + + @param capacity the capacity of the queue (must be power of 2) + */ + explicit TaskQueue(int64_t capacity = 512); + + /** + @brief destructs the queue + */ + ~TaskQueue(); + + /** + @brief queries if the queue is empty at the time of this call + */ + bool empty() const noexcept; + + /** + @brief queries if the queue is empty at a specific priority value + */ + bool empty(unsigned priority) const noexcept; + + /** + @brief queries the number of items at the time of this call + */ + size_t size() const noexcept; + + /** + @brief queries the number of items with the given priority + at the time of this call + */ + size_t size(unsigned priority) const noexcept; + + /** + @brief queries the capacity of the queue + */ + int64_t capacity() const noexcept; + + /** + @brief queries the capacity of the queue at a specific priority value + */ + int64_t capacity(unsigned priority) const noexcept; + + /** + @brief inserts an item to the queue + + @param item the item to push to the queue + @param priority priority value of the item to push (default = 0) + + Only the owner thread can insert an item to the queue. + The operation can trigger the queue to resize its capacity + if more space is required. + */ + TF_FORCE_INLINE void push(T item, unsigned priority); + + /** + @brief pops out an item from the queue + + Only the owner thread can pop out an item from the queue. + The return can be a @c nullptr if this operation failed (empty queue). + */ + T pop(); + + /** + @brief pops out an item with a specific priority value from the queue + + @param priority priority of the item to pop + + Only the owner thread can pop out an item from the queue. + The return can be a @c nullptr if this operation failed (empty queue). + */ + TF_FORCE_INLINE T pop(unsigned priority); + + /** + @brief steals an item from the queue + + Any threads can try to steal an item from the queue. + The return can be a @c nullptr if this operation failed (not necessary empty). 
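+
+A minimal sketch of the owner/thief protocol (illustrative only; the queue of
+tf::Node pointers and the @c node variable are assumptions made for this
+example, and the owner and thief calls are normally issued from different threads):
+
+@code{.cpp}
+tf::TaskQueue<tf::Node*> queue;
+
+// owner thread: push and pop at one end of the queue
+queue.push(node, 0);              // priority 0 == tf::TaskPriority::HIGH
+tf::Node* mine = queue.pop();     // nullptr if the queue is empty
+
+// any other thread: steal from the opposite end
+tf::Node* stolen = queue.steal(); // nullptr if the steal fails
+@endcode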
+  */
+  T steal();
+
+  /**
+  @brief steals an item with a specific priority value from the queue
+
+  @param priority priority of the item to steal
+
+  Any threads can try to steal an item from the queue.
+  The return can be a @c nullptr if this operation failed (not necessarily empty).
+  */
+  T steal(unsigned priority);
+
+  private:
+  TF_NO_INLINE Array* resize_array(Array* a, unsigned p, std::int64_t b, std::int64_t t);
+};
+
+// Constructor
+template <typename T, unsigned TF_MAX_PRIORITY>
+TaskQueue<T, TF_MAX_PRIORITY>::TaskQueue(int64_t c) {
+  assert(c && (!(c & (c-1))));
+  unroll<0, TF_MAX_PRIORITY, 1>([&](auto p){
+    _top[p].data.store(0, std::memory_order_relaxed);
+    _bottom[p].data.store(0, std::memory_order_relaxed);
+    _array[p].store(new Array{c}, std::memory_order_relaxed);
+    _garbage[p].reserve(32);
+  });
+}
+
+// Destructor
+template <typename T, unsigned TF_MAX_PRIORITY>
+TaskQueue<T, TF_MAX_PRIORITY>::~TaskQueue() {
+  unroll<0, TF_MAX_PRIORITY, 1>([&](auto p){
+    for(auto a : _garbage[p]) {
+      delete a;
+    }
+    delete _array[p].load();
+  });
+}
+
+// Function: empty
+template <typename T, unsigned TF_MAX_PRIORITY>
+bool TaskQueue<T, TF_MAX_PRIORITY>::empty() const noexcept {
+  for(unsigned i=0; i<TF_MAX_PRIORITY; i++) {
+    if(!empty(i)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Function: empty
+template <typename T, unsigned TF_MAX_PRIORITY>
+bool TaskQueue<T, TF_MAX_PRIORITY>::empty(unsigned p) const noexcept {
+  int64_t b = _bottom[p].data.load(std::memory_order_relaxed);
+  int64_t t = _top[p].data.load(std::memory_order_relaxed);
+  return (b <= t);
+}
+
+// Function: size
+template <typename T, unsigned TF_MAX_PRIORITY>
+size_t TaskQueue<T, TF_MAX_PRIORITY>::size() const noexcept {
+  size_t s;
+  unroll<0, TF_MAX_PRIORITY, 1>([&](auto i) { s = i ? size(i) + s : size(i); });
+  return s;
+}
+
+// Function: size
+template <typename T, unsigned TF_MAX_PRIORITY>
+size_t TaskQueue<T, TF_MAX_PRIORITY>::size(unsigned p) const noexcept {
+  int64_t b = _bottom[p].data.load(std::memory_order_relaxed);
+  int64_t t = _top[p].data.load(std::memory_order_relaxed);
+  return static_cast<size_t>(b >= t ?
b - t : 0); +} + +// Function: push +template <typename T, unsigned TF_MAX_PRIORITY> +TF_FORCE_INLINE void TaskQueue<T, TF_MAX_PRIORITY>::push(T o, unsigned p) { + + int64_t b = _bottom[p].data.load(std::memory_order_relaxed); + int64_t t = _top[p].data.load(std::memory_order_acquire); + Array* a = _array[p].load(std::memory_order_relaxed); + + // queue is full + if(a->capacity() - 1 < (b - t)) { + a = resize_array(a, p, b, t); + } + + a->push(b, o); + std::atomic_thread_fence(std::memory_order_release); + _bottom[p].data.store(b + 1, std::memory_order_relaxed); +} + +// Function: pop +template <typename T, unsigned TF_MAX_PRIORITY> +T TaskQueue<T, TF_MAX_PRIORITY>::pop() { + for(unsigned i=0; i<TF_MAX_PRIORITY; i++) { + if(auto t = pop(i); t) { + return t; + } + } + return nullptr; +} + +// Function: pop +template <typename T, unsigned TF_MAX_PRIORITY> +TF_FORCE_INLINE T TaskQueue<T, TF_MAX_PRIORITY>::pop(unsigned p) { + + int64_t b = _bottom[p].data.load(std::memory_order_relaxed) - 1; + Array* a = _array[p].load(std::memory_order_relaxed); + _bottom[p].data.store(b, std::memory_order_relaxed); + std::atomic_thread_fence(std::memory_order_seq_cst); + int64_t t = _top[p].data.load(std::memory_order_relaxed); + + T item {nullptr}; + + if(t <= b) { + item = a->pop(b); + if(t == b) { + // the last item just got stolen + if(!_top[p].data.compare_exchange_strong(t, t+1, + std::memory_order_seq_cst, + std::memory_order_relaxed)) { + item = nullptr; + } + _bottom[p].data.store(b + 1, std::memory_order_relaxed); + } + } + else { + _bottom[p].data.store(b + 1, std::memory_order_relaxed); + } + + return item; +} + +// Function: steal +template <typename T, unsigned TF_MAX_PRIORITY> +T TaskQueue<T, TF_MAX_PRIORITY>::steal() { + for(unsigned i=0; i<TF_MAX_PRIORITY; i++) { + if(auto t = steal(i); t) { + return t; + } + } + return nullptr; +} + +// Function: steal +template <typename T, unsigned TF_MAX_PRIORITY> +T TaskQueue<T, TF_MAX_PRIORITY>::steal(unsigned p) { + + int64_t t = _top[p].data.load(std::memory_order_acquire); + std::atomic_thread_fence(std::memory_order_seq_cst); + int64_t b = _bottom[p].data.load(std::memory_order_acquire); + + T item {nullptr}; + + if(t < b) { + Array* a = _array[p].load(std::memory_order_consume); + item = a->pop(t); + if(!_top[p].data.compare_exchange_strong(t, t+1, + std::memory_order_seq_cst, + std::memory_order_relaxed)) { + return nullptr; + } + } + + return item; +} + +// Function: capacity +template <typename T, unsigned TF_MAX_PRIORITY> +int64_t TaskQueue<T, TF_MAX_PRIORITY>::capacity() const noexcept { + size_t s; + unroll<0, TF_MAX_PRIORITY, 1>([&](auto i) { + s = i ? 
capacity(i) + s : capacity(i);
+  });
+  return s;
+}
+
+// Function: capacity
+template <typename T, unsigned TF_MAX_PRIORITY>
+int64_t TaskQueue<T, TF_MAX_PRIORITY>::capacity(unsigned p) const noexcept {
+  return _array[p].load(std::memory_order_relaxed)->capacity();
+}
+
+template <typename T, unsigned TF_MAX_PRIORITY>
+TF_NO_INLINE typename TaskQueue<T, TF_MAX_PRIORITY>::Array*
+  TaskQueue<T, TF_MAX_PRIORITY>::resize_array(Array* a, unsigned p, std::int64_t b, std::int64_t t) {
+
+  Array* tmp = a->resize(b, t);
+  _garbage[p].push_back(a);
+  std::swap(a, tmp);
+  _array[p].store(a, std::memory_order_release);
+  // Note: using the relaxed memory order from the original paper causes thread sanitizer to complain
+  //_array.store(a, std::memory_order_relaxed);
+  return a;
+}
+
+
+} // end of namespace tf -----------------------------------------------------
diff --git a/myxpcs/include/taskflow_/core/worker.hpp b/myxpcs/include/taskflow_/core/worker.hpp
new file mode 100644
index 0000000..8f86381
--- /dev/null
+++ b/myxpcs/include/taskflow_/core/worker.hpp
@@ -0,0 +1,172 @@
+#pragma once
+
+#include "declarations.hpp"
+#include "tsq.hpp"
+#include "notifier.hpp"
+
+/**
+@file worker.hpp
+@brief worker include file
+*/
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// Class Definition: Worker
+// ----------------------------------------------------------------------------
+
+/**
+@class Worker
+
+@brief class to create a worker in an executor
+
+The class is primarily used by the executor to perform the work-stealing algorithm.
+Users can access a worker object and alter its properties
+(e.g., changing the thread affinity in a POSIX-like system)
+using tf::WorkerInterface.
+*/
+class Worker {
+
+  friend class Executor;
+  friend class WorkerView;
+
+  public:
+
+    /**
+    @brief queries the worker id associated with its parent executor
+
+    A worker id is an unsigned integer in the range <tt>[0, N)</tt>,
+    where @c N is the number of workers spawned at the construction
+    time of the executor.
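+
+    For instance, a task can query which worker of its parent executor is
+    running it (a sketch; the executor size, the lambda, and the printed
+    string are assumptions made for this example):
+
+    @code{.cpp}
+    tf::Executor executor(4);
+    executor.async([&](){
+      // prints a worker id in [0, 4) when invoked from a worker thread
+      std::cout << "running on worker " << executor.this_worker_id() << '\n';
+    }).wait();
+    @endcode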
+ */ + inline size_t id() const { return _id; } + + /** + @brief acquires a pointer access to the underlying thread + */ + inline std::thread* thread() const { return _thread; } + + /** + @brief queries the size of the queue (i.e., number of enqueued tasks to + run) associated with the worker + */ + inline size_t queue_size() const { return _wsq.size(); } + + /** + @brief queries the current capacity of the queue + */ + inline size_t queue_capacity() const { return static_cast<size_t>(_wsq.capacity()); } + + private: + + size_t _id; + size_t _vtm; + Executor* _executor; + std::thread* _thread; + Notifier::Waiter* _waiter; + std::default_random_engine _rdgen { std::random_device{}() }; + TaskQueue<Node*> _wsq; + Node* _cache; +}; + +// ---------------------------------------------------------------------------- +// Class Definition: PerThreadWorker +// ---------------------------------------------------------------------------- + +/** +@private +*/ +//struct PerThreadWorker { +// +// Worker* worker; +// +// PerThreadWorker() : worker {nullptr} {} +// +// PerThreadWorker(const PerThreadWorker&) = delete; +// PerThreadWorker(PerThreadWorker&&) = delete; +// +// PerThreadWorker& operator = (const PerThreadWorker&) = delete; +// PerThreadWorker& operator = (PerThreadWorker&&) = delete; +//}; + +/** +@private +*/ +//inline PerThreadWorker& this_worker() { +// thread_local PerThreadWorker worker; +// return worker; +//} + + +// ---------------------------------------------------------------------------- +// Class Definition: WorkerView +// ---------------------------------------------------------------------------- + +/** +@class WorkerView + +@brief class to create an immutable view of a worker in an executor + +An executor keeps a set of internal worker threads to run tasks. +A worker view provides users an immutable interface to observe +when a worker runs a task, and the view object is only accessible +from an observer derived from tf::ObserverInterface. +*/ +class WorkerView { + + friend class Executor; + + public: + + /** + @brief queries the worker id associated with its parent executor + + A worker id is a unsigned integer in the range <tt>[0, N)</tt>, + where @c N is the number of workers spawned at the construction + time of the executor. 
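+
+  For example, an observer derived from tf::ObserverInterface receives a
+  WorkerView on every task entry and exit and can record the id (a sketch;
+  the observer type, its name, and the printed strings are assumptions made
+  for this example):
+
+  @code{.cpp}
+  struct MyObserver : public tf::ObserverInterface {
+    void set_up(size_t num_workers) override {
+      std::cout << "observing " << num_workers << " workers\n";
+    }
+    void on_entry(tf::WorkerView wv, tf::TaskView tv) override {
+      std::cout << "worker " << wv.id() << " starts " << tv.name() << '\n';
+    }
+    void on_exit(tf::WorkerView wv, tf::TaskView tv) override {
+      std::cout << "worker " << wv.id() << " finishes " << tv.name() << '\n';
+    }
+  };
+
+  tf::Executor executor;
+  auto observer = executor.make_observer<MyObserver>();
+  @endcode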
+ */ + size_t id() const; + + /** + @brief queries the size of the queue (i.e., number of pending tasks to + run) associated with the worker + */ + size_t queue_size() const; + + /** + @brief queries the current capacity of the queue + */ + size_t queue_capacity() const; + + private: + + WorkerView(const Worker&); + WorkerView(const WorkerView&) = default; + + const Worker& _worker; + +}; + +// Constructor +inline WorkerView::WorkerView(const Worker& w) : _worker{w} { +} + +// function: id +inline size_t WorkerView::id() const { + return _worker._id; +} + +// Function: queue_size +inline size_t WorkerView::queue_size() const { + return _worker._wsq.size(); +} + +// Function: queue_capacity +inline size_t WorkerView::queue_capacity() const { + return static_cast<size_t>(_worker._wsq.capacity()); +} + + +} // end of namespact tf ----------------------------------------------------- + + diff --git a/myxpcs/include/taskflow_/cuda/algorithm/find.hpp b/myxpcs/include/taskflow_/cuda/algorithm/find.hpp new file mode 100644 index 0000000..f344666 --- /dev/null +++ b/myxpcs/include/taskflow_/cuda/algorithm/find.hpp @@ -0,0 +1,294 @@ +#pragma once + +#include "for_each.hpp" +#include "reduce.hpp" + +/** +@file taskflow/cuda/algorithm/find.hpp +@brief cuda find algorithms include file +*/ + +namespace tf::detail { + +/** @private */ +template <typename T> +struct cudaFindPair { + + T key; + unsigned index; + + __device__ operator unsigned () const { return index; } +}; + +/** @private */ +template <typename P, typename I, typename U> +void cuda_find_if_loop(P&& p, I input, unsigned count, unsigned* idx, U pred) { + + if(count == 0) { + cuda_single_task(p, [=] __device__ () { *idx = 0; }); + return; + } + + using E = std::decay_t<P>; + + auto B = (count + E::nv - 1) / E::nv; + + // set the index to the maximum + cuda_single_task(p, [=] __device__ () { *idx = count; }); + + // launch the kernel to atomic-find the minimum + cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) { + + __shared__ unsigned shm_id; + + if(!tid) { + shm_id = count; + } + + __syncthreads(); + + auto tile = cuda_get_tile(bid, E::nv, count); + + auto x = cuda_mem_to_reg_strided<E::nt, E::vt>( + input + tile.begin, tid, tile.count() + ); + + auto id = count; + + for(unsigned i=0; i<E::vt; i++) { + auto j = E::nt*i + tid; + if(j < tile.count() && pred(x[i])) { + id = j + tile.begin; + break; + } + } + + // Note: the reduce version is not faster though + // reduce to a scalar per block. + //__shared__ typename cudaBlockReduce<E::nt, unsigned>::Storage shm; + + //id = cudaBlockReduce<E::nt, unsigned>()( + // tid, + // id, + // shm, + // (tile.count() < E::nt ? tile.count() : E::nt), + // cuda_minimum<unsigned>{}, + // false + //); + + // only need the minimum id + atomicMin(&shm_id, id); + __syncthreads(); + + // reduce all to the global memory + if(!tid) { + atomicMin(idx, shm_id); + //atomicMin(idx, id); + } + }); +} + +/** @private */ +template <typename P, typename I, typename O> +void cuda_min_element_loop( + P&& p, I input, unsigned count, unsigned* idx, O op, void* ptr +) { + + if(count == 0) { + cuda_single_task(p, [=] __device__ () { *idx = 0; }); + return; + } + + using T = cudaFindPair<typename std::iterator_traits<I>::value_type>; + + cuda_uninitialized_reduce_loop(p, + cuda_make_load_iterator<T>([=]__device__(auto i){ + return T{*(input+i), i}; + }), + count, + idx, + [=] __device__ (const auto& a, const auto& b) { + return op(a.key, b.key) ? 
a : b; + }, + ptr + ); +} + +/** @private */ +template <typename P, typename I, typename O> +void cuda_max_element_loop( + P&& p, I input, unsigned count, unsigned* idx, O op, void* ptr +) { + + if(count == 0) { + cuda_single_task(p, [=] __device__ () { *idx = 0; }); + return; + } + + using T = cudaFindPair<typename std::iterator_traits<I>::value_type>; + + cuda_uninitialized_reduce_loop(p, + cuda_make_load_iterator<T>([=]__device__(auto i){ + return T{*(input+i), i}; + }), + count, + idx, + [=] __device__ (const auto& a, const auto& b) { + return op(a.key, b.key) ? b : a; + }, + ptr + ); +} + +} // end of namespace tf::detail --------------------------------------------- + +namespace tf { + + +// ---------------------------------------------------------------------------- +// cuda_find_if +// ---------------------------------------------------------------------------- + +/** +@brief finds the index of the first element that satisfies the given criteria + +@tparam P execution policy type +@tparam I input iterator type +@tparam U unary operator type + +@param p execution policy +@param first iterator to the beginning of the range +@param last iterator to the end of the range +@param idx pointer to the index of the found element +@param op unary operator which returns @c true for the required element + +The function launches kernels asynchronously to find the index @c idx of the +first element in the range <tt>[first, last)</tt> +such that <tt>op(*(first+idx))</tt> is true. +This is equivalent to the parallel execution of the following loop: + +@code{.cpp} +unsigned idx = 0; +for(; first != last; ++first, ++idx) { + if (p(*first)) { + return idx; + } +} +return idx; +@endcode +*/ +template <typename P, typename I, typename U> +void cuda_find_if( + P&& p, I first, I last, unsigned* idx, U op +) { + detail::cuda_find_if_loop(p, first, std::distance(first, last), idx, op); +} + +// ---------------------------------------------------------------------------- +// cuda_min_element +// ---------------------------------------------------------------------------- + +// Function: min-element_bufsz +template <unsigned NT, unsigned VT> +template <typename T> +unsigned cudaExecutionPolicy<NT, VT>::min_element_bufsz(unsigned count) { + return reduce_bufsz<detail::cudaFindPair<T>>(count); +} + +/** +@brief finds the index of the minimum element in a range + +@tparam P execution policy type +@tparam I input iterator type +@tparam O comparator type + +@param p execution policy object +@param first iterator to the beginning of the range +@param last iterator to the end of the range +@param idx solution index of the minimum element +@param op comparison function object +@param buf pointer to the buffer + +The function launches kernels asynchronously to find +the smallest element in the range <tt>[first, last)</tt> +using the given comparator @c op. +You need to provide a buffer that holds at least +tf::cuda_min_element_bufsz bytes for internal use. 
+The function is equivalent to a parallel execution of the following loop: + +@code{.cpp} +if(first == last) { + return 0; +} +auto smallest = first; +for (++first; first != last; ++first) { + if (op(*first, *smallest)) { + smallest = first; + } +} +return std::distance(first, smallest); +@endcode +*/ +template <typename P, typename I, typename O> +void cuda_min_element(P&& p, I first, I last, unsigned* idx, O op, void* buf) { + detail::cuda_min_element_loop( + p, first, std::distance(first, last), idx, op, buf + ); +} + +// ---------------------------------------------------------------------------- +// cuda_max_element +// ---------------------------------------------------------------------------- + +// Function: max_element_bufsz +template <unsigned NT, unsigned VT> +template <typename T> +unsigned cudaExecutionPolicy<NT, VT>::max_element_bufsz(unsigned count) { + return reduce_bufsz<detail::cudaFindPair<T>>(count); +} + +/** +@brief finds the index of the maximum element in a range + +@tparam P execution policy type +@tparam I input iterator type +@tparam O comparator type + +@param p execution policy object +@param first iterator to the beginning of the range +@param last iterator to the end of the range +@param idx solution index of the maximum element +@param op comparison function object +@param buf pointer to the buffer + +The function launches kernels asynchronously to find +the largest element in the range <tt>[first, last)</tt> +using the given comparator @c op. +You need to provide a buffer that holds at least +tf::cuda_max_element_bufsz bytes for internal use. +The function is equivalent to a parallel execution of the following loop: + +@code{.cpp} +if(first == last) { + return 0; +} +auto largest = first; +for (++first; first != last; ++first) { + if (op(*largest, *first)) { + largest = first; + } +} +return std::distance(first, largest); +@endcode +*/ +template <typename P, typename I, typename O> +void cuda_max_element(P&& p, I first, I last, unsigned* idx, O op, void* buf) { + detail::cuda_max_element_loop( + p, first, std::distance(first, last), idx, op, buf + ); +} + + +} // end of namespace tf ----------------------------------------------------- + + diff --git a/myxpcs/include/taskflow_/cuda/algorithm/for_each.hpp b/myxpcs/include/taskflow_/cuda/algorithm/for_each.hpp new file mode 100644 index 0000000..38a6f85 --- /dev/null +++ b/myxpcs/include/taskflow_/cuda/algorithm/for_each.hpp @@ -0,0 +1,315 @@ +#pragma once + +#include "../cudaflow.hpp" + +/** +@file taskflow/cuda/algorithm/for_each.hpp +@brief cuda parallel-iteration algorithms include file +*/ + +namespace tf { + +namespace detail { + +/** +@private +*/ +template <size_t nt, size_t vt, typename I, typename C> +__global__ void cuda_for_each_kernel(I first, unsigned count, C c) { + auto tid = threadIdx.x; + auto bid = blockIdx.x; + auto tile = cuda_get_tile(bid, nt*vt, count); + cuda_strided_iterate<nt, vt>( + [=](auto, auto j) { + c(*(first + tile.begin + j)); + }, + tid, tile.count() + ); +} + +/** @private */ +template <size_t nt, size_t vt, typename I, typename C> +__global__ void cuda_for_each_index_kernel(I first, I inc, unsigned count, C c) { + auto tid = threadIdx.x; + auto bid = blockIdx.x; + auto tile = cuda_get_tile(bid, nt*vt, count); + cuda_strided_iterate<nt, vt>( + [=]__device__(auto, auto j) { + c(first + inc*(tile.begin+j)); + }, + tid, tile.count() + ); +} + +} // end of namespace detail ------------------------------------------------- + +// 
---------------------------------------------------------------------------- +// cuda standard algorithms: single_task/for_each/for_each_index +// ---------------------------------------------------------------------------- + +/** +@brief runs a callable asynchronously using one kernel thread + +@tparam P execution policy type +@tparam C closure type + +@param p execution policy +@param c closure to run by one kernel thread + +The function launches a single kernel thread to run the given callable +through the stream in the execution policy object. +*/ +template <typename P, typename C> +void cuda_single_task(P&& p, C c) { + cuda_kernel<<<1, 1, 0, p.stream()>>>( + [=]__device__(auto, auto) mutable { c(); } + ); +} + +/** +@brief performs asynchronous parallel iterations over a range of items + +@tparam P execution policy type +@tparam I input iterator type +@tparam C unary operator type + +@param p execution policy object +@param first iterator to the beginning of the range +@param last iterator to the end of the range +@param c unary operator to apply to each dereferenced iterator + +This function is equivalent to a parallel execution of the following loop +on a GPU: + +@code{.cpp} +for(auto itr = first; itr != last; itr++) { + c(*itr); +} +@endcode +*/ +template <typename P, typename I, typename C> +void cuda_for_each(P&& p, I first, I last, C c) { + + using E = std::decay_t<P>; + + unsigned count = std::distance(first, last); + + if(count == 0) { + return; + } + + detail::cuda_for_each_kernel<E::nt, E::vt, I, C><<<E::num_blocks(count), E::nt, 0, p.stream()>>>( + first, count, c + ); +} + +/** +@brief performs asynchronous parallel iterations over + an index-based range of items + +@tparam P execution policy type +@tparam I input index type +@tparam C unary operator type + +@param p execution policy object +@param first index to the beginning of the range +@param last index to the end of the range +@param inc step size between successive iterations +@param c unary operator to apply to each index + +This function is equivalent to a parallel execution of +the following loop on a GPU: + +@code{.cpp} +// step is positive [first, last) +for(auto i=first; i<last; i+=step) { + c(i); +} + +// step is negative [first, last) +for(auto i=first; i>last; i+=step) { + c(i); +} +@endcode +*/ +template <typename P, typename I, typename C> +void cuda_for_each_index(P&& p, I first, I last, I inc, C c) { + + using E = std::decay_t<P>; + + unsigned count = distance(first, last, inc); + + if(count == 0) { + return; + } + + detail::cuda_for_each_index_kernel<E::nt, E::vt, I, C><<<E::num_blocks(count), E::nt, 0, p.stream()>>>( + first, inc, count, c + ); +} + +// ---------------------------------------------------------------------------- +// single_task +// ---------------------------------------------------------------------------- + +/** @private */ +template <typename C> +__global__ void cuda_single_task(C callable) { + callable(); +} + +// Function: single_task +template <typename C> +cudaTask cudaFlow::single_task(C c) { + return kernel(1, 1, 0, cuda_single_task<C>, c); +} + +// Function: single_task +template <typename C> +void cudaFlow::single_task(cudaTask task, C c) { + return kernel(task, 1, 1, 0, cuda_single_task<C>, c); +} + +// Function: single_task +template <typename C> +cudaTask cudaFlowCapturer::single_task(C callable) { + return on([=] (cudaStream_t stream) mutable { + cuda_single_task(cudaDefaultExecutionPolicy(stream), callable); + }); +} + +// Function: single_task +template <typename C> 
+void cudaFlowCapturer::single_task(cudaTask task, C callable) { + on(task, [=] (cudaStream_t stream) mutable { + cuda_single_task(cudaDefaultExecutionPolicy(stream), callable); + }); +} + +// ---------------------------------------------------------------------------- +// cudaFlow: for_each, for_each_index +// ---------------------------------------------------------------------------- + +// Function: for_each +template <typename I, typename C> +cudaTask cudaFlow::for_each(I first, I last, C c) { + + using E = cudaDefaultExecutionPolicy; + + unsigned count = std::distance(first, last); + + // TODO: + //if(count == 0) { + // return; + //} + + return kernel( + E::num_blocks(count), E::nt, 0, + detail::cuda_for_each_kernel<E::nt, E::vt, I, C>, first, count, c + ); +} + +// Function: for_each +template <typename I, typename C> +void cudaFlow::for_each(cudaTask task, I first, I last, C c) { + + using E = cudaDefaultExecutionPolicy; + + unsigned count = std::distance(first, last); + + // TODO: + //if(count == 0) { + // return; + //} + + kernel(task, + E::num_blocks(count), E::nt, 0, + detail::cuda_for_each_kernel<E::nt, E::vt, I, C>, first, count, c + ); +} + +// Function: for_each_index +template <typename I, typename C> +cudaTask cudaFlow::for_each_index(I first, I last, I inc, C c) { + + using E = cudaDefaultExecutionPolicy; + + unsigned count = distance(first, last, inc); + + // TODO: + //if(count == 0) { + // return; + //} + + return kernel( + E::num_blocks(count), E::nt, 0, + detail::cuda_for_each_index_kernel<E::nt, E::vt, I, C>, first, inc, count, c + ); +} + +// Function: for_each_index +template <typename I, typename C> +void cudaFlow::for_each_index(cudaTask task, I first, I last, I inc, C c) { + + using E = cudaDefaultExecutionPolicy; + + unsigned count = distance(first, last, inc); + + // TODO: + //if(count == 0) { + // return; + //} + + return kernel(task, + E::num_blocks(count), E::nt, 0, + detail::cuda_for_each_index_kernel<E::nt, E::vt, I, C>, first, inc, count, c + ); +} + +// ---------------------------------------------------------------------------- +// cudaFlowCapturer: for_each, for_each_index +// ---------------------------------------------------------------------------- + +// Function: for_each +template <typename I, typename C> +cudaTask cudaFlowCapturer::for_each(I first, I last, C c) { + return on([=](cudaStream_t stream) mutable { + cuda_for_each(cudaDefaultExecutionPolicy(stream), first, last, c); + }); +} + +// Function: for_each_index +template <typename I, typename C> +cudaTask cudaFlowCapturer::for_each_index(I beg, I end, I inc, C c) { + return on([=] (cudaStream_t stream) mutable { + cuda_for_each_index(cudaDefaultExecutionPolicy(stream), beg, end, inc, c); + }); +} + +// Function: for_each +template <typename I, typename C> +void cudaFlowCapturer::for_each(cudaTask task, I first, I last, C c) { + on(task, [=](cudaStream_t stream) mutable { + cuda_for_each(cudaDefaultExecutionPolicy(stream), first, last, c); + }); +} + +// Function: for_each_index +template <typename I, typename C> +void cudaFlowCapturer::for_each_index( + cudaTask task, I beg, I end, I inc, C c +) { + on(task, [=] (cudaStream_t stream) mutable { + cuda_for_each_index(cudaDefaultExecutionPolicy(stream), beg, end, inc, c); + }); +} + + + +} // end of namespace tf ----------------------------------------------------- + + + + + + diff --git a/myxpcs/include/taskflow_/cuda/algorithm/matmul.hpp b/myxpcs/include/taskflow_/cuda/algorithm/matmul.hpp new file mode 100644 index 0000000..d0f6620 --- 
/dev/null +++ b/myxpcs/include/taskflow_/cuda/algorithm/matmul.hpp @@ -0,0 +1,57 @@ +#pragma once + +#include "../cudaflow.hpp" + +namespace tf { + +// ---------------------------------------------------------------------------- +// row-major matrix multiplication +// ---------------------------------------------------------------------------- + +template <typename T> +__global__ void cuda_matmul( + const T* A, + const T* B, + T* C, + size_t M, + size_t K, + size_t N +) { + __shared__ T A_tile[32][32]; + __shared__ T B_tile[32][32]; + + size_t x = blockIdx.x * blockDim.x + threadIdx.x; + size_t y = blockIdx.y * blockDim.y + threadIdx.y; + + T res = 0; + + for(size_t k = 0; k < K; k += 32) { + if((threadIdx.x + k) < K && y < M) { + A_tile[threadIdx.y][threadIdx.x] = A[y * K + threadIdx.x + k]; + } + else{ + A_tile[threadIdx.y][threadIdx.x] = 0; + } + + if((threadIdx.y + k) < K && x < N) { + B_tile[threadIdx.y][threadIdx.x] = B[(threadIdx.y + k) * N + x]; + } + else{ + B_tile[threadIdx.y][threadIdx.x] = 0; + } + + __syncthreads(); + + for(size_t i = 0; i < 32; ++i) { + res += A_tile[threadIdx.y][i] * B_tile[i][threadIdx.x]; + } + __syncthreads(); + } + + if(x < N && y < M) { + C[y * N + x] = res; + } + +} + +} // end of namespace tf --------------------------------------------------------- diff --git a/myxpcs/include/taskflow_/cuda/algorithm/merge.hpp b/myxpcs/include/taskflow_/cuda/algorithm/merge.hpp new file mode 100644 index 0000000..d325491 --- /dev/null +++ b/myxpcs/include/taskflow_/cuda/algorithm/merge.hpp @@ -0,0 +1,585 @@ +#pragma once + +#include "../cudaflow.hpp" + +/** +@file taskflow/cuda/algorithm/merge.hpp +@brief CUDA merge algorithm include file +*/ + +namespace tf::detail { + +/** +@private +@brief merge bound type +*/ +enum class cudaMergeBoundType { + LOWER, + UPPER +}; + +/** @private */ +template<typename T, unsigned N> +struct cudaMergePair { + cudaArray<T, N> keys; + cudaArray<unsigned, N> indices; +}; + +/** @private */ +struct cudaMergeRange { + unsigned a_begin, a_end, b_begin, b_end; + + __device__ unsigned a_count() const { return a_end - a_begin; } + __device__ unsigned b_count() const { return b_end - b_begin; } + __device__ unsigned total() const { return a_count() + b_count(); } + + __device__ cudaRange a_range() const { + return cudaRange { a_begin, a_end }; + } + __device__ cudaRange b_range() const { + return cudaRange { b_begin, b_end }; + } + + __device__ cudaMergeRange to_local() const { + return cudaMergeRange { 0, a_count(), a_count(), total() }; + } + + // Partition from mp to the end. + __device__ cudaMergeRange partition(unsigned mp0, unsigned diag) const { + return cudaMergeRange { a_begin + mp0, a_end, b_begin + diag - mp0, b_end }; + } + + // Partition from mp0 to mp1. + __device__ cudaMergeRange partition(unsigned mp0, unsigned diag0, + unsigned mp1, unsigned diag1) const { + return cudaMergeRange { + a_begin + mp0, + a_begin + mp1, + b_begin + diag0 - mp0, + b_begin + diag1 - mp1 + }; + } + + __device__ bool a_valid() const { + return a_begin < a_end; + } + + __device__ bool b_valid() const { + return b_begin < b_end; + } +}; + +/** @private */ +template< + cudaMergeBoundType bounds = cudaMergeBoundType::LOWER, + typename a_keys_it, typename b_keys_it, typename comp_t +> +__device__ auto cuda_merge_path( + a_keys_it a_keys, unsigned a_count, + b_keys_it b_keys, unsigned b_count, + unsigned diag, comp_t comp +) { + + unsigned beg = (diag > b_count) ? diag - b_count : 0; + unsigned end = diag < a_count ? 
diag : a_count; + + while(beg < end) { + auto mid = (beg + end) / 2; + auto a_key = a_keys[mid]; + auto b_key = b_keys[diag - 1 - mid]; + bool pred = (cudaMergeBoundType::UPPER == bounds) ? + comp(a_key, b_key) : + !comp(b_key, a_key); + + if(pred) beg = mid + 1; + else end = mid; + } + return beg; +} + +/** @private */ +template<cudaMergeBoundType bounds, typename keys_it, typename comp_t> +__device__ auto cuda_merge_path( + keys_it keys, cudaMergeRange range, unsigned diag, comp_t comp +) { + + return cuda_merge_path<bounds>( + keys + range.a_begin, range.a_count(), + keys + range.b_begin, range.b_count(), + diag, comp); +} + +/** @private */ +template<cudaMergeBoundType bounds, bool range_check, typename T, typename comp_t> +__device__ bool cuda_merge_predicate( + T a_key, T b_key, cudaMergeRange range, comp_t comp +) { + + bool p; + if(range_check && !range.a_valid()) { + p = false; + } + else if(range_check && !range.b_valid()) { + p = true; + } + else { + p = (cudaMergeBoundType::UPPER == bounds) ? comp(a_key, b_key) : + !comp(b_key, a_key); + } + return p; +} + +/** @private */ +inline __device__ auto cuda_compute_merge_range( + unsigned a_count, unsigned b_count, + unsigned partition, unsigned spacing, + unsigned mp0, unsigned mp1 +) { + + auto diag0 = spacing * partition; + auto diag1 = min(a_count + b_count, diag0 + spacing); + + return cudaMergeRange { mp0, mp1, diag0 - mp0, diag1 - mp1 }; +} + +/** +@private + +Specialization that emits just one LD instruction. Can only reliably used +with raw pointer types. Fixed not to use pointer arithmetic so that +we don't get undefined behaviors with unaligned types. +*/ +template<unsigned nt, unsigned vt, typename T> +__device__ auto cuda_load_two_streams_reg( + const T* a, unsigned a_count, const T* b, unsigned b_count, unsigned tid +) { + + b -= a_count; + cudaArray<T, vt> x; + cuda_strided_iterate<nt, vt>([&](auto i, auto index) { + const T* p = (index >= a_count) ? b : a; + x[i] = p[index]; + }, tid, a_count + b_count); + + return x; +} + +/** @private */ +template<unsigned nt, unsigned vt, typename T, typename a_it, typename b_it> +__device__ +std::enable_if_t< + !(std::is_pointer<a_it>::value && std::is_pointer<b_it>::value), + cudaArray<T, vt> +> load_two_streams_reg(a_it a, unsigned a_count, b_it b, unsigned b_count, unsigned tid) { + b -= a_count; + cudaArray<T, vt> x; + cuda_strided_iterate<nt, vt>([&](auto i, auto index) { + x[i] = (index < a_count) ? a[index] : b[index]; + }, tid, a_count + b_count); + return x; +} + +/** @private */ +template<unsigned nt, unsigned vt, typename A, typename B, typename T, unsigned S> +__device__ void cuda_load_two_streams_shared(A a, unsigned a_count, + B b, unsigned b_count, unsigned tid, T (&shared)[S], bool sync = true +) { + // Load into register then make an unconditional strided store into memory. 
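+  // Note (editorial assumption): staging through registers lets one strided
+  // pass cover both input streams; the shared-memory copy below is what the
+  // merge-path searches and per-thread serial merges then read.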
+ auto x = cuda_load_two_streams_reg<nt, vt, T>(a, a_count, b, b_count, tid); + cuda_reg_to_shared_strided<nt>(x, tid, shared, sync); +} + +/** @private */ +template<unsigned nt, unsigned vt, typename T> +__device__ auto cuda_gather_two_streams_strided(const T* a, + unsigned a_count, const T* b, unsigned b_count, cudaArray<unsigned, vt> indices, + unsigned tid) { + + ptrdiff_t b_offset = b - a - a_count; + auto count = a_count + b_count; + + cudaArray<T, vt> x; + cuda_strided_iterate<nt, vt>([&](auto i, auto j) { + ptrdiff_t gather = indices[i]; + if(gather >= a_count) gather += b_offset; + x[i] = a[gather]; + }, tid, count); + + return x; +} + +/** @private */ +template<unsigned nt, unsigned vt, typename T, typename a_it, typename b_it> +__device__ +std::enable_if_t< + !(std::is_pointer<a_it>::value && std::is_pointer<b_it>::value), + cudaArray<T, vt> +> cuda_gather_two_streams_strided(a_it a, + unsigned a_count, b_it b, unsigned b_count, cudaArray<unsigned, vt> indices, unsigned tid) { + + b -= a_count; + cudaArray<T, vt> x; + cuda_strided_iterate<nt, vt>([&](auto i, auto j) { + x[i] = (indices[i] < a_count) ? a[indices[i]] : b[indices[i]]; + }, tid, a_count + b_count); + + return x; +} + +/** @private */ +template<unsigned nt, unsigned vt, typename a_it, typename b_it, typename c_it> +__device__ void cuda_transfer_two_streams_strided( + a_it a, unsigned a_count, b_it b, unsigned b_count, + cudaArray<unsigned, vt> indices, unsigned tid, c_it c +) { + + using T = typename std::iterator_traits<a_it>::value_type; + auto x = cuda_gather_two_streams_strided<nt, vt, T>( + a, a_count, b, b_count, indices, tid + ); + + cuda_reg_to_mem_strided<nt>(x, tid, a_count + b_count, c); +} + + +/** +@private + +This function must be able to dereference keys[a_begin] and keys[b_begin], +no matter the indices for each. The caller should allocate at least +nt * vt + 1 elements for +*/ +template<cudaMergeBoundType bounds, unsigned vt, typename T, typename comp_t> +__device__ auto cuda_serial_merge( + const T* keys_shared, cudaMergeRange range, comp_t comp, bool sync = true +) { + + auto a_key = keys_shared[range.a_begin]; + auto b_key = keys_shared[range.b_begin]; + + cudaMergePair<T, vt> merge_pair; + cuda_iterate<vt>([&](auto i) { + bool p = cuda_merge_predicate<bounds, true>(a_key, b_key, range, comp); + auto index = p ? range.a_begin : range.b_begin; + + merge_pair.keys[i] = p ? a_key : b_key; + merge_pair.indices[i] = index; + + T c_key = keys_shared[++index]; + if(p) a_key = c_key, range.a_begin = index; + else b_key = c_key, range.b_begin = index; + }); + + if(sync) __syncthreads(); + return merge_pair; +} + +/** +@private + +Load arrays a and b from global memory and merge unsignedo register. +*/ +template<cudaMergeBoundType bounds, + unsigned nt, unsigned vt, + typename a_it, typename b_it, typename T, typename comp_t, unsigned S +> +__device__ auto block_merge_from_mem( + a_it a, b_it b, cudaMergeRange range_mem, unsigned tid, comp_t comp, T (&keys_shared)[S] +) { + + static_assert(S >= nt * vt + 1, + "block_merge_from_mem requires temporary storage of at " + "least nt * vt + 1 items"); + + // Load the data into shared memory. + cuda_load_two_streams_shared<nt, vt>( + a + range_mem.a_begin, range_mem.a_count(), + b + range_mem.b_begin, range_mem.b_count(), + tid, keys_shared, true + ); + + // Run a merge path to find the start of the serial merge for each thread. 
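+  // Each thread serially merges the vt outputs on the diagonal starting at
+  // vt * tid; the merge path gives the A/B split point where that serial
+  // merge begins.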
+ auto range_local = range_mem.to_local(); + auto diag = vt * tid; + auto mp = cuda_merge_path<bounds>(keys_shared, range_local, diag, comp); + + // Compute the ranges of the sources in shared memory. The end iterators + // of the range are inaccurate, but still facilitate exact merging, because + // only vt elements will be merged. + auto merged = cuda_serial_merge<bounds, vt>( + keys_shared, range_local.partition(mp, diag), comp + ); + + return merged; +}; + +/** @private */ +template<cudaMergeBoundType bounds, + typename P, typename a_keys_it, typename b_keys_it, typename comp_t +> +void cuda_merge_path_partitions( + P&& p, + a_keys_it a, unsigned a_count, + b_keys_it b, unsigned b_count, + unsigned spacing, + comp_t comp, + unsigned* buf +) { + + //int num_partitions = (int)div_up(a_count + b_count, spacing) + 1; + + unsigned num_partitions = (a_count + b_count + spacing - 1) / spacing + 1; + + const unsigned nt = 128; + const unsigned vt = 1; + const unsigned nv = nt * vt; + + unsigned B = (num_partitions + nv - 1) / nv; // nt = 128, vt = 1 + + cuda_kernel<<<B, nt, 0, p.stream()>>>([=]__device__(auto tid, auto bid) { + auto range = cuda_get_tile(bid, nt * vt, num_partitions); + cuda_strided_iterate<nt, vt>([=](auto, auto j) { + auto index = range.begin + j; + auto diag = min(spacing * index, a_count + b_count); + buf[index] = cuda_merge_path<bounds>(a, a_count, b, b_count, diag, comp); + }, tid, range.count()); + }); +} + +//template<typename segments_it> +//auto load_balance_partitions(int64_t dest_count, segments_it segments, +// int num_segments, int spacing, context_t& context) -> +// mem_t<typename std::iterator_traits<segments_it>::value_type> { +// +// typedef typename std::iterator_traits<segments_it>::value_type int_t; +// return merge_path_partitions<bounds_upper>(counting_iterator_t<int_t>(0), +// dest_count, segments, num_segments, spacing, less_t<int_t>(), context); +//} + +//template<bounds_t bounds, typename keys_it> +//mem_t<int> binary_search_partitions(keys_it keys, int count, int num_items, +// int spacing, context_t& context) { +// +// int num_partitions = div_up(count, spacing) + 1; +// mem_t<int> mem(num_partitions, context); +// int* p = mem.data(); +// transform([=]MGPU_DEVICE(int index) { +// int key = min(spacing * index, count); +// p[index] = binary_search<bounds>(keys, num_items, key, less_t<int>()); +// }, num_partitions, context); +// return mem; +//} + +/** @private */ +template< + typename P, + typename a_keys_it, typename a_vals_it, + typename b_keys_it, typename b_vals_it, + typename c_keys_it, typename c_vals_it, + typename comp_t +> +void cuda_merge_loop( + P&& p, + a_keys_it a_keys, a_vals_it a_vals, unsigned a_count, + b_keys_it b_keys, b_vals_it b_vals, unsigned b_count, + c_keys_it c_keys, c_vals_it c_vals, + comp_t comp, + void* ptr +) { + + using E = std::decay_t<P>; + using T = typename std::iterator_traits<a_keys_it>::value_type; + using V = typename std::iterator_traits<a_vals_it>::value_type; + + auto buf = static_cast<unsigned*>(ptr); + + auto has_values = !std::is_same<V, cudaEmpty>::value; + + cuda_merge_path_partitions<cudaMergeBoundType::LOWER>( + p, a_keys, a_count, b_keys, b_count, E::nv, comp, buf + ); + + unsigned B = p.num_blocks(a_count + b_count); + + // we use small kernel + cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) { + + __shared__ union { + T keys[E::nv + 1]; + unsigned indices[E::nv]; + } shared; + + // Load the range for this CTA and merge the values into register. 
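+    // buf was filled by cuda_merge_path_partitions above; mp0 and mp1 are the
+    // merge-path split points at this block's tile boundaries.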
+ auto mp0 = buf[bid + 0]; + auto mp1 = buf[bid + 1]; + auto range = cuda_compute_merge_range(a_count, b_count, bid, E::nv, mp0, mp1); + + auto merge = block_merge_from_mem<cudaMergeBoundType::LOWER, E::nt, E::vt>( + a_keys, b_keys, range, tid, comp, shared.keys + ); + + auto dest_offset = E::nv * bid; + cuda_reg_to_mem_thread<E::nt>( + merge.keys, tid, range.total(), c_keys + dest_offset, shared.keys + ); + + if(has_values) { + // Transpose the indices from thread order to strided order. + auto indices = cuda_reg_thread_to_strided<E::nt>( + merge.indices, tid, shared.indices + ); + + // Gather the input values and merge into the output values. + cuda_transfer_two_streams_strided<E::nt>( + a_vals + range.a_begin, range.a_count(), + b_vals + range.b_begin, range.b_count(), indices, tid, + c_vals + dest_offset + ); + } + }); +} + +} // end of namespace tf::detail --------------------------------------------- + +namespace tf { + +// ---------------------------------------------------------------------------- +// standalone merge algorithms +// ---------------------------------------------------------------------------- + +// Function: merge_bufsz +template <unsigned NT, unsigned VT> +unsigned cudaExecutionPolicy<NT, VT>::merge_bufsz(unsigned a_count, unsigned b_count) { + return sizeof(unsigned) * (num_blocks(a_count + b_count + nv) + 1); +} + + +// ---------------------------------------------------------------------------- +// key-value merge +// ---------------------------------------------------------------------------- + +/** +@brief performs asynchronous key-value merge over a range of keys and values + +@tparam P execution policy type +@tparam a_keys_it first key iterator type +@tparam a_vals_it first value iterator type +@tparam b_keys_it second key iterator type +@tparam b_vals_it second value iterator type +@tparam c_keys_it output key iterator type +@tparam c_vals_it output value iterator type +@tparam C comparator type + +@param p execution policy +@param a_keys_first iterator to the beginning of the first key range +@param a_keys_last iterator to the end of the first key range +@param a_vals_first iterator to the beginning of the first value range +@param b_keys_first iterator to the beginning of the second key range +@param b_keys_last iterator to the end of the second key range +@param b_vals_first iterator to the beginning of the second value range +@param c_keys_first iterator to the beginning of the output key range +@param c_vals_first iterator to the beginning of the output value range +@param comp comparator +@param buf pointer to the temporary buffer + +Performs a key-value merge that copies elements from +<tt>[a_keys_first, a_keys_last)</tt> and <tt>[b_keys_first, b_keys_last)</tt> +into a single range, <tt>[c_keys_first, c_keys_last + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first))</tt> +such that the resulting range is in ascending key order. + +At the same time, the merge copies elements from the two associated ranges +<tt>[a_vals_first + (a_keys_last - a_keys_first))</tt> and +<tt>[b_vals_first + (b_keys_last - b_keys_first))</tt> into a single range, +<tt>[c_vals_first, c_vals_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first))</tt> +such that the resulting range is in ascending order +implied by each input element's associated key. 
+ +For example, assume: + + @c a_keys = {1, 8}; + + @c a_vals = {2, 1}; + + @c b_keys = {3, 7}; + + @c b_vals = {3, 4}; + +After the merge, we have: + + @c c_keys = {1, 3, 7, 8} + + @c c_vals = {2, 3, 4, 1} + +*/ +template< + typename P, + typename a_keys_it, typename a_vals_it, + typename b_keys_it, typename b_vals_it, + typename c_keys_it, typename c_vals_it, + typename C +> +void cuda_merge_by_key( + P&& p, + a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, + b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, + c_keys_it c_keys_first, c_vals_it c_vals_first, + C comp, + void* buf +) { + + unsigned a_count = std::distance(a_keys_first, a_keys_last); + unsigned b_count = std::distance(b_keys_first, b_keys_last); + + if(a_count + b_count == 0) { + return; + } + + detail::cuda_merge_loop(p, + a_keys_first, a_vals_first, a_count, + b_keys_first, b_vals_first, b_count, + c_keys_first, c_vals_first, comp, + buf + ); +} + +// ---------------------------------------------------------------------------- +// key-only merge +// ---------------------------------------------------------------------------- + +/** +@brief performs asynchronous key-only merge over a range of keys + +@tparam P execution policy type +@tparam a_keys_it first key iterator type +@tparam b_keys_it second key iterator type +@tparam c_keys_it output key iterator type +@tparam C comparator type + +@param p execution policy +@param a_keys_first iterator to the beginning of the first key range +@param a_keys_last iterator to the end of the first key range +@param b_keys_first iterator to the beginning of the second key range +@param b_keys_last iterator to the end of the second key range +@param c_keys_first iterator to the beginning of the output key range +@param comp comparator +@param buf pointer to the temporary buffer + +This function is equivalent to tf::cuda_merge_by_key without values. 
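+
+The temporary storage requirement is the same as that of tf::cuda_merge_by_key;
+cudaExecutionPolicy::merge_bufsz(a_count, b_count) reports the number of bytes
+required for @c buf.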
+ +*/ +template<typename P, + typename a_keys_it, typename b_keys_it, typename c_keys_it, typename C +> +void cuda_merge( + P&& p, + a_keys_it a_keys_first, a_keys_it a_keys_last, + b_keys_it b_keys_first, b_keys_it b_keys_last, + c_keys_it c_keys_first, + C comp, + void* buf +) { + cuda_merge_by_key( + p, + a_keys_first, a_keys_last, (const cudaEmpty*)nullptr, + b_keys_first, b_keys_last, (const cudaEmpty*)nullptr, + c_keys_first, (cudaEmpty*)nullptr, comp, + buf + ); +} + + +} // end of namespace tf ----------------------------------------------------- diff --git a/myxpcs/include/taskflow_/cuda/algorithm/reduce.hpp b/myxpcs/include/taskflow_/cuda/algorithm/reduce.hpp new file mode 100644 index 0000000..d6ba332 --- /dev/null +++ b/myxpcs/include/taskflow_/cuda/algorithm/reduce.hpp @@ -0,0 +1,460 @@ +#pragma once + +#include "../cudaflow.hpp" + +/** +@file taskflow/cuda/algorithm/reduce.hpp +@brief cuda reduce algorithms include file +*/ + +namespace tf::detail { + +// ---------------------------------------------------------------------------- +// reduction helper functions +// ---------------------------------------------------------------------------- + +/** @private */ +template<unsigned nt, typename T> +struct cudaBlockReduce { + + static const unsigned group_size = std::min(nt, CUDA_WARP_SIZE); + static const unsigned num_passes = log2(group_size); + static const unsigned num_items = nt / group_size; + + static_assert( + nt && (0 == nt % CUDA_WARP_SIZE), + "cudaBlockReduce requires num threads to be a multiple of warp_size (32)" + ); + + /** @private */ + struct Storage { + T data[std::max(nt, 2 * group_size)]; + }; + + template<typename op_t> + __device__ T operator()(unsigned, T, Storage&, unsigned, op_t, bool = true) const; +}; + +// function: reduce to be called from a block +template<unsigned nt, typename T> +template<typename op_t> +__device__ T cudaBlockReduce<nt, T>::operator ()( + unsigned tid, T x, Storage& storage, unsigned count, op_t op, bool ret +) const { + + // Store your data into shared memory. + storage.data[tid] = x; + __syncthreads(); + + if(tid < group_size) { + // Each thread scans within its lane. + cuda_strided_iterate<group_size, num_items>([&](auto i, auto j) { + if(i > 0) { + x = op(x, storage.data[j]); + } + }, tid, count); + storage.data[tid] = x; + } + __syncthreads(); + + auto count2 = count < group_size ? count : group_size; + auto first = (1 & num_passes) ? 
group_size : 0; + if(tid < group_size) { + storage.data[first + tid] = x; + } + __syncthreads(); + + cuda_iterate<num_passes>([&](auto pass) { + if(tid < group_size) { + if(auto offset = 1 << pass; tid + offset < count2) { + x = op(x, storage.data[first + offset + tid]); + } + first = group_size - first; + storage.data[first + tid] = x; + } + __syncthreads(); + }); + + if(ret) { + x = storage.data[0]; + __syncthreads(); + } + return x; +} + +// ---------------------------------------------------------------------------- +// cuda_reduce +// ---------------------------------------------------------------------------- + +/** +@private +*/ +template <size_t nt, size_t vt, typename I, typename T, typename O> +__global__ void cuda_reduce_kernel( + I input, unsigned count, T* res, O op, void* ptr +) { + + using U = typename std::iterator_traits<I>::value_type; + + __shared__ typename cudaBlockReduce<nt, U>::Storage shm; + + auto tid = threadIdx.x; + auto bid = blockIdx.x; + auto tile = cuda_get_tile(bid, nt*vt, count); + auto x = cuda_mem_to_reg_strided<nt, vt>( + input + tile.begin, tid, tile.count() + ); + + // reduce multiple values per thread into a scalar. + U s; + cuda_strided_iterate<nt, vt>( + [&] (auto i, auto) { s = i ? op(s, x[i]) : x[0]; }, tid, tile.count() + ); + // reduce to a scalar per block. + s = cudaBlockReduce<nt, U>()( + tid, s, shm, (tile.count() < nt ? tile.count() : nt), op, false + ); + + if(!tid) { + auto buf = static_cast<U*>(ptr); + (count <= nt*vt) ? *res = op(*res, s) : buf[bid] = s; + } +} + +/** @private */ +template <typename P, typename I, typename T, typename O> +void cuda_reduce_loop( + P&& p, I input, unsigned count, T* res, O op, void* ptr +) { + + using U = typename std::iterator_traits<I>::value_type; + using E = std::decay_t<P>; + + auto buf = static_cast<U*>(ptr); + auto B = E::num_blocks(count); + + cuda_reduce_kernel<E::nt, E::vt><<<B, E::nt, 0, p.stream()>>>( + input, count, res, op, ptr + ); + + if(B > 1) { + cuda_reduce_loop(p, buf, B, res, op, buf+B); + } +} + +// ---------------------------------------------------------------------------- +// cuda_uninitialized_reduce +// ---------------------------------------------------------------------------- + +/** +@private +*/ +template <size_t nt, size_t vt, typename I, typename T, typename O> +__global__ void cuda_uninitialized_reduce_kernel( + I input, unsigned count, T* res, O op, void* ptr +) { + + using U = typename std::iterator_traits<I>::value_type; + + __shared__ typename cudaBlockReduce<nt, U>::Storage shm; + + auto tid = threadIdx.x; + auto bid = blockIdx.x; + auto tile = cuda_get_tile(bid, nt*vt, count); + auto x = cuda_mem_to_reg_strided<nt, vt>( + input + tile.begin, tid, tile.count() + ); + + // reduce multiple values per thread into a scalar. + U s; + cuda_strided_iterate<nt, vt>( + [&] (auto i, auto) { s = i ? op(s, x[i]) : x[0]; }, tid, tile.count() + ); + + // reduce to a scalar per block. + s = cudaBlockReduce<nt, U>()( + tid, s, shm, (tile.count() < nt ? tile.count() : nt), op, false + ); + + if(!tid) { + auto buf = static_cast<U*>(ptr); + (count <= nt*vt) ? 
*res = s : buf[bid] = s; + } +} + +/** +@private +*/ +template <typename P, typename I, typename T, typename O> +void cuda_uninitialized_reduce_loop( + P&& p, I input, unsigned count, T* res, O op, void* ptr +) { + + using U = typename std::iterator_traits<I>::value_type; + using E = std::decay_t<P>; + + auto buf = static_cast<U*>(ptr); + auto B = (count + E::nv - 1) / E::nv; + + cuda_uninitialized_reduce_kernel<E::nt, E:: vt><<<B, E::nt, 0, p.stream()>>>( + input, count, res, op, buf + ); + + if(B > 1) { + cuda_uninitialized_reduce_loop(p, buf, B, res, op, buf+B); + } +} + +} // namespace tf::detail ---------------------------------------------------- + +namespace tf { + +// Function: reduce_bufsz +template <unsigned NT, unsigned VT> +template <typename T> +unsigned cudaExecutionPolicy<NT, VT>::reduce_bufsz(unsigned count) { + unsigned B = num_blocks(count); + unsigned n = 0; + while(B > 1) { + n += B; + B = num_blocks(B); + } + return n*sizeof(T); +} + +// ---------------------------------------------------------------------------- +// cuda_reduce +// ---------------------------------------------------------------------------- + +/** +@brief performs asynchronous parallel reduction over a range of items + +@tparam P execution policy type +@tparam I input iterator type +@tparam T value type +@tparam O binary operator type + +@param p execution policy +@param first iterator to the beginning of the range +@param last iterator to the end of the range +@param res pointer to the result +@param op binary operator to apply to reduce elements +@param buf pointer to the temporary buffer + +This method is equivalent to the parallel execution of the following loop on a GPU: + +@code{.cpp} +while (first != last) { + *result = op(*result, *first++); +} +@endcode + */ +template <typename P, typename I, typename T, typename O> +void cuda_reduce( + P&& p, I first, I last, T* res, O op, void* buf +) { + unsigned count = std::distance(first, last); + if(count == 0) { + return; + } + detail::cuda_reduce_loop(p, first, count, res, op, buf); +} + +// ---------------------------------------------------------------------------- +// cuda_uninitialized_reduce +// ---------------------------------------------------------------------------- + +/** +@brief performs asynchronous parallel reduction over a range of items without + an initial value + +@tparam P execution policy type +@tparam I input iterator type +@tparam T value type +@tparam O binary operator type + +@param p execution policy +@param first iterator to the beginning of the range +@param last iterator to the end of the range +@param res pointer to the result +@param op binary operator to apply to reduce elements +@param buf pointer to the temporary buffer + +This method is equivalent to the parallel execution of the following loop +on a GPU: + +@code{.cpp} +*result = *first++; // no initial values partitipcate in the loop +while (first != last) { + *result = op(*result, *first++); +} +@endcode +*/ +template <typename P, typename I, typename T, typename O> +void cuda_uninitialized_reduce( + P&& p, I first, I last, T* res, O op, void* buf +) { + unsigned count = std::distance(first, last); + if(count == 0) { + return; + } + detail::cuda_uninitialized_reduce_loop(p, first, count, res, op, buf); +} + +// ---------------------------------------------------------------------------- +// transform_reduce +// ---------------------------------------------------------------------------- + +/** +@brief performs asynchronous parallel reduction over a range of 
transformed items + without an initial value + +@tparam P execution policy type +@tparam I input iterator type +@tparam T value type +@tparam O binary operator type +@tparam U unary operator type + +@param p execution policy +@param first iterator to the beginning of the range +@param last iterator to the end of the range +@param res pointer to the result +@param bop binary operator to apply to reduce elements +@param uop unary operator to apply to transform elements +@param buf pointer to the temporary buffer + +This method is equivalent to the parallel execution of the following loop on a GPU: + +@code{.cpp} +while (first != last) { + *result = bop(*result, uop(*first++)); +} +@endcode +*/ +template<typename P, typename I, typename T, typename O, typename U> +void cuda_transform_reduce( + P&& p, I first, I last, T* res, O bop, U uop, void* buf +) { + + unsigned count = std::distance(first, last); + + if(count == 0) { + return; + } + + // reduction loop + detail::cuda_reduce_loop(p, + cuda_make_load_iterator<T>([=]__device__(auto i){ + return uop(*(first+i)); + }), + count, res, bop, buf + ); +} + +// ---------------------------------------------------------------------------- +// transform_uninitialized_reduce +// ---------------------------------------------------------------------------- + +/** +@brief performs asynchronous parallel reduction over a range of transformed items + with an initial value + +@tparam P execution policy type +@tparam I input iterator type +@tparam T value type +@tparam O binary operator type +@tparam U unary operator type + +@param p execution policy +@param first iterator to the beginning of the range +@param last iterator to the end of the range +@param res pointer to the result +@param bop binary operator to apply to reduce elements +@param uop unary operator to apply to transform elements +@param buf pointer to the temporary buffer + +This method is equivalent to the parallel execution of the following loop +on a GPU: + +@code{.cpp} +*result = uop(*first++); // no initial values partitipcate in the loop +while (first != last) { + *result = bop(*result, uop(*first++)); +} +@endcode +*/ +template<typename P, typename I, typename T, typename O, typename U> +void cuda_uninitialized_transform_reduce( + P&& p, I first, I last, T* res, O bop, U uop, void* buf +) { + + unsigned count = std::distance(first, last); + + if(count == 0) { + return; + } + + detail::cuda_uninitialized_reduce_loop(p, + cuda_make_load_iterator<T>([=]__device__(auto i){ return uop(*(first+i)); }), + count, res, bop, buf + ); +} + +// ---------------------------------------------------------------------------- + +//template <typename T, typename C> +//__device__ void cuda_warp_reduce( +// volatile T* shm, size_t N, size_t tid, C op +//) { +// if(tid + 32 < N) shm[tid] = op(shm[tid], shm[tid+32]); +// if(tid + 16 < N) shm[tid] = op(shm[tid], shm[tid+16]); +// if(tid + 8 < N) shm[tid] = op(shm[tid], shm[tid+8]); +// if(tid + 4 < N) shm[tid] = op(shm[tid], shm[tid+4]); +// if(tid + 2 < N) shm[tid] = op(shm[tid], shm[tid+2]); +// if(tid + 1 < N) shm[tid] = op(shm[tid], shm[tid+1]); +//} +// +//template <typename I, typename T, typename C, bool uninitialized> +//__global__ void cuda_reduce(I first, size_t N, T* res, C op) { +// +// size_t tid = threadIdx.x; +// +// if(tid >= N) { +// return; +// } +// +// cudaSharedMemory<T> shared_memory; +// T* shm = shared_memory.get(); +// +// shm[tid] = *(first+tid); +// +// for(size_t i=tid+blockDim.x; i<N; i+=blockDim.x) { +// shm[tid] = op(shm[tid], 
*(first+i)); +// } +// +// __syncthreads(); +// +// for(size_t s = blockDim.x / 2; s > 32; s >>= 1) { +// if(tid < s && tid + s < N) { +// shm[tid] = op(shm[tid], shm[tid+s]); +// } +// __syncthreads(); +// } +// +// if(tid < 32) { +// cuda_warp_reduce(shm, N, tid, op); +// } +// +// if(tid == 0) { +// if constexpr (uninitialized) { +// *res = shm[0]; +// } +// else { +// *res = op(*res, shm[0]); +// } +// } +//} + + +} // end of namespace tf ----------------------------------------------------- + diff --git a/myxpcs/include/taskflow_/cuda/algorithm/scan.hpp b/myxpcs/include/taskflow_/cuda/algorithm/scan.hpp new file mode 100644 index 0000000..bce0d63 --- /dev/null +++ b/myxpcs/include/taskflow_/cuda/algorithm/scan.hpp @@ -0,0 +1,488 @@ +#pragma once + +#include "reduce.hpp" + +/** +@file taskflow/cuda/algorithm/scan.hpp +@brief CUDA scan algorithm include file +*/ + +namespace tf::detail { + +// ---------------------------------------------------------------------------- +// scan +// ---------------------------------------------------------------------------- + +/** @private */ +inline constexpr unsigned cudaScanRecursionThreshold = 8; + +/** @private */ +enum class cudaScanType : int { + EXCLUSIVE = 1, + INCLUSIVE +}; + +/** @private */ +template<typename T, unsigned vt = 0, bool is_array = (vt > 0)> +struct cudaScanResult { + T scan; + T reduction; +}; + +/** @private */ +template<typename T, unsigned vt> +struct cudaScanResult<T, vt, true> { + cudaArray<T, vt> scan; + T reduction; +}; + +//----------------------------------------------------------------------------- + +/** @private */ +template<unsigned nt, typename T> +struct cudaBlockScan { + + const static unsigned num_warps = nt / CUDA_WARP_SIZE; + const static unsigned num_passes = log2(nt); + const static unsigned capacity = nt + num_warps; + + /** @private */ + union storage_t { + T data[2 * nt]; + struct { T threads[nt], warps[num_warps]; }; + }; + + // standard scan + template<typename op_t> + __device__ cudaScanResult<T> operator ()( + unsigned tid, + T x, + storage_t& storage, + unsigned count = nt, + op_t op = op_t(), + T init = T(), + cudaScanType type = cudaScanType::EXCLUSIVE + ) const; + + // vectorized scan. accepts multiple values per thread and adds in + // optional global carry-in + template<unsigned vt, typename op_t> + __device__ cudaScanResult<T, vt> operator()( + unsigned tid, + cudaArray<T, vt> x, + storage_t& storage, + T carry_in = T(), + bool use_carry_in = false, + unsigned count = nt, + op_t op = op_t(), + T init = T(), + cudaScanType type = cudaScanType::EXCLUSIVE + ) const; +}; + +// standard scan +template <unsigned nt, typename T> +template<typename op_t> +__device__ cudaScanResult<T> cudaBlockScan<nt, T>::operator () ( + unsigned tid, T x, storage_t& storage, unsigned count, op_t op, + T init, cudaScanType type +) const { + + unsigned first = 0; + storage.data[first + tid] = x; + __syncthreads(); + + cuda_iterate<num_passes>([&](auto pass) { + if(auto offset = 1<<pass; tid >= offset) { + x = op(storage.data[first + tid - offset], x); + } + first = nt - first; + storage.data[first + tid] = x; + __syncthreads(); + }); + + cudaScanResult<T> result; + result.reduction = storage.data[first + count - 1]; + result.scan = (tid < count) ? + (cudaScanType::INCLUSIVE == type ? x : + (tid ? 
storage.data[first + tid - 1] : init)) : + result.reduction; + __syncthreads(); + + return result; +} + +// vectorized scan block +template <unsigned nt, typename T> +template<unsigned vt, typename op_t> +__device__ cudaScanResult<T, vt> cudaBlockScan<nt, T>::operator()( + unsigned tid, + cudaArray<T, vt> x, + storage_t& storage, + T carry_in, + bool use_carry_in, + unsigned count, op_t op, + T init, + cudaScanType type +) const { + + // Start with an inclusive scan of the in-range elements. + if(count >= nt * vt) { + cuda_iterate<vt>([&](auto i) { + x[i] = i ? op(x[i], x[i - 1]) : x[i]; + }); + } else { + cuda_iterate<vt>([&](auto i) { + auto index = vt * tid + i; + x[i] = i ? + ((index < count) ? op(x[i], x[i - 1]) : x[i - 1]) : + (x[i] = (index < count) ? x[i] : init); + }); + } + + // Scan the thread-local reductions for a carry-in for each thread. + auto result = operator()( + tid, x[vt - 1], storage, + (count + vt - 1) / vt, op, init, cudaScanType::EXCLUSIVE + ); + + // Perform the scan downsweep and add both the global carry-in and the + // thread carry-in to the values. + if(use_carry_in) { + result.reduction = op(carry_in, result.reduction); + result.scan = tid ? op(carry_in, result.scan) : carry_in; + } else { + use_carry_in = tid > 0; + } + + cudaArray<T, vt> y; + cuda_iterate<vt>([&](auto i) { + if(cudaScanType::EXCLUSIVE == type) { + y[i] = i ? x[i - 1] : result.scan; + if(use_carry_in && i > 0) y[i] = op(result.scan, y[i]); + } else { + y[i] = use_carry_in ? op(x[i], result.scan) : x[i]; + } + }); + + return cudaScanResult<T, vt> { y, result.reduction }; +} + +/** +@private +@brief single-pass scan for small input + */ +template <typename P, typename I, typename O, typename C> +void cuda_single_pass_scan( + P&& p, + cudaScanType scan_type, + I input, + unsigned count, + O output, + C op + //reduction_it reduction, +) { + + using T = typename std::iterator_traits<O>::value_type; + using E = std::decay_t<P>; + + // Small input specialization. This is the non-recursive branch. + cuda_kernel<<<1, E::nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) { + + using scan_t = cudaBlockScan<E::nt, T>; + + __shared__ union { + typename scan_t::storage_t scan; + T values[E::nv]; + } shared; + + auto carry_in = T(); + for(unsigned cur = 0; cur < count; cur += E::nv) { + // Cooperatively load values into register. + auto count2 = min(count - cur, E::nv); + + auto x = cuda_mem_to_reg_thread<E::nt, E::vt>(input + cur, + tid, count2, shared.values); + + auto result = scan_t()(tid, x, shared.scan, + carry_in, cur > 0, count2, op, T(), scan_type); + + // Store the scanned values back to global memory. + cuda_reg_to_mem_thread<E::nt, E::vt>(result.scan, tid, count2, + output + cur, shared.values); + + // Roll the reduction into carry_in. + carry_in = result.reduction; + } + + // Store the carry-out to the reduction pointer. This may be a + // discard_iterator_t if no reduction is wanted. 
+ //if(!tid) *reduction = carry_in; + }); +} + +/** +@private + +@brief main scan loop +*/ +template<typename P, typename I, typename O, typename C> +void cuda_scan_loop( + P&& p, + cudaScanType scan_type, + I input, + unsigned count, + O output, + C op, + //reduction_it reduction, + void* ptr +) { + + using E = std::decay_t<P>; + using T = typename std::iterator_traits<O>::value_type; + + T* buffer = static_cast<T*>(ptr); + + //launch_t::cta_dim(context).B(count); + unsigned B = (count + E::nv - 1) / E::nv; + + if(B > cudaScanRecursionThreshold) { + + //cudaDeviceVector<T> partials(B); + //auto buffer = partials.data(); + + // upsweep phase + cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) { + + __shared__ typename cudaBlockReduce<E::nt, T>::Storage shm; + + // Load the tile's data into register. + auto tile = cuda_get_tile(bid, E::nv, count); + auto x = cuda_mem_to_reg_strided<E::nt, E::vt>( + input + tile.begin, tid, tile.count() + ); + + // Reduce the thread's values into a scalar. + T scalar; + cuda_strided_iterate<E::nt, E::vt>( + [&] (auto i, auto j) { scalar = i ? op(scalar, x[i]) : x[0]; }, + tid, tile.count() + ); + + // Reduce across all threads. + auto all_reduce = cudaBlockReduce<E::nt, T>()( + tid, scalar, shm, tile.count(), op + ); + + // Store the final reduction to the partials. + if(!tid) { + buffer[bid] = all_reduce; + } + }); + + // recursively call scan + //cuda_scan_loop(p, cudaScanType::EXCLUSIVE, buffer, B, buffer, op, S); + cuda_scan_loop( + p, cudaScanType::EXCLUSIVE, buffer, B, buffer, op, buffer+B + ); + + // downsweep: perform an intra-tile scan and add the scan of the partials + // as carry-in + cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) { + + using scan_t = cudaBlockScan<E::nt, T>; + + __shared__ union { + typename scan_t::storage_t scan; + T values[E::nv]; + } shared; + + // Load a tile to register in thread order. + auto tile = cuda_get_tile(bid, E::nv, count); + auto x = cuda_mem_to_reg_thread<E::nt, E::vt>( + input + tile.begin, tid, tile.count(), shared.values + ); + + // Scan the array with carry-in from the partials. + auto y = scan_t()(tid, x, shared.scan, + buffer[bid], bid > 0, tile.count(), op, T(), + scan_type).scan; + + // Store the scanned values to the output. + cuda_reg_to_mem_thread<E::nt, E::vt>( + y, tid, tile.count(), output + tile.begin, shared.values + ); + }); + } + // Small input specialization. This is the non-recursive branch. 
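+  // (Reaching here means at most cudaScanRecursionThreshold blocks would be
+  // needed, so one block scans the whole input in E::nv-sized chunks,
+  // carrying the running reduction between chunks.)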
+ else { + cuda_single_pass_scan(p, scan_type, input, count, output, op); + } +} + +} // namespace tf::detail ---------------------------------------------------- + +namespace tf { + +// Function: scan_bufsz +template <unsigned NT, unsigned VT> +template <typename T> +unsigned cudaExecutionPolicy<NT, VT>::scan_bufsz(unsigned count) { + unsigned B = num_blocks(count); + unsigned n = 0; + for(auto b=B; b>detail::cudaScanRecursionThreshold; b=num_blocks(b)) { + n += b; + } + return n*sizeof(T); +} + + +/** +@brief performs asynchronous inclusive scan over a range of items + +@tparam P execution policy type +@tparam I input iterator +@tparam O output iterator +@tparam C binary operator type + +@param p execution policy +@param first iterator to the beginning of the input range +@param last iterator to the end of the input range +@param output iterator to the beginning of the output range +@param op binary operator to apply to scan +@param buf pointer to the temporary buffer + +*/ +template<typename P, typename I, typename O, typename C> +void cuda_inclusive_scan( + P&& p, I first, I last, O output, C op, void* buf +) { + + unsigned count = std::distance(first, last); + + if(count == 0) { + return; + } + + // launch the scan loop + detail::cuda_scan_loop( + p, detail::cudaScanType::INCLUSIVE, first, count, output, op, buf + ); +} + +/** +@brief performs asynchronous inclusive scan over a range of transformed items + +@tparam P execution policy type +@tparam I input iterator +@tparam O output iterator +@tparam C binary operator type +@tparam U unary operator type + +@param p execution policy +@param first iterator to the beginning of the input range +@param last iterator to the end of the input range +@param output iterator to the beginning of the output range +@param bop binary operator to apply to scan +@param uop unary operator to apply to transform each item before scan +@param buf pointer to the temporary buffer + +*/ +template<typename P, typename I, typename O, typename C, typename U> +void cuda_transform_inclusive_scan( + P&& p, I first, I last, O output, C bop, U uop, void* buf +) { + + using T = typename std::iterator_traits<O>::value_type; + + unsigned count = std::distance(first, last); + + if(count == 0) { + return; + } + + // launch the scan loop + detail::cuda_scan_loop( + p, detail::cudaScanType::INCLUSIVE, + cuda_make_load_iterator<T>([=]__device__(auto i){ return uop(*(first+i)); }), + count, output, bop, buf + ); +} + +/** +@brief performs asynchronous exclusive scan over a range of items + +@tparam P execution policy type +@tparam I input iterator +@tparam O output iterator +@tparam C binary operator type + +@param p execution policy +@param first iterator to the beginning of the input range +@param last iterator to the end of the input range +@param output iterator to the beginning of the output range +@param op binary operator to apply to scan +@param buf pointer to the temporary buffer + +*/ +template<typename P, typename I, typename O, typename C> +void cuda_exclusive_scan( + P&& p, I first, I last, O output, C op, void* buf +) { + + unsigned count = std::distance(first, last); + + if(count == 0) { + return; + } + + // launch the scan loop + detail::cuda_scan_loop( + p, detail::cudaScanType::EXCLUSIVE, first, count, output, op, buf + ); +} + +/** +@brief performs asynchronous exclusive scan over a range of items + +@tparam P execution policy type +@tparam I input iterator +@tparam O output iterator +@tparam C binary operator type +@tparam U unary operator type + +@param p 
execution policy +@param first iterator to the beginning of the input range +@param last iterator to the end of the input range +@param output iterator to the beginning of the output range +@param bop binary operator to apply to scan +@param uop unary operator to apply to transform each item before scan +@param buf pointer to the temporary buffer + +*/ +template<typename P, typename I, typename O, typename C, typename U> +void cuda_transform_exclusive_scan( + P&& p, I first, I last, O output, C bop, U uop, void* buf +) { + + using T = typename std::iterator_traits<O>::value_type; + + unsigned count = std::distance(first, last); + + if(count == 0) { + return; + } + + // launch the scan loop + detail::cuda_scan_loop( + p, detail::cudaScanType::EXCLUSIVE, + cuda_make_load_iterator<T>([=]__device__(auto i){ return uop(*(first+i)); }), + count, output, bop, buf + ); +} + + +} // end of namespace tf ----------------------------------------------------- + + + diff --git a/myxpcs/include/taskflow_/cuda/algorithm/sort.hpp b/myxpcs/include/taskflow_/cuda/algorithm/sort.hpp new file mode 100644 index 0000000..3cc01d5 --- /dev/null +++ b/myxpcs/include/taskflow_/cuda/algorithm/sort.hpp @@ -0,0 +1,506 @@ +#pragma once + +#include "merge.hpp" + +/** +@file taskflow/cuda/algorithm/sort.hpp +@brief CUDA sort algorithm include file +*/ + +namespace tf::detail { + +// ---------------------------------------------------------------------------- +// odd-even sort in register +// ---------------------------------------------------------------------------- + +/** +@private +@brief counts the number of leading zeros starting from the most significant bit +*/ +constexpr int cuda_clz(int x) { + for(int i = 31; i >= 0; --i) { + if((1<< i) & x) { + return 31 - i; + } + } + return 32; +} + +/** +@private +@brief finds log2(x) and optionally round up to the next integer logarithm. 
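+
+For example, cuda_find_log2(5) returns 2, while cuda_find_log2(5, true) rounds
+up and returns 3.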
+*/ +constexpr int cuda_find_log2(int x, bool round_up = false) { + int a = 31 - cuda_clz(x); + if(round_up) { + a += !is_pow2(x); + } + return a; +} + +/** @private */ +template<typename T, unsigned vt, typename C> +__device__ auto cuda_odd_even_sort( + cudaArray<T, vt> x, C comp, int flags = 0 +) { + cuda_iterate<vt>([&](auto I) { + #pragma unroll + for(auto i = 1 & I; i < vt - 1; i += 2) { + if((0 == ((2<< i) & flags)) && comp(x[i + 1], x[i])) + cuda_swap(x[i], x[i + 1]); + } + }); + return x; +} + +/** @private */ +template<typename K, typename V, unsigned vt, typename C> +__device__ auto cuda_odd_even_sort( + cudaKVArray<K, V, vt> x, C comp, int flags = 0 +) { + cuda_iterate<vt>([&](auto I) { + #pragma unroll + for(auto i = 1 & I; i < vt - 1; i += 2) { + if((0 == ((2<< i) & flags)) && comp(x.keys[i + 1], x.keys[i])) { + cuda_swap(x.keys[i], x.keys[i + 1]); + cuda_swap(x.vals[i], x.vals[i + 1]); + } + } + }); + return x; +} + +// ---------------------------------------------------------------------------- +// range check +// ---------------------------------------------------------------------------- + +/** @private */ +__device__ inline int cuda_out_of_range_flags(int first, int vt, int count) { + int out_of_range = min(vt, first + vt - count); + int head_flags = 0; + if(out_of_range > 0) { + const int mask = (1<< vt) - 1; + head_flags = mask & (~mask>> out_of_range); + } + return head_flags; +} + +/** @private */ +__device__ inline auto cuda_compute_merge_sort_frame( + unsigned partition, unsigned coop, unsigned spacing +) { + + unsigned size = spacing * (coop / 2); + unsigned start = ~(coop - 1) & partition; + unsigned a_begin = spacing * start; + unsigned b_begin = spacing * start + size; + + return cudaMergeRange { + a_begin, + a_begin + size, + b_begin, + b_begin + size + }; +} + +/** @private */ +__device__ inline auto cuda_compute_merge_sort_range( + unsigned count, unsigned partition, unsigned coop, unsigned spacing +) { + + auto frame = cuda_compute_merge_sort_frame(partition, coop, spacing); + + return cudaMergeRange { + frame.a_begin, + min(count, frame.a_end), + min(count, frame.b_begin), + min(count, frame.b_end) + }; +} + +/** @private */ +__device__ inline auto cuda_compute_merge_sort_range( + unsigned count, unsigned partition, unsigned coop, unsigned spacing, + unsigned mp0, unsigned mp1 +) { + + auto range = cuda_compute_merge_sort_range(count, partition, coop, spacing); + + // Locate the diagonal from the start of the A sublist. + unsigned diag = spacing * partition - range.a_begin; + + // The end partition of the last cta for each merge operation is computed + // and stored as the begin partition for the subsequent merge. i.e. it is + // the same partition but in the wrong coordinate system, so its 0 when it + // should be listSize. Correct that by checking if this is the last cta + // in this merge operation. 
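+  // mp0/mp1 are precomputed merge-path split points (see
+  // cuda_merge_sort_partitions below); they tighten the begin/end of the
+  // A and B sublists to exactly this partition's share of the merge.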
+ if(coop - 1 != ((coop - 1) & partition)) { + range.a_end = range.a_begin + mp1; + range.b_end = min(count, range.b_begin + diag + spacing - mp1); + } + + range.a_begin = range.a_begin + mp0; + range.b_begin = min(count, range.b_begin + diag - mp0); + + return range; +} + +/** @private */ +template<unsigned nt, unsigned vt, typename K, typename V> +struct cudaBlockSort { + + static constexpr bool has_values = !std::is_same<V, cudaEmpty>::value; + static constexpr unsigned num_passes = log2(nt); + + /** @private */ + union Storage { + K keys[nt * vt + 1]; + V vals[nt * vt]; + }; + + static_assert(is_pow2(nt), "cudaBlockSort requires pow2 number of threads"); + + template<typename C> + __device__ auto merge_pass( + cudaKVArray<K, V, vt> x, + unsigned tid, unsigned count, unsigned pass, + C comp, Storage& storage + ) const { + + // Divide the CTA's keys into lists. + unsigned coop = 2 << pass; + auto range = cuda_compute_merge_sort_range(count, tid, coop, vt); + unsigned diag = vt * tid - range.a_begin; + + // Store the keys into shared memory for searching. + cuda_reg_to_shared_thread<nt, vt>(x.keys, tid, storage.keys); + + // Search for the merge path for this thread within its list. + auto mp = cuda_merge_path<cudaMergeBoundType::LOWER>( + storage.keys, range, diag, comp + ); + + // Run a serial merge and return. + auto merge = cuda_serial_merge<cudaMergeBoundType::LOWER, vt>( + storage.keys, range.partition(mp, diag), comp + ); + x.keys = merge.keys; + + if(has_values) { + // Reorder values through shared memory. + cuda_reg_to_shared_thread<nt, vt>(x.vals, tid, storage.vals); + x.vals = cuda_shared_gather<nt, vt>(storage.vals, merge.indices); + } + + return x; + } + + template<typename C> + __device__ auto block_sort(cudaKVArray<K, V, vt> x, + unsigned tid, unsigned count, C comp, Storage& storage + ) const { + + // Sort the inputs within each thread. If any threads have fewer than + // vt items, use the segmented sort network to prevent out-of-range + // elements from contaminating the sort. + if(count < nt * vt) { + auto head_flags = cuda_out_of_range_flags(vt * tid, vt, count); + x = cuda_odd_even_sort(x, comp, head_flags); + } else { + x = cuda_odd_even_sort(x, comp); + } + + // Merge threads starting with a pair until all values are merged. 
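+    // Each pass doubles the cooperation width (coop = 2 << pass inside
+    // merge_pass), so after num_passes = log2(nt) passes the whole tile of
+    // nt * vt items held by this block is sorted.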
+ for(unsigned pass = 0; pass < num_passes; ++pass) { + x = merge_pass(x, tid, count, pass, comp, storage); + } + + return x; + } +}; + +/** @private */ +template<typename P, typename K, typename C> +void cuda_merge_sort_partitions( + P&& p, K keys, unsigned count, + unsigned coop, unsigned spacing, C comp, unsigned* buf +) { + + // bufer size is num_partitions + 1 + unsigned num_partitions = (count + spacing - 1) / spacing + 1; + + const unsigned nt = 128; + const unsigned vt = 1; + const unsigned nv = nt * vt; + + unsigned B = (num_partitions + nv - 1) / nv; // nt = 128, vt = 1 + + cuda_kernel<<<B, nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) { + auto range = cuda_get_tile(bid, nt * vt, num_partitions); + cuda_strided_iterate<nt, vt>([=](auto, auto j) { + auto index = j + range.begin; + auto range = cuda_compute_merge_sort_range(count, index, coop, spacing); + auto diag = min(spacing * index, count) - range.a_begin; + buf[index] = cuda_merge_path<cudaMergeBoundType::LOWER>( + keys + range.a_begin, range.a_count(), + keys + range.b_begin, range.b_count(), + diag, comp + ); + }, tid, range.count()); + }); +} + +/** @private */ +template<typename P, typename K_it, typename V_it, typename C> +void merge_sort_loop( + P&& p, K_it keys_input, V_it vals_input, unsigned count, C comp, void* buf +) { + + using K = typename std::iterator_traits<K_it>::value_type; + using V = typename std::iterator_traits<V_it>::value_type; + using E = std::decay_t<P>; + + const bool has_values = !std::is_same<V, cudaEmpty>::value; + + unsigned B = (count + E::nv - 1) / E::nv; + unsigned R = cuda_find_log2(B, true); + + K* keys_output {nullptr}; + V* vals_output {nullptr}; + unsigned *mp_data {nullptr}; + + if(R) { + keys_output = (K*)(buf); + if(has_values) { + vals_output = (V*)(keys_output + count); + mp_data = (unsigned*)(vals_output + count); + } + else { + mp_data = (unsigned*)(keys_output + count); + } + } + + //cudaDeviceVector<K> keys_temp(R ? count : 0); + //auto keys_output = keys_temp.data(); + ////std::cout << "keys_output = " << keys_temp.size()*sizeof(K) << std::endl; + + //cudaDeviceVector<V> vals_temp((has_values && R) ? count : 0); + //auto vals_output = vals_temp.data(); + //std::cout << "vals_output = " << vals_temp.size()*sizeof(V) << std::endl; + + auto keys_blocksort = (1 & R) ? keys_output : keys_input; + auto vals_blocksort = (1 & R) ? vals_output : vals_input; + + //printf("B=%u, R=%u\n", B, R); + + cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) { + + using sort_t = cudaBlockSort<E::nt, E::vt, K, V>; + + __shared__ union { + typename sort_t::Storage sort; + K keys[E::nv]; + V vals[E::nv]; + } shared; + + auto tile = cuda_get_tile(bid, E::nv, count); + + // Load the keys and values. + cudaKVArray<K, V, E::vt> unsorted; + unsorted.keys = cuda_mem_to_reg_thread<E::nt, E::vt>( + keys_input + tile.begin, tid, tile.count(), shared.keys + ); + + if(has_values) { + unsorted.vals = cuda_mem_to_reg_thread<E::nt, E::vt>( + vals_input + tile.begin, tid, tile.count(), shared.vals + ); + } + + // Blocksort. + auto sorted = sort_t().block_sort(unsorted, tid, tile.count(), comp, shared.sort); + + // Store the keys and values. 
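+    // keys_blocksort/vals_blocksort point either at the temporary buffer or
+    // at the input range, chosen above by the parity of R so that, after all
+    // merge passes, the sorted result ends up back in the caller's range.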
+ cuda_reg_to_mem_thread<E::nt, E::vt>( + sorted.keys, tid, tile.count(), keys_blocksort + tile.begin, shared.keys + ); + + if(has_values) { + cuda_reg_to_mem_thread<E::nt, E::vt>( + sorted.vals, tid, tile.count(), vals_blocksort + tile.begin, shared.vals + ); + } + }); + + // merge passes + + if(1 & R) { + std::swap(keys_input, keys_output); + std::swap(vals_input, vals_output); + } + + // number of partitions + //unsigned num_partitions = B + 1; + //cudaDeviceVector<unsigned> mem(num_partitions); + //auto mp_data = mem.data(); + //std::cout << "num_partitions = " << (B+1)*sizeof(unsigned) << std::endl; + + for(unsigned pass = 0; pass < R; ++pass) { + + unsigned coop = 2 << pass; + + cuda_merge_sort_partitions( + p, keys_input, count, coop, E::nv, comp, mp_data + ); + + cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=]__device__(auto tid, auto bid) { + + __shared__ union { + K keys[E::nv + 1]; + unsigned indices[E::nv]; + } shared; + + auto tile = cuda_get_tile(bid, E::nv, count); + + // Load the range for this CTA and merge the values into register. + auto range = cuda_compute_merge_sort_range( + count, bid, coop, E::nv, mp_data[bid + 0], mp_data[bid + 1] + ); + + auto merge = block_merge_from_mem<cudaMergeBoundType::LOWER, E::nt, E::vt>( + keys_input, keys_input, range, tid, comp, shared.keys + ); + + // Store merged values back out. + cuda_reg_to_mem_thread<E::nt>( + merge.keys, tid, tile.count(), keys_output + tile.begin, shared.keys + ); + + if(has_values) { + // Transpose the indices from thread order to strided order. + auto indices = cuda_reg_thread_to_strided<E::nt>( + merge.indices, tid, shared.indices + ); + + // Gather the input values and merge into the output values. + cuda_transfer_two_streams_strided<E::nt>( + vals_input + range.a_begin, range.a_count(), + vals_input + range.b_begin, range.b_count(), + indices, tid, vals_output + tile.begin + ); + } + }); + + std::swap(keys_input, keys_output); + std::swap(vals_input, vals_output); + } +} + +} // end of namespace tf::detail --------------------------------------------- + +namespace tf { + +/** +@brief queries the buffer size in bytes needed to call sort kernels + for the given number of elements + +@tparam P execution policy type +@tparam K key type +@tparam V value type (default tf::cudaEmpty) + +@param count number of keys/values to sort + +The function is used to allocate a buffer for calling tf::cuda_sort. + +*/ +template <typename P, typename K, typename V = cudaEmpty> +unsigned cuda_sort_buffer_size(unsigned count) { + + using E = std::decay_t<P>; + + const bool has_values = !std::is_same<V, cudaEmpty>::value; + + unsigned B = (count + E::nv - 1) / E::nv; + unsigned R = detail::cuda_find_log2(B, true); + + return R ? (count * sizeof(K) + (has_values ? 
count*sizeof(V) : 0) + + (B+1)*sizeof(unsigned)) : 0; +} + +// ---------------------------------------------------------------------------- +// key-value sort +// ---------------------------------------------------------------------------- + +/** +@brief performs asynchronous key-value sort on a range of items + +@tparam P execution policy type +@tparam K_it key iterator type +@tparam V_it value iterator type +@tparam C comparator type + +@param p execution policy +@param k_first iterator to the beginning of the key range +@param k_last iterator to the end of the key range +@param v_first iterator to the beginning of the value range +@param comp binary comparator +@param buf pointer to the temporary buffer + +Sorts key-value elements in <tt>[k_first, k_last)</tt> and +<tt>[v_first, v_first + (k_last - k_first))</tt> into ascending key order +using the given comparator @c comp. +If @c i and @c j are any two valid iterators in <tt>[k_first, k_last)</tt> +such that @c i precedes @c j, and @c p and @c q are iterators in +<tt>[v_first, v_first + (k_last - k_first))</tt> corresponding to +@c i and @c j respectively, then <tt>comp(*j, *i)</tt> evaluates to @c false. + +For example, assume: + + @c keys are <tt>{1, 4, 2, 8, 5, 7}</tt> + + @c values are <tt>{'a', 'b', 'c', 'd', 'e', 'f'}</tt> + +After sort: + + @c keys are <tt>{1, 2, 4, 5, 7, 8}</tt> + + @c values are <tt>{'a', 'c', 'b', 'e', 'f', 'd'}</tt> + +*/ +template<typename P, typename K_it, typename V_it, typename C> +void cuda_sort_by_key( + P&& p, K_it k_first, K_it k_last, V_it v_first, C comp, void* buf +) { + + unsigned N = std::distance(k_first, k_last); + + if(N <= 1) { + return; + } + + detail::merge_sort_loop(p, k_first, v_first, N, comp, buf); +} + +// ---------------------------------------------------------------------------- +// key sort +// ---------------------------------------------------------------------------- + +/** +@brief performs asynchronous key-only sort on a range of items + +@tparam P execution policy type +@tparam K_it key iterator type +@tparam C comparator type + +@param p execution policy +@param k_first iterator to the beginning of the key range +@param k_last iterator to the end of the key range +@param comp binary comparator +@param buf pointer to the temporary buffer + +This method is equivalent to tf::cuda_sort_by_key without values. 
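+
+A minimal usage sketch (here @c keys, @c N, and @c my_stream are placeholders
+supplied by the caller; error checking is omitted):
+
+@code{.cpp}
+tf::cudaDefaultExecutionPolicy policy(my_stream);
+
+// query and allocate the temporary buffer required by the merge sort
+auto bytes = tf::cuda_sort_buffer_size<tf::cudaDefaultExecutionPolicy, int>(N);
+void* buf = nullptr;
+cudaMalloc(&buf, bytes);
+
+// sort the N keys in place in ascending order
+tf::cuda_sort(
+  policy, keys, keys + N,
+  [] __device__ (int a, int b) { return a < b; },
+  buf
+);
+
+// wait for completion before freeing the buffer
+cudaStreamSynchronize(my_stream);
+cudaFree(buf);
+@endcode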
+ +*/ +template<typename P, typename K_it, typename C> +void cuda_sort(P&& p, K_it k_first, K_it k_last, C comp, void* buf) { + cuda_sort_by_key(p, k_first, k_last, (cudaEmpty*)nullptr, comp, buf); +} + +} // end of namespace tf ----------------------------------------------------- + diff --git a/myxpcs/include/taskflow_/cuda/algorithm/transform.hpp b/myxpcs/include/taskflow_/cuda/algorithm/transform.hpp new file mode 100644 index 0000000..b1146bd --- /dev/null +++ b/myxpcs/include/taskflow_/cuda/algorithm/transform.hpp @@ -0,0 +1,282 @@ +#pragma once + +#include "../cudaflow.hpp" + +/** +@file taskflow/cuda/algorithm/transform.hpp +@brief cuda parallel-transform algorithms include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// transform +// ---------------------------------------------------------------------------- + +namespace detail { + +/** +@private +*/ +template <size_t nt, size_t vt, typename I, typename O, typename C> +__global__ void cuda_transform_kernel(I first, unsigned count, O output, C op) { + auto tid = threadIdx.x; + auto bid = blockIdx.x; + auto tile = cuda_get_tile(bid, nt*vt, count); + cuda_strided_iterate<nt, vt>( + [=]__device__(auto, auto j) { + auto offset = j + tile.begin; + *(output + offset) = op(*(first+offset)); + }, + tid, + tile.count() + ); +} + +/** +@private +*/ +template <size_t nt, size_t vt, typename I1, typename I2, typename O, typename C> +__global__ void cuda_transform_kernel( + I1 first1, I2 first2, unsigned count, O output, C op +) { + auto tid = threadIdx.x; + auto bid = blockIdx.x; + auto tile = cuda_get_tile(bid, nt*vt, count); + cuda_strided_iterate<nt, vt>( + [=]__device__(auto, auto j) { + auto offset = j + tile.begin; + *(output + offset) = op(*(first1+offset), *(first2+offset)); + }, + tid, + tile.count() + ); +} + +} // end of namespace detail ------------------------------------------------- + +// ---------------------------------------------------------------------------- +// CUDA standard algorithms: transform +// ---------------------------------------------------------------------------- + +/** +@brief performs asynchronous parallel transforms over a range of items + +@tparam P execution policy type +@tparam I input iterator type +@tparam O output iterator type +@tparam C unary operator type + +@param p execution policy +@param first iterator to the beginning of the range +@param last iterator to the end of the range +@param output iterator to the beginning of the output range +@param op unary operator to apply to transform each item + +This method is equivalent to the parallel execution of the following loop on a GPU: + +@code{.cpp} +while (first != last) { + *output++ = op(*first++); +} +@endcode + +*/ +template <typename P, typename I, typename O, typename C> +void cuda_transform(P&& p, I first, I last, O output, C op) { + + using E = std::decay_t<P>; + + unsigned count = std::distance(first, last); + + if(count == 0) { + return; + } + + detail::cuda_transform_kernel<E::nt, E::vt, I, O, C> + <<<E::num_blocks(count), E::nt, 0, p.stream()>>> ( + first, count, output, op + ); +} + +/** +@brief performs asynchronous parallel transforms over two ranges of items + +@tparam P execution policy type +@tparam I1 first input iterator type +@tparam I2 second input iterator type +@tparam O output iterator type +@tparam C binary operator type + +@param p execution policy +@param first1 iterator to the beginning of the first range +@param last1 iterator to the end of the first 
range +@param first2 iterator to the beginning of the second range +@param output iterator to the beginning of the output range +@param op binary operator to apply to transform each pair of items + +This method is equivalent to the parallel execution of the following loop on a GPU: + +@code{.cpp} +while (first1 != last1) { + *output++ = op(*first1++, *first2++); +} +@endcode +*/ +template <typename P, typename I1, typename I2, typename O, typename C> +void cuda_transform( + P&& p, I1 first1, I1 last1, I2 first2, O output, C op +) { + + using E = std::decay_t<P>; + + unsigned count = std::distance(first1, last1); + + if(count == 0) { + return; + } + + detail::cuda_transform_kernel<E::nt, E::vt, I1, I2, O, C> + <<<E::num_blocks(count), E::nt, 0, p.stream()>>> ( + first1, first2, count, output, op + ); +} + +// ---------------------------------------------------------------------------- +// cudaFlow +// ---------------------------------------------------------------------------- + +// Function: transform +template <typename I, typename O, typename C> +cudaTask cudaFlow::transform(I first, I last, O output, C c) { + + using E = cudaDefaultExecutionPolicy; + + unsigned count = std::distance(first, last); + + // TODO: + //if(count == 0) { + // return; + //} + + return kernel( + E::num_blocks(count), E::nt, 0, + detail::cuda_transform_kernel<E::nt, E::vt, I, O, C>, + first, count, output, c + ); +} + +// Function: transform +template <typename I1, typename I2, typename O, typename C> +cudaTask cudaFlow::transform(I1 first1, I1 last1, I2 first2, O output, C c) { + + using E = cudaDefaultExecutionPolicy; + + unsigned count = std::distance(first1, last1); + + // TODO: + //if(count == 0) { + // return; + //} + + return kernel( + E::num_blocks(count), E::nt, 0, + detail::cuda_transform_kernel<E::nt, E::vt, I1, I2, O, C>, + first1, first2, count, output, c + ); +} + +// Function: update transform +template <typename I, typename O, typename C> +void cudaFlow::transform(cudaTask task, I first, I last, O output, C c) { + + using E = cudaDefaultExecutionPolicy; + + unsigned count = std::distance(first, last); + + // TODO: + //if(count == 0) { + // return; + //} + + kernel(task, + E::num_blocks(count), E::nt, 0, + detail::cuda_transform_kernel<E::nt, E::vt, I, O, C>, + first, count, output, c + ); +} + +// Function: update transform +template <typename I1, typename I2, typename O, typename C> +void cudaFlow::transform( + cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c +) { + using E = cudaDefaultExecutionPolicy; + + unsigned count = std::distance(first1, last1); + + // TODO: + //if(count == 0) { + // return; + //} + + kernel(task, + E::num_blocks(count), E::nt, 0, + detail::cuda_transform_kernel<E::nt, E::vt, I1, I2, O, C>, + first1, first2, count, output, c + ); +} + +// ---------------------------------------------------------------------------- +// cudaFlowCapturer +// ---------------------------------------------------------------------------- + +// Function: transform +template <typename I, typename O, typename C> +cudaTask cudaFlowCapturer::transform(I first, I last, O output, C op) { + return on([=](cudaStream_t stream) mutable { + cudaDefaultExecutionPolicy p(stream); + cuda_transform(p, first, last, output, op); + }); +} + +// Function: transform +template <typename I1, typename I2, typename O, typename C> +cudaTask cudaFlowCapturer::transform( + I1 first1, I1 last1, I2 first2, O output, C op +) { + return on([=](cudaStream_t stream) mutable { + cudaDefaultExecutionPolicy p(stream); + 
cuda_transform(p, first1, last1, first2, output, op); + }); +} + +// Function: transform +template <typename I, typename O, typename C> +void cudaFlowCapturer::transform( + cudaTask task, I first, I last, O output, C op +) { + on(task, [=] (cudaStream_t stream) mutable { + cudaDefaultExecutionPolicy p(stream); + cuda_transform(p, first, last, output, op); + }); +} + +// Function: transform +template <typename I1, typename I2, typename O, typename C> +void cudaFlowCapturer::transform( + cudaTask task, I1 first1, I1 last1, I2 first2, O output, C op +) { + on(task, [=] (cudaStream_t stream) mutable { + cudaDefaultExecutionPolicy p(stream); + cuda_transform(p, first1, last1, first2, output, op); + }); +} + +} // end of namespace tf ----------------------------------------------------- + + + + + + diff --git a/myxpcs/include/taskflow_/cuda/algorithm/transpose.hpp b/myxpcs/include/taskflow_/cuda/algorithm/transpose.hpp new file mode 100644 index 0000000..3b02a7f --- /dev/null +++ b/myxpcs/include/taskflow_/cuda/algorithm/transpose.hpp @@ -0,0 +1,41 @@ +#pragma once + +#include "../cuda_error.hpp" + +namespace tf { + +// ---------------------------------------------------------------------------- +// row-wise matrix transpose +// ---------------------------------------------------------------------------- +// +template <typename T> +__global__ void cuda_transpose( + const T* d_in, + T* d_out, + size_t rows, + size_t cols +) { + __shared__ T tile[32][32]; + size_t x = blockIdx.x * 32 + threadIdx.x; + size_t y = blockIdx.y * 32 + threadIdx.y; + + for(size_t i = 0; i < 32; i += 8) { + if(x < cols && (y + i) < rows) { + tile[threadIdx.y + i][threadIdx.x] = d_in[(y + i) * cols + x]; + } + } + + __syncthreads(); + + x = blockIdx.y * 32 + threadIdx.x; + y = blockIdx.x * 32 + threadIdx.y; + + for(size_t i = 0; i < 32; i += 8) { + if(x < rows && (y + i) < cols) { + d_out[(y + i) * rows + x] = tile[threadIdx.x][threadIdx.y + i]; + } + } +} + +} // end of namespace -------------------------------------------------------- + diff --git a/myxpcs/include/taskflow_/cuda/cuda_capturer.hpp b/myxpcs/include/taskflow_/cuda/cuda_capturer.hpp new file mode 100644 index 0000000..3b5daee --- /dev/null +++ b/myxpcs/include/taskflow_/cuda/cuda_capturer.hpp @@ -0,0 +1,724 @@ +#pragma once + +#include "cuda_task.hpp" +#include "cuda_optimizer.hpp" + +/** +@file cuda_capturer.hpp +@brief %cudaFlow capturer include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// class definition: cudaFlowCapturer +// ---------------------------------------------------------------------------- + +/** +@class cudaFlowCapturer + +@brief class to create a %cudaFlow graph using stream capture + +The usage of tf::cudaFlowCapturer is similar to tf::cudaFlow, except users can +call the method tf::cudaFlowCapturer::on to capture a sequence of asynchronous +CUDA operations through the given stream. +The following example creates a CUDA graph that captures two kernel tasks, +@c task_1 and @c task_2, where @c task_1 runs before @c task_2. 
+ +@code{.cpp} +taskflow.emplace([](tf::cudaFlowCapturer& capturer){ + + // capture my_kernel_1 through the given stream managed by the capturer + auto task_1 = capturer.on([&](cudaStream_t stream){ + my_kernel_1<<<grid_1, block_1, shm_size_1, stream>>>(my_parameters_1); + }); + + // capture my_kernel_2 through the given stream managed by the capturer + auto task_2 = capturer.on([&](cudaStream_t stream){ + my_kernel_2<<<grid_2, block_2, shm_size_2, stream>>>(my_parameters_2); + }); + + task_1.precede(task_2); +}); +@endcode + +Similar to tf::cudaFlow, a %cudaFlowCapturer is a task (tf::Task) +created from tf::Taskflow +and will be run by @em one worker thread in the executor. +That is, the callable that describes a %cudaFlowCapturer +will be executed sequentially. +Inside a %cudaFlow capturer task, different GPU tasks (tf::cudaTask) may run +in parallel depending on the selected optimization algorithm. +By default, we use tf::cudaFlowRoundRobinOptimizer to transform a user-level +graph into a native CUDA graph. + +Please refer to @ref GPUTaskingcudaFlowCapturer for details. +*/ +class cudaFlowCapturer { + + friend class cudaFlow; + friend class Executor; + + // created by user + struct External { + cudaFlowGraph graph; + }; + + // created from cudaFlow + struct Internal { + }; + + using handle_t = std::variant<External, Internal>; + + using Optimizer = std::variant< + cudaFlowRoundRobinOptimizer, + cudaFlowSequentialOptimizer, + cudaFlowLinearOptimizer + >; + + public: + + /** + @brief constrcts a standalone cudaFlowCapturer + + A standalone %cudaFlow capturer does not go through any taskflow and + can be run by the caller thread using tf::cudaFlowCapturer::run. + */ + cudaFlowCapturer() = default; + + /** + @brief destructs the cudaFlowCapturer + */ + ~cudaFlowCapturer() = default; + + /** + @brief default move constructor + */ + cudaFlowCapturer(cudaFlowCapturer&&) = default; + + /** + @brief default move assignment operator + */ + cudaFlowCapturer& operator = (cudaFlowCapturer&&) = default; + + /** + @brief queries the emptiness of the graph + */ + bool empty() const; + + /** + @brief queries the number of tasks + */ + size_t num_tasks() const; + + /** + @brief clear this %cudaFlow capturer + */ + void clear(); + + /** + @brief dumps the %cudaFlow graph into a DOT format through an + output stream + */ + void dump(std::ostream& os) const; + + /** + @brief dumps the native captured graph into a DOT format through + an output stream + */ + void dump_native_graph(std::ostream& os) const; + + // ------------------------------------------------------------------------ + // basic methods + // ------------------------------------------------------------------------ + + /** + @brief captures a sequential CUDA operations from the given callable + + @tparam C callable type constructible with @c std::function<void(cudaStream_t)> + @param callable a callable to capture CUDA operations with the stream + + This methods applies a stream created by the flow to capture + a sequence of CUDA operations defined in the callable. + */ + template <typename C, std::enable_if_t< + std::is_invocable_r_v<void, C, cudaStream_t>, void>* = nullptr + > + cudaTask on(C&& callable); + + /** + @brief updates a capture task to another sequential CUDA operations + + The method is similar to cudaFlowCapturer::on but operates + on an existing task. 
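+
+  For example, a previously captured kernel launch can be rebound to a new
+  operation (kernel names and launch parameters below are placeholders):
+
+  @code{.cpp}
+  // originally captured operation
+  auto task = capturer.on([&](cudaStream_t stream){
+    my_kernel_1<<<grid, block, 0, stream>>>(my_parameters_1);
+  });
+
+  // later, update the same task to capture a different operation
+  capturer.on(task, [&](cudaStream_t stream){
+    my_kernel_2<<<grid, block, 0, stream>>>(my_parameters_2);
+  });
+  @endcode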
+ */ + template <typename C, std::enable_if_t< + std::is_invocable_r_v<void, C, cudaStream_t>, void>* = nullptr + > + void on(cudaTask task, C&& callable); + + /** + @brief captures a no-operation task + + @return a tf::cudaTask handle + + An empty node performs no operation during execution, + but can be used for transitive ordering. + For example, a phased execution graph with 2 groups of @c n nodes + with a barrier between them can be represented using an empty node + and @c 2*n dependency edges, + rather than no empty node and @c n^2 dependency edges. + */ + cudaTask noop(); + + /** + @brief updates a task to a no-operation task + + The method is similar to tf::cudaFlowCapturer::noop but + operates on an existing task. + */ + void noop(cudaTask task); + + /** + @brief copies data between host and device asynchronously through a stream + + @param dst destination memory address + @param src source memory address + @param count size in bytes to copy + + The method captures a @c cudaMemcpyAsync operation through an + internal stream. + */ + cudaTask memcpy(void* dst, const void* src, size_t count); + + /** + @brief updates a capture task to a memcpy operation + + The method is similar to cudaFlowCapturer::memcpy but operates on an + existing task. + */ + void memcpy(cudaTask task, void* dst, const void* src, size_t count); + + /** + @brief captures a copy task of typed data + + @tparam T element type (non-void) + + @param tgt pointer to the target memory block + @param src pointer to the source memory block + @param num number of elements to copy + + @return cudaTask handle + + A copy task transfers <tt>num*sizeof(T)</tt> bytes of data from a source location + to a target location. Direction can be arbitrary among CPUs and GPUs. + */ + template <typename T, + std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr + > + cudaTask copy(T* tgt, const T* src, size_t num); + + /** + @brief updates a capture task to a copy operation + + The method is similar to cudaFlowCapturer::copy but operates on + an existing task. + */ + template <typename T, + std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr + > + void copy(cudaTask task, T* tgt, const T* src, size_t num); + + /** + @brief initializes or sets GPU memory to the given value byte by byte + + @param ptr pointer to GPU mempry + @param v value to set for each byte of the specified memory + @param n size in bytes to set + + The method captures a @c cudaMemsetAsync operation through an + internal stream to fill the first @c count bytes of the memory area + pointed to by @c devPtr with the constant byte value @c value. + */ + cudaTask memset(void* ptr, int v, size_t n); + + /** + @brief updates a capture task to a memset operation + + The method is similar to cudaFlowCapturer::memset but operates on + an existing task. + */ + void memset(cudaTask task, void* ptr, int value, size_t n); + + /** + @brief captures a kernel + + @tparam F kernel function type + @tparam ArgsT kernel function parameters type + + @param g configured grid + @param b configured block + @param s configured shared memory size in bytes + @param f kernel function + @param args arguments to forward to the kernel function by copy + + @return cudaTask handle + */ + template <typename F, typename... ArgsT> + cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT&&... args); + + /** + @brief updates a capture task to a kernel operation + + The method is similar to cudaFlowCapturer::kernel but operates on + an existing task. + */ + template <typename F, typename... 
ArgsT> + void kernel( + cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT&&... args + ); + + // ------------------------------------------------------------------------ + // generic algorithms + // ------------------------------------------------------------------------ + + /** + @brief capturers a kernel to runs the given callable with only one thread + + @tparam C callable type + + @param c callable to run by a single kernel thread + */ + template <typename C> + cudaTask single_task(C c); + + /** + @brief updates a capture task to a single-threaded kernel + + This method is similar to cudaFlowCapturer::single_task but operates + on an existing task. + */ + template <typename C> + void single_task(cudaTask task, C c); + + /** + @brief captures a kernel that applies a callable to each dereferenced element + of the data array + + @tparam I iterator type + @tparam C callable type + + @param first iterator to the beginning + @param last iterator to the end + @param callable a callable object to apply to the dereferenced iterator + + @return cudaTask handle + + This method is equivalent to the parallel execution of the following loop on a GPU: + + @code{.cpp} + for(auto itr = first; itr != last; i++) { + callable(*itr); + } + @endcode + */ + template <typename I, typename C> + cudaTask for_each(I first, I last, C callable); + + /** + @brief updates a capture task to a for-each kernel task + + This method is similar to cudaFlowCapturer::for_each but operates + on an existing task. + */ + template <typename I, typename C> + void for_each(cudaTask task, I first, I last, C callable); + + /** + @brief captures a kernel that applies a callable to each index in the range + with the step size + + @tparam I index type + @tparam C callable type + + @param first beginning index + @param last last index + @param step step size + @param callable the callable to apply to each element in the data array + + @return cudaTask handle + + This method is equivalent to the parallel execution of the following loop on a GPU: + + @code{.cpp} + // step is positive [first, last) + for(auto i=first; i<last; i+=step) { + callable(i); + } + + // step is negative [first, last) + for(auto i=first; i>last; i+=step) { + callable(i); + } + @endcode + */ + template <typename I, typename C> + cudaTask for_each_index(I first, I last, I step, C callable); + + /** + @brief updates a capture task to a for-each-index kernel task + + This method is similar to cudaFlowCapturer::for_each_index but operates + on an existing task. + */ + template <typename I, typename C> + void for_each_index( + cudaTask task, I first, I last, I step, C callable + ); + + /** + @brief captures a kernel that transforms an input range to an output range + + @tparam I input iterator type + @tparam O output iterator type + @tparam C unary operator type + + @param first iterator to the beginning of the input range + @param last iterator to the end of the input range + @param output iterator to the beginning of the output range + @param op unary operator to apply to transform each item in the range + + @return cudaTask handle + + This method is equivalent to the parallel execution of the following loop on a GPU: + + @code{.cpp} + while (first != last) { + *output++ = op(*first++); + } + @endcode + */ + template <typename I, typename O, typename C> + cudaTask transform(I first, I last, O output, C op); + + /** + @brief updates a capture task to a transform kernel task + + This method is similar to cudaFlowCapturer::transform but operates + on an existing task. 
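+
+  For example, one may capture a transform over @c N elements and later
+  retarget it to a different output buffer (@c gpu_in, @c gpu_out, @c gpu_out2,
+  and @c N are placeholders):
+
+  @code{.cpp}
+  auto task = capturer.transform(
+    gpu_in, gpu_in + N, gpu_out, [] __device__ (int x) { return x + 1; }
+  );
+
+  // rebind the existing task to write into another buffer
+  capturer.transform(
+    task, gpu_in, gpu_in + N, gpu_out2, [] __device__ (int x) { return x + 1; }
+  );
+  @endcode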
+ */ + template <typename I, typename O, typename C> + void transform(cudaTask task, I first, I last, O output, C op); + + /** + @brief captures a kernel that transforms two input ranges to an output range + + @tparam I1 first input iterator type + @tparam I2 second input iterator type + @tparam O output iterator type + @tparam C unary operator type + + @param first1 iterator to the beginning of the input range + @param last1 iterator to the end of the input range + @param first2 iterato + @param output iterator to the beginning of the output range + @param op binary operator to apply to transform each pair of items in the + two input ranges + + @return cudaTask handle + + This method is equivalent to the parallel execution of the following loop on a GPU: + + @code{.cpp} + while (first1 != last1) { + *output++ = op(*first1++, *first2++); + } + @endcode + */ + template <typename I1, typename I2, typename O, typename C> + cudaTask transform(I1 first1, I1 last1, I2 first2, O output, C op); + + /** + @brief updates a capture task to a transform kernel task + + This method is similar to cudaFlowCapturer::transform but operates + on an existing task. + */ + template <typename I1, typename I2, typename O, typename C> + void transform( + cudaTask task, I1 first1, I1 last1, I2 first2, O output, C op + ); + + // ------------------------------------------------------------------------ + // Capturing methods + // ------------------------------------------------------------------------ + + /** + @brief selects a different optimization algorithm + + @tparam OPT optimizer type + @tparam ArgsT arguments types + + @param args arguments to forward to construct the optimizer + + @return a reference to the optimizer + + We currently supports the following optimization algorithms to capture + a user-described %cudaFlow: + + tf::cudaFlowSequentialOptimizer + + tf::cudaFlowRoundRobinOptimizer + + tf::cudaFlowLinearOptimizer + + By default, tf::cudaFlowCapturer uses the round-robin optimization + algorithm with four streams to transform a user-level graph into + a native CUDA graph. + */ + template <typename OPT, typename... ArgsT> + OPT& make_optimizer(ArgsT&&... args); + + /** + @brief captures the cudaFlow and turns it into a CUDA Graph + */ + cudaGraph_t capture(); + + // ------------------------------------------------------------------------ + // offload methods + // ------------------------------------------------------------------------ + + /** + @brief offloads the %cudaFlowCapturer onto a GPU asynchronously via a stream + + @param stream stream for performing this operation + + Offloads the present %cudaFlowCapturer onto a GPU asynchronously via + the given stream. + + An offloaded %cudaFlowCapturer forces the underlying graph to be instantiated. + After the instantiation, you should not modify the graph topology + but update node parameters. 
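+
+  A minimal sketch of running a standalone capturer (stream management is
+  abbreviated and the single-threaded kernel body is a placeholder):
+
+  @code{.cpp}
+  tf::cudaFlowCapturer capturer;
+  capturer.single_task([] __device__ () { /* do something in one thread */ });
+
+  cudaStream_t stream;
+  cudaStreamCreate(&stream);
+
+  capturer.run(stream);           // instantiate (if needed) and launch
+  cudaStreamSynchronize(stream);  // wait for the offloaded work to finish
+
+  cudaStreamDestroy(stream);
+  @endcode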
+ */ + void run(cudaStream_t stream); + + /** + @brief acquires a reference to the underlying CUDA graph + */ + cudaGraph_t native_graph(); + + /** + @brief acquires a reference to the underlying CUDA graph executable + */ + cudaGraphExec_t native_executable(); + + private: + + cudaFlowGraph _cfg; + + Optimizer _optimizer; + + cudaGraphExec _exe {nullptr}; +}; + +// Function: empty +inline bool cudaFlowCapturer::empty() const { + return _cfg.empty(); +} + +// Function: num_tasks +inline size_t cudaFlowCapturer::num_tasks() const { + return _cfg._nodes.size(); +} + +// Procedure: clear +inline void cudaFlowCapturer::clear() { + _exe.clear(); + _cfg.clear(); +} + +// Procedure: dump +inline void cudaFlowCapturer::dump(std::ostream& os) const { + _cfg.dump(os, nullptr, ""); +} + +// Procedure: dump_native_graph +inline void cudaFlowCapturer::dump_native_graph(std::ostream& os) const { + cuda_dump_graph(os, _cfg._native_handle); +} + +// Function: capture +template <typename C, std::enable_if_t< + std::is_invocable_r_v<void, C, cudaStream_t>, void>* +> +cudaTask cudaFlowCapturer::on(C&& callable) { + auto node = _cfg.emplace_back(_cfg, + std::in_place_type_t<cudaFlowNode::Capture>{}, std::forward<C>(callable) + ); + return cudaTask(node); +} + +// Function: noop +inline cudaTask cudaFlowCapturer::noop() { + return on([](cudaStream_t){}); +} + +// Function: noop +inline void cudaFlowCapturer::noop(cudaTask task) { + on(task, [](cudaStream_t){}); +} + +// Function: memcpy +inline cudaTask cudaFlowCapturer::memcpy( + void* dst, const void* src, size_t count +) { + return on([dst, src, count] (cudaStream_t stream) mutable { + TF_CHECK_CUDA( + cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream), + "failed to capture memcpy" + ); + }); +} + +// Function: copy +template <typename T, std::enable_if_t<!std::is_same_v<T, void>, void>*> +cudaTask cudaFlowCapturer::copy(T* tgt, const T* src, size_t num) { + return on([tgt, src, num] (cudaStream_t stream) mutable { + TF_CHECK_CUDA( + cudaMemcpyAsync(tgt, src, sizeof(T)*num, cudaMemcpyDefault, stream), + "failed to capture copy" + ); + }); +} + +// Function: memset +inline cudaTask cudaFlowCapturer::memset(void* ptr, int v, size_t n) { + return on([ptr, v, n] (cudaStream_t stream) mutable { + TF_CHECK_CUDA( + cudaMemsetAsync(ptr, v, n, stream), "failed to capture memset" + ); + }); +} + +// Function: kernel +template <typename F, typename... ArgsT> +cudaTask cudaFlowCapturer::kernel( + dim3 g, dim3 b, size_t s, F f, ArgsT&&... args +) { + return on([g, b, s, f, args...] 
(cudaStream_t stream) mutable { + f<<<g, b, s, stream>>>(args...); + }); +} + +// Function: capture +inline cudaGraph_t cudaFlowCapturer::capture() { + return std::visit( + [this](auto&& opt){ return opt._optimize(_cfg); }, _optimizer + ); +} + +// Procedure: run +inline void cudaFlowCapturer::run(cudaStream_t stream) { + + // If the topology got changed, we need to destroy the executable + // and create a new one + if(_cfg._state & cudaFlowGraph::CHANGED) { + _cfg._native_handle.reset(capture()); + _exe.instantiate(_cfg._native_handle); + } + // if the graph is just updated (i.e., topology does not change), + // we can skip part of the optimization and just update the executable + // with the new captured graph + else if(_cfg._state & cudaFlowGraph::UPDATED) { + // TODO: skip part of the optimization (e.g., levelization) + _cfg._native_handle.reset(capture()); + if(_exe.update(_cfg._native_handle) != cudaGraphExecUpdateSuccess) { + _exe.instantiate(_cfg._native_handle); + } + } + + // run the executable (should exist) + _exe.launch(stream); + + _cfg._state = cudaFlowGraph::OFFLOADED; +} + +// Function: native_graph +inline cudaGraph_t cudaFlowCapturer::native_graph() { + return _cfg._native_handle; +} + +// Function: native_executable +inline cudaGraphExec_t cudaFlowCapturer::native_executable() { + return _exe; +} + +// Function: on +template <typename C, std::enable_if_t< + std::is_invocable_r_v<void, C, cudaStream_t>, void>* +> +void cudaFlowCapturer::on(cudaTask task, C&& callable) { + + if(task.type() != cudaTaskType::CAPTURE) { + TF_THROW("invalid cudaTask type (must be CAPTURE)"); + } + + _cfg._state |= cudaFlowGraph::UPDATED; + + std::get_if<cudaFlowNode::Capture>(&task._node->_handle)->work = + std::forward<C>(callable); +} + +// Function: memcpy +inline void cudaFlowCapturer::memcpy( + cudaTask task, void* dst, const void* src, size_t count +) { + on(task, [dst, src, count](cudaStream_t stream) mutable { + TF_CHECK_CUDA( + cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream), + "failed to capture memcpy" + ); + }); +} + +// Function: copy +template <typename T, + std::enable_if_t<!std::is_same_v<T, void>, void>* +> +void cudaFlowCapturer::copy( + cudaTask task, T* tgt, const T* src, size_t num +) { + on(task, [tgt, src, num] (cudaStream_t stream) mutable { + TF_CHECK_CUDA( + cudaMemcpyAsync(tgt, src, sizeof(T)*num, cudaMemcpyDefault, stream), + "failed to capture copy" + ); + }); +} + +// Function: memset +inline void cudaFlowCapturer::memset( + cudaTask task, void* ptr, int v, size_t n +) { + on(task, [ptr, v, n] (cudaStream_t stream) mutable { + TF_CHECK_CUDA( + cudaMemsetAsync(ptr, v, n, stream), "failed to capture memset" + ); + }); +} + +// Function: kernel +template <typename F, typename... ArgsT> +void cudaFlowCapturer::kernel( + cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT&&... args +) { + on(task, [g, b, s, f, args...] (cudaStream_t stream) mutable { + f<<<g, b, s, stream>>>(args...); + }); +} + +// Function: make_optimizer +template <typename OPT, typename ...ArgsT> +OPT& cudaFlowCapturer::make_optimizer(ArgsT&&... 
args) { + return _optimizer.emplace<OPT>(std::forward<ArgsT>(args)...); +} + +} // end of namespace tf ----------------------------------------------------- + diff --git a/myxpcs/include/taskflow_/cuda/cuda_device.hpp b/myxpcs/include/taskflow_/cuda/cuda_device.hpp new file mode 100644 index 0000000..016b2a6 --- /dev/null +++ b/myxpcs/include/taskflow_/cuda/cuda_device.hpp @@ -0,0 +1,342 @@ +#pragma once + +#include "cuda_error.hpp" + +/** +@file cuda_device.hpp +@brief CUDA device utilities include file +*/ + +namespace tf { + +/** +@brief queries the number of available devices +*/ +inline size_t cuda_get_num_devices() { + int N = 0; + TF_CHECK_CUDA(cudaGetDeviceCount(&N), "failed to get device count"); + return static_cast<size_t>(N); +} + +/** +@brief gets the current device associated with the caller thread +*/ +inline int cuda_get_device() { + int id; + TF_CHECK_CUDA(cudaGetDevice(&id), "failed to get current device id"); + return id; +} + +/** +@brief switches to a given device context +*/ +inline void cuda_set_device(int id) { + TF_CHECK_CUDA(cudaSetDevice(id), "failed to switch to device ", id); +} + +/** +@brief obtains the device property +*/ +inline void cuda_get_device_property(int i, cudaDeviceProp& p) { + TF_CHECK_CUDA( + cudaGetDeviceProperties(&p, i), "failed to get property of device ", i + ); +} + +/** +@brief obtains the device property +*/ +inline cudaDeviceProp cuda_get_device_property(int i) { + cudaDeviceProp p; + TF_CHECK_CUDA( + cudaGetDeviceProperties(&p, i), "failed to get property of device ", i + ); + return p; +} + +/** +@brief dumps the device property +*/ +inline void cuda_dump_device_property(std::ostream& os, const cudaDeviceProp& p) { + + os << "Major revision number: " << p.major << '\n' + << "Minor revision number: " << p.minor << '\n' + << "Name: " << p.name << '\n' + << "Total global memory: " << p.totalGlobalMem << '\n' + << "Total shared memory per block: " << p.sharedMemPerBlock << '\n' + << "Total registers per block: " << p.regsPerBlock << '\n' + << "Warp size: " << p.warpSize << '\n' + << "Maximum memory pitch: " << p.memPitch << '\n' + << "Maximum threads per block: " << p.maxThreadsPerBlock << '\n'; + + os << "Maximum dimension of block: "; + for (int i = 0; i < 3; ++i) { + if(i) os << 'x'; + os << p.maxThreadsDim[i]; + } + os << '\n'; + + os << "Maximum dimenstion of grid: "; + for (int i = 0; i < 3; ++i) { + if(i) os << 'x'; + os << p.maxGridSize[i];; + } + os << '\n'; + + os << "Clock rate: " << p.clockRate << '\n' + << "Total constant memory: " << p.totalConstMem << '\n' + << "Texture alignment: " << p.textureAlignment << '\n' + << "Concurrent copy and execution: " << p.deviceOverlap << '\n' + << "Number of multiprocessors: " << p.multiProcessorCount << '\n' + << "Kernel execution timeout: " << p.kernelExecTimeoutEnabled << '\n' + << "GPU sharing Host Memory: " << p.integrated << '\n' + << "Host page-locked mem mapping: " << p.canMapHostMemory << '\n' + << "Alignment for Surfaces: " << p.surfaceAlignment << '\n' + << "Device has ECC support: " << p.ECCEnabled << '\n' + << "Unified Addressing (UVA): " << p.unifiedAddressing << '\n'; +} + +/** +@brief queries the maximum threads per block on a device +*/ +inline size_t cuda_get_device_max_threads_per_block(int d) { + int threads = 0; + TF_CHECK_CUDA( + cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, d), + "failed to query the maximum threads per block on device ", d + ) + return threads; +} + +/** +@brief queries the maximum x-dimension per block on a device +*/ +inline 
size_t cuda_get_device_max_x_dim_per_block(int d) { + int dim = 0; + TF_CHECK_CUDA( + cudaDeviceGetAttribute(&dim, cudaDevAttrMaxBlockDimX, d), + "failed to query the maximum x-dimension per block on device ", d + ) + return dim; +} + +/** +@brief queries the maximum y-dimension per block on a device +*/ +inline size_t cuda_get_device_max_y_dim_per_block(int d) { + int dim = 0; + TF_CHECK_CUDA( + cudaDeviceGetAttribute(&dim, cudaDevAttrMaxBlockDimY, d), + "failed to query the maximum y-dimension per block on device ", d + ) + return dim; +} + +/** +@brief queries the maximum z-dimension per block on a device +*/ +inline size_t cuda_get_device_max_z_dim_per_block(int d) { + int dim = 0; + TF_CHECK_CUDA( + cudaDeviceGetAttribute(&dim, cudaDevAttrMaxBlockDimZ, d), + "failed to query the maximum z-dimension per block on device ", d + ) + return dim; +} + +/** +@brief queries the maximum x-dimension per grid on a device +*/ +inline size_t cuda_get_device_max_x_dim_per_grid(int d) { + int dim = 0; + TF_CHECK_CUDA( + cudaDeviceGetAttribute(&dim, cudaDevAttrMaxGridDimX, d), + "failed to query the maximum x-dimension per grid on device ", d + ) + return dim; +} + +/** +@brief queries the maximum y-dimension per grid on a device +*/ +inline size_t cuda_get_device_max_y_dim_per_grid(int d) { + int dim = 0; + TF_CHECK_CUDA( + cudaDeviceGetAttribute(&dim, cudaDevAttrMaxGridDimY, d), + "failed to query the maximum y-dimension per grid on device ", d + ) + return dim; +} + +/** +@brief queries the maximum z-dimension per grid on a device +*/ +inline size_t cuda_get_device_max_z_dim_per_grid(int d) { + int dim = 0; + TF_CHECK_CUDA( + cudaDeviceGetAttribute(&dim, cudaDevAttrMaxGridDimZ, d), + "failed to query the maximum z-dimension per grid on device ", d + ) + return dim; +} + +/** +@brief queries the maximum shared memory size in bytes per block on a device +*/ +inline size_t cuda_get_device_max_shm_per_block(int d) { + int num = 0; + TF_CHECK_CUDA( + cudaDeviceGetAttribute(&num, cudaDevAttrMaxSharedMemoryPerBlock, d), + "failed to query the maximum shared memory per block on device ", d + ) + return num; +} + +/** +@brief queries the warp size on a device +*/ +inline size_t cuda_get_device_warp_size(int d) { + int num = 0; + TF_CHECK_CUDA( + cudaDeviceGetAttribute(&num, cudaDevAttrWarpSize, d), + "failed to query the warp size per block on device ", d + ) + return num; +} + +/** +@brief queries the major number of compute capability of a device +*/ +inline int cuda_get_device_compute_capability_major(int d) { + int num = 0; + TF_CHECK_CUDA( + cudaDeviceGetAttribute(&num, cudaDevAttrComputeCapabilityMajor, d), + "failed to query the major number of compute capability of device ", d + ) + return num; +} + +/** +@brief queries the minor number of compute capability of a device +*/ +inline int cuda_get_device_compute_capability_minor(int d) { + int num = 0; + TF_CHECK_CUDA( + cudaDeviceGetAttribute(&num, cudaDevAttrComputeCapabilityMinor, d), + "failed to query the minor number of compute capability of device ", d + ) + return num; +} + +/** +@brief queries if the device supports unified addressing +*/ +inline bool cuda_get_device_unified_addressing(int d) { + int num = 0; + TF_CHECK_CUDA( + cudaDeviceGetAttribute(&num, cudaDevAttrUnifiedAddressing, d), + "failed to query unified addressing status on device ", d + ) + return num; +} + +// ---------------------------------------------------------------------------- +// CUDA Version +// 
---------------------------------------------------------------------------- + +/** +@brief queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver +*/ +inline int cuda_get_driver_version() { + int num = 0; + TF_CHECK_CUDA( + cudaDriverGetVersion(&num), + "failed to query the latest cuda version supported by the driver" + ); + return num; +} + +/** +@brief queries the CUDA Runtime version (1000 * major + 10 * minor) +*/ +inline int cuda_get_runtime_version() { + int num = 0; + TF_CHECK_CUDA( + cudaRuntimeGetVersion(&num), "failed to query cuda runtime version" + ); + return num; +} + +// ---------------------------------------------------------------------------- +// cudaScopedDevice +// ---------------------------------------------------------------------------- + +/** @class cudaScopedDevice + +@brief class to create an RAII-styled context switch + +Sample usage: + +@code{.cpp} +{ + tf::cudaScopedDevice device(1); // switch to the device context 1 + + // create a stream under device context 1 + cudaStream_t stream; + cudaStreamCreate(&stream); + +} // leaving the scope and goes back to the previous device context +@endcode + +%cudaScopedDevice is neither movable nor copyable. +*/ +class cudaScopedDevice { + + public: + + /** + @brief constructs a RAII-styled device switcher + + @param device device context to scope in the guard + */ + explicit cudaScopedDevice(int device); + + /** + @brief destructs the guard and switches back to the previous device context + */ + ~cudaScopedDevice(); + + private: + + cudaScopedDevice() = delete; + cudaScopedDevice(const cudaScopedDevice&) = delete; + cudaScopedDevice(cudaScopedDevice&&) = delete; + + int _p; +}; + +// Constructor +inline cudaScopedDevice::cudaScopedDevice(int dev) { + TF_CHECK_CUDA(cudaGetDevice(&_p), "failed to get current device scope"); + if(_p == dev) { + _p = -1; + } + else { + TF_CHECK_CUDA(cudaSetDevice(dev), "failed to scope on device ", dev); + } +} + +// Destructor +inline cudaScopedDevice::~cudaScopedDevice() { + if(_p != -1) { + cudaSetDevice(_p); + //TF_CHECK_CUDA(cudaSetDevice(_p), "failed to scope back to device ", _p); + } +} + +} // end of namespace cuda --------------------------------------------------- + + + + + diff --git a/myxpcs/include/taskflow_/cuda/cuda_error.hpp b/myxpcs/include/taskflow_/cuda/cuda_error.hpp new file mode 100644 index 0000000..c38e132 --- /dev/null +++ b/myxpcs/include/taskflow_/cuda/cuda_error.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include <cuda.h> +#include <iostream> +#include <sstream> +#include <exception> + +#include "../utility/stream.hpp" + +#define TF_CUDA_EXPAND( x ) x +#define TF_CUDA_REMOVE_FIRST_HELPER(N, ...) __VA_ARGS__ +#define TF_CUDA_REMOVE_FIRST(...) TF_CUDA_EXPAND(TF_CUDA_REMOVE_FIRST_HELPER(__VA_ARGS__)) +#define TF_CUDA_GET_FIRST_HELPER(N, ...) N +#define TF_CUDA_GET_FIRST(...) TF_CUDA_EXPAND(TF_CUDA_GET_FIRST_HELPER(__VA_ARGS__)) + +#define TF_CHECK_CUDA(...) 
\ +if(TF_CUDA_GET_FIRST(__VA_ARGS__) != cudaSuccess) { \ + std::ostringstream oss; \ + auto __ev__ = TF_CUDA_GET_FIRST(__VA_ARGS__); \ + oss << "[" << __FILE__ << ":" << __LINE__ << "] " \ + << (cudaGetErrorString(__ev__)) << " (" \ + << (cudaGetErrorName(__ev__)) << ") - "; \ + tf::ostreamize(oss, TF_CUDA_REMOVE_FIRST(__VA_ARGS__)); \ + throw std::runtime_error(oss.str()); \ +} + diff --git a/myxpcs/include/taskflow_/cuda/cuda_execution_policy.hpp b/myxpcs/include/taskflow_/cuda/cuda_execution_policy.hpp new file mode 100644 index 0000000..ae90d98 --- /dev/null +++ b/myxpcs/include/taskflow_/cuda/cuda_execution_policy.hpp @@ -0,0 +1,155 @@ +#pragma once + +#include "cuda_error.hpp" + +/** +@file cuda_execution_policy.hpp +@brief CUDA execution policy include file +*/ + +namespace tf { + +/** +@class cudaExecutionPolicy + +@brief class to define execution policy for CUDA standard algorithms + +@tparam NT number of threads per block +@tparam VT number of work units per thread + +Execution policy configures the kernel execution parameters in CUDA algorithms. +The first template argument, @c NT, the number of threads per block should +always be a power-of-two number. +The second template argument, @c VT, the number of work units per thread +is recommended to be an odd number to avoid bank conflict. + +Details can be referred to @ref CUDASTDExecutionPolicy. +*/ +template<unsigned NT, unsigned VT> +class cudaExecutionPolicy { + + static_assert(is_pow2(NT), "max # threads per block must be a power of two"); + + public: + + /** @brief static constant for getting the number of threads per block */ + const static unsigned nt = NT; + + /** @brief static constant for getting the number of work units per thread */ + const static unsigned vt = VT; + + /** @brief static constant for getting the number of elements to process per block */ + const static unsigned nv = NT*VT; + + /** + @brief constructs an execution policy object with default stream + */ + cudaExecutionPolicy() = default; + + /** + @brief constructs an execution policy object with the given stream + */ + explicit cudaExecutionPolicy(cudaStream_t s) : _stream{s} {} + + /** + @brief queries the associated stream + */ + cudaStream_t stream() noexcept { return _stream; }; + + /** + @brief assigns a stream + */ + void stream(cudaStream_t stream) noexcept { _stream = stream; } + + /** + @brief queries the number of blocks to accommodate N elements + */ + static unsigned num_blocks(unsigned N) { return (N + nv - 1) / nv; } + + // -------------------------------------------------------------------------- + // Buffer Sizes for Standard Algorithms + // -------------------------------------------------------------------------- + + /** + @brief queries the buffer size in bytes needed to call reduce kernels + + @tparam T value type + + @param count number of elements to reduce + + The function is used to allocate a buffer for calling tf::cuda_reduce, + tf::cuda_uninitialized_reduce, tf::cuda_transform_reduce, and + tf::cuda_uninitialized_transform_reduce. + */ + template <typename T> + static unsigned reduce_bufsz(unsigned count); + + /** + @brief queries the buffer size in bytes needed to call tf::cuda_min_element + + @tparam T value type + + @param count number of elements to search + + The function is used to decide the buffer size in bytes for calling + tf::cuda_min_element. 
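+
+  For instance, a caller would query the size and allocate the scratch space
+  before invoking the corresponding algorithm (@c N is a placeholder and error
+  checking is omitted):
+
+  @code{.cpp}
+  // buffer needed to run tf::cuda_min_element over N floats
+  auto bytes = tf::cudaDefaultExecutionPolicy::min_element_bufsz<float>(N);
+
+  void* buf = nullptr;
+  cudaMalloc(&buf, bytes);
+  // ... invoke the algorithm with this policy and buf, then synchronize ...
+  cudaFree(buf);
+  @endcode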
+ */ + template <typename T> + static unsigned min_element_bufsz(unsigned count); + + /** + @brief queries the buffer size in bytes needed to call tf::cuda_max_element + + @tparam T value type + + @param count number of elements to search + + The function is used to decide the buffer size in bytes for calling + tf::cuda_max_element. + */ + template <typename T> + static unsigned max_element_bufsz(unsigned count); + + /** + @brief queries the buffer size in bytes needed to call scan kernels + + @tparam T value type + + @param count number of elements to scan + + The function is used to allocate a buffer for calling + tf::cuda_inclusive_scan, tf::cuda_exclusive_scan, + tf::cuda_transform_inclusive_scan, and tf::cuda_transform_exclusive_scan. + */ + template <typename T> + static unsigned scan_bufsz(unsigned count); + + /** + @brief queries the buffer size in bytes needed for CUDA merge algorithms + + @param a_count number of elements in the first vector to merge + @param b_count number of elements in the second vector to merge + + The buffer size of merge algorithm does not depend on the data type. + The buffer is purely used only for storing temporary indices + (of type @c unsigned) required during the merge process. + + The function is used to allocate a buffer for calling + tf::cuda_merge and tf::cuda_merge_by_key. + */ + inline static unsigned merge_bufsz(unsigned a_count, unsigned b_count); + + private: + + cudaStream_t _stream {0}; +}; + +/** +@brief default execution policy + */ +using cudaDefaultExecutionPolicy = cudaExecutionPolicy<512, 7>; + +} // end of namespace tf ----------------------------------------------------- + + + diff --git a/myxpcs/include/taskflow_/cuda/cuda_graph.hpp b/myxpcs/include/taskflow_/cuda/cuda_graph.hpp new file mode 100644 index 0000000..a326aed --- /dev/null +++ b/myxpcs/include/taskflow_/cuda/cuda_graph.hpp @@ -0,0 +1,805 @@ +#pragma once + +#include "cuda_memory.hpp" +#include "cuda_stream.hpp" +#include "cuda_meta.hpp" + +#include "../utility/traits.hpp" + +namespace tf { + +// ---------------------------------------------------------------------------- +// cudaGraph_t routines +// ---------------------------------------------------------------------------- + +/** +@brief gets the memcpy node parameter of a copy task +*/ +template <typename T, + std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr +> +cudaMemcpy3DParms cuda_get_copy_parms(T* tgt, const T* src, size_t num) { + + using U = std::decay_t<T>; + + cudaMemcpy3DParms p; + + p.srcArray = nullptr; + p.srcPos = ::make_cudaPos(0, 0, 0); + p.srcPtr = ::make_cudaPitchedPtr(const_cast<T*>(src), num*sizeof(U), num, 1); + p.dstArray = nullptr; + p.dstPos = ::make_cudaPos(0, 0, 0); + p.dstPtr = ::make_cudaPitchedPtr(tgt, num*sizeof(U), num, 1); + p.extent = ::make_cudaExtent(num*sizeof(U), 1, 1); + p.kind = cudaMemcpyDefault; + + return p; +} + +/** +@brief gets the memcpy node parameter of a memcpy task (untyped) +*/ +inline cudaMemcpy3DParms cuda_get_memcpy_parms( + void* tgt, const void* src, size_t bytes +) { + + // Parameters in cudaPitchedPtr + // d - Pointer to allocated memory + // p - Pitch of allocated memory in bytes + // xsz - Logical width of allocation in elements + // ysz - Logical height of allocation in elements + cudaMemcpy3DParms p; + p.srcArray = nullptr; + p.srcPos = ::make_cudaPos(0, 0, 0); + p.srcPtr = ::make_cudaPitchedPtr(const_cast<void*>(src), bytes, bytes, 1); + p.dstArray = nullptr; + p.dstPos = ::make_cudaPos(0, 0, 0); + p.dstPtr = ::make_cudaPitchedPtr(tgt, bytes, 
bytes, 1); + p.extent = ::make_cudaExtent(bytes, 1, 1); + p.kind = cudaMemcpyDefault; + + return p; +} + +/** +@brief gets the memset node parameter of a memcpy task (untyped) +*/ +inline cudaMemsetParams cuda_get_memset_parms(void* dst, int ch, size_t count) { + + cudaMemsetParams p; + p.dst = dst; + p.value = ch; + p.pitch = 0; + //p.elementSize = (count & 1) == 0 ? ((count & 3) == 0 ? 4 : 2) : 1; + //p.width = (count & 1) == 0 ? ((count & 3) == 0 ? count >> 2 : count >> 1) : count; + p.elementSize = 1; // either 1, 2, or 4 + p.width = count; + p.height = 1; + + return p; +} + +/** +@brief gets the memset node parameter of a fill task (typed) +*/ +template <typename T, std::enable_if_t< + is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr +> +cudaMemsetParams cuda_get_fill_parms(T* dst, T value, size_t count) { + + cudaMemsetParams p; + p.dst = dst; + + // perform bit-wise copy + p.value = 0; // crucial + static_assert(sizeof(T) <= sizeof(p.value), "internal error"); + std::memcpy(&p.value, &value, sizeof(T)); + + p.pitch = 0; + p.elementSize = sizeof(T); // either 1, 2, or 4 + p.width = count; + p.height = 1; + + return p; +} + +/** +@brief gets the memset node parameter of a zero task (typed) +*/ +template <typename T, std::enable_if_t< + is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr +> +cudaMemsetParams cuda_get_zero_parms(T* dst, size_t count) { + + cudaMemsetParams p; + p.dst = dst; + p.value = 0; + p.pitch = 0; + p.elementSize = sizeof(T); // either 1, 2, or 4 + p.width = count; + p.height = 1; + + return p; +} + +/** +@brief queries the number of root nodes in a native CUDA graph +*/ +inline size_t cuda_graph_get_num_root_nodes(cudaGraph_t graph) { + size_t num_nodes; + TF_CHECK_CUDA( + cudaGraphGetRootNodes(graph, nullptr, &num_nodes), + "failed to get native graph root nodes" + ); + return num_nodes; +} + +/** +@brief queries the number of nodes in a native CUDA graph +*/ +inline size_t cuda_graph_get_num_nodes(cudaGraph_t graph) { + size_t num_nodes; + TF_CHECK_CUDA( + cudaGraphGetNodes(graph, nullptr, &num_nodes), + "failed to get native graph nodes" + ); + return num_nodes; +} + +/** +@brief queries the number of edges in a native CUDA graph +*/ +inline size_t cuda_graph_get_num_edges(cudaGraph_t graph) { + size_t num_edges; + TF_CHECK_CUDA( + cudaGraphGetEdges(graph, nullptr, nullptr, &num_edges), + "failed to get native graph edges" + ); + return num_edges; +} + +/** +@brief acquires the nodes in a native CUDA graph +*/ +inline std::vector<cudaGraphNode_t> cuda_graph_get_nodes(cudaGraph_t graph) { + size_t num_nodes = cuda_graph_get_num_nodes(graph); + std::vector<cudaGraphNode_t> nodes(num_nodes); + TF_CHECK_CUDA( + cudaGraphGetNodes(graph, nodes.data(), &num_nodes), + "failed to get native graph nodes" + ); + return nodes; +} + +/** +@brief acquires the root nodes in a native CUDA graph +*/ +inline std::vector<cudaGraphNode_t> cuda_graph_get_root_nodes(cudaGraph_t graph) { + size_t num_nodes = cuda_graph_get_num_root_nodes(graph); + std::vector<cudaGraphNode_t> nodes(num_nodes); + TF_CHECK_CUDA( + cudaGraphGetRootNodes(graph, nodes.data(), &num_nodes), + "failed to get native graph nodes" + ); + return nodes; +} + +/** +@brief acquires the edges in a native CUDA graph +*/ +inline std::vector<std::pair<cudaGraphNode_t, cudaGraphNode_t>> +cuda_graph_get_edges(cudaGraph_t graph) { + size_t num_edges = cuda_graph_get_num_edges(graph); + std::vector<cudaGraphNode_t> froms(num_edges), tos(num_edges); + 
TF_CHECK_CUDA( + cudaGraphGetEdges(graph, froms.data(), tos.data(), &num_edges), + "failed to get native graph edges" + ); + std::vector<std::pair<cudaGraphNode_t, cudaGraphNode_t>> edges(num_edges); + for(size_t i=0; i<num_edges; i++) { + edges[i] = std::make_pair(froms[i], tos[i]); + } + return edges; +} + +/** +@brief queries the type of a native CUDA graph node + +valid type values are: + + cudaGraphNodeTypeKernel = 0x00 + + cudaGraphNodeTypeMemcpy = 0x01 + + cudaGraphNodeTypeMemset = 0x02 + + cudaGraphNodeTypeHost = 0x03 + + cudaGraphNodeTypeGraph = 0x04 + + cudaGraphNodeTypeEmpty = 0x05 + + cudaGraphNodeTypeWaitEvent = 0x06 + + cudaGraphNodeTypeEventRecord = 0x07 +*/ +inline cudaGraphNodeType cuda_get_graph_node_type(cudaGraphNode_t node) { + cudaGraphNodeType type; + TF_CHECK_CUDA( + cudaGraphNodeGetType(node, &type), "failed to get native graph node type" + ); + return type; +} + +/** +@brief convert the type of a native CUDA graph node to a readable string +*/ +inline const char* cuda_graph_node_type_to_string(cudaGraphNodeType type) { + switch(type) { + case cudaGraphNodeTypeKernel : return "kernel"; + case cudaGraphNodeTypeMemcpy : return "memcpy"; + case cudaGraphNodeTypeMemset : return "memset"; + case cudaGraphNodeTypeHost : return "host"; + case cudaGraphNodeTypeGraph : return "graph"; + case cudaGraphNodeTypeEmpty : return "empty"; + case cudaGraphNodeTypeWaitEvent : return "event_wait"; + case cudaGraphNodeTypeEventRecord : return "event_record"; + default : return "undefined"; + } +} + +/** +@brief dumps a native CUDA graph and all associated child graphs to a DOT format + +@tparam T output stream target +@param os target output stream +@param graph native CUDA graph +*/ +template <typename T> +void cuda_dump_graph(T& os, cudaGraph_t g) { + + os << "digraph cudaGraph {\n"; + + std::stack<std::tuple<cudaGraph_t, cudaGraphNode_t, int>> stack; + stack.push(std::make_tuple(g, nullptr, 1)); + + int pl = 0; + + while(stack.empty() == false) { + + auto [graph, parent, l] = stack.top(); + stack.pop(); + + for(int i=0; i<pl-l+1; i++) { + os << "}\n"; + } + + os << "subgraph cluster_p" << graph << " {\n" + << "label=\"cudaGraph-L" << l << "\";\n" + << "color=\"purple\";\n"; + + auto nodes = cuda_graph_get_nodes(graph); + auto edges = cuda_graph_get_edges(graph); + + for(auto& [from, to] : edges) { + os << 'p' << from << " -> " << 'p' << to << ";\n"; + } + + for(auto& node : nodes) { + auto type = cuda_get_graph_node_type(node); + if(type == cudaGraphNodeTypeGraph) { + + cudaGraph_t child_graph; + TF_CHECK_CUDA(cudaGraphChildGraphNodeGetGraph(node, &child_graph), ""); + stack.push(std::make_tuple(child_graph, node, l+1)); + + os << 'p' << node << "[" + << "shape=folder, style=filled, fontcolor=white, fillcolor=purple, " + << "label=\"cudaGraph-L" << l+1 + << "\"];\n"; + } + else { + os << 'p' << node << "[label=\"" + << cuda_graph_node_type_to_string(type) + << "\"];\n"; + } + } + + // precede to parent + if(parent != nullptr) { + std::unordered_set<cudaGraphNode_t> successors; + for(const auto& p : edges) { + successors.insert(p.first); + } + for(auto node : nodes) { + if(successors.find(node) == successors.end()) { + os << 'p' << node << " -> " << 'p' << parent << ";\n"; + } + } + } + + // set the previous level + pl = l; + } + + for(int i=0; i<=pl; i++) { + os << "}\n"; + } +} + +// ---------------------------------------------------------------------------- +// cudaGraph +// ---------------------------------------------------------------------------- + +/** +@private +*/ +struct 
cudaGraphCreator { + cudaGraph_t operator () () const { + cudaGraph_t g; + TF_CHECK_CUDA(cudaGraphCreate(&g, 0), "failed to create a CUDA native graph"); + return g; + } +}; + +/** +@private +*/ +struct cudaGraphDeleter { + void operator () (cudaGraph_t g) const { + if(g) { + cudaGraphDestroy(g); + } + } +}; + +/** +@class cudaGraph + +@brief class to create an RAII-styled wrapper over a CUDA executable graph + +A cudaGraph object is an RAII-styled wrapper over +a native CUDA graph (@c cudaGraph_t). +A cudaGraph object is move-only. +*/ +class cudaGraph : + public cudaObject<cudaGraph_t, cudaGraphCreator, cudaGraphDeleter> { + + public: + + /** + @brief constructs an RAII-styled object from the given CUDA exec + + Constructs a cudaGraph object from the given CUDA graph @c native. + */ + explicit cudaGraph(cudaGraph_t native) : cudaObject(native) { } + + /** + @brief constructs a cudaGraph object with a new CUDA graph + */ + cudaGraph() = default; +}; + +// ---------------------------------------------------------------------------- +// cudaGraphExec +// ---------------------------------------------------------------------------- + +/** +@private +*/ +struct cudaGraphExecCreator { + cudaGraphExec_t operator () () const { return nullptr; } +}; + +/** +@private +*/ +struct cudaGraphExecDeleter { + void operator () (cudaGraphExec_t executable) const { + if(executable) { + cudaGraphExecDestroy(executable); + } + } +}; + +/** +@class cudaGraphExec + +@brief class to create an RAII-styled wrapper over a CUDA executable graph + +A cudaGraphExec object is an RAII-styled wrapper over +a native CUDA executable graph (@c cudaGraphExec_t). +A cudaGraphExec object is move-only. +*/ +class cudaGraphExec : + public cudaObject<cudaGraphExec_t, cudaGraphExecCreator, cudaGraphExecDeleter> { + + public: + + /** + @brief constructs an RAII-styled object from the given CUDA exec + + Constructs a cudaGraphExec object which owns @c exec. 
+ */ + explicit cudaGraphExec(cudaGraphExec_t exec) : cudaObject(exec) { } + + /** + @brief default constructor + */ + cudaGraphExec() = default; + + /** + @brief instantiates the exexutable from the given CUDA graph + */ + void instantiate(cudaGraph_t graph) { + cudaGraphExecDeleter {} (object); + TF_CHECK_CUDA( + cudaGraphInstantiate(&object, graph, nullptr, nullptr, 0), + "failed to create an executable graph" + ); + } + + /** + @brief updates the exexutable from the given CUDA graph + */ + cudaGraphExecUpdateResult update(cudaGraph_t graph) { + cudaGraphNode_t error_node; + cudaGraphExecUpdateResult error_result; + cudaGraphExecUpdate(object, graph, &error_node, &error_result); + return error_result; + } + + /** + @brief launchs the executable graph via the given stream + */ + void launch(cudaStream_t stream) { + TF_CHECK_CUDA( + cudaGraphLaunch(object, stream), "failed to launch a CUDA executable graph" + ); + } +}; + +// ---------------------------------------------------------------------------- +// cudaFlowGraph class +// ---------------------------------------------------------------------------- + +// class: cudaFlowGraph +class cudaFlowGraph { + + friend class cudaFlowNode; + friend class cudaTask; + friend class cudaFlowCapturer; + friend class cudaFlow; + friend class cudaFlowOptimizerBase; + friend class cudaFlowSequentialOptimizer; + friend class cudaFlowLinearOptimizer; + friend class cudaFlowRoundRobinOptimizer; + friend class Taskflow; + friend class Executor; + + constexpr static int OFFLOADED = 0x01; + constexpr static int CHANGED = 0x02; + constexpr static int UPDATED = 0x04; + + public: + + cudaFlowGraph() = default; + ~cudaFlowGraph() = default; + + cudaFlowGraph(const cudaFlowGraph&) = delete; + cudaFlowGraph(cudaFlowGraph&&) = default; + + cudaFlowGraph& operator = (const cudaFlowGraph&) = delete; + cudaFlowGraph& operator = (cudaFlowGraph&&) = default; + + template <typename... 
ArgsT> + cudaFlowNode* emplace_back(ArgsT&&...); + + bool empty() const; + + void clear(); + void dump(std::ostream&, const void*, const std::string&) const ; + + private: + + int _state{CHANGED}; + cudaGraph _native_handle {nullptr}; + std::vector<std::unique_ptr<cudaFlowNode>> _nodes; +}; + +// ---------------------------------------------------------------------------- +// cudaFlowNode class +// ---------------------------------------------------------------------------- + +/** +@private +@class: cudaFlowNode +*/ +class cudaFlowNode { + + friend class cudaFlowGraph; + friend class cudaTask; + friend class cudaFlow; + friend class cudaFlowCapturer; + friend class cudaFlowOptimizerBase; + friend class cudaFlowSequentialOptimizer; + friend class cudaFlowLinearOptimizer; + friend class cudaFlowRoundRobinOptimizer; + friend class Taskflow; + friend class Executor; + + // Empty handle + struct Empty { + }; + + // Host handle + struct Host { + + template <typename C> + Host(C&&); + + std::function<void()> func; + + static void callback(void*); + }; + + // Memset handle + struct Memset { + }; + + // Memcpy handle + struct Memcpy { + }; + + // Kernel handle + struct Kernel { + + template <typename F> + Kernel(F&& f); + + void* func {nullptr}; + }; + + // Subflow handle + struct Subflow { + cudaFlowGraph cfg; + }; + + // Capture + struct Capture { + + template <typename C> + Capture(C&&); + + std::function<void(cudaStream_t)> work; + + cudaEvent_t event; + size_t level; + size_t lid; + size_t idx; + }; + + using handle_t = std::variant< + Empty, + Host, + Memset, + Memcpy, + Kernel, + Subflow, + Capture + >; + + public: + + // variant index + constexpr static auto EMPTY = get_index_v<Empty, handle_t>; + constexpr static auto HOST = get_index_v<Host, handle_t>; + constexpr static auto MEMSET = get_index_v<Memset, handle_t>; + constexpr static auto MEMCPY = get_index_v<Memcpy, handle_t>; + constexpr static auto KERNEL = get_index_v<Kernel, handle_t>; + constexpr static auto SUBFLOW = get_index_v<Subflow, handle_t>; + constexpr static auto CAPTURE = get_index_v<Capture, handle_t>; + + cudaFlowNode() = delete; + + template <typename... ArgsT> + cudaFlowNode(cudaFlowGraph&, ArgsT&&...); + + private: + + cudaFlowGraph& _cfg; + + std::string _name; + + handle_t _handle; + + cudaGraphNode_t _native_handle {nullptr}; + + SmallVector<cudaFlowNode*> _successors; + SmallVector<cudaFlowNode*> _dependents; + + void _precede(cudaFlowNode*); +}; + +// ---------------------------------------------------------------------------- +// cudaFlowNode definitions +// ---------------------------------------------------------------------------- + +// Host handle constructor +template <typename C> +cudaFlowNode::Host::Host(C&& c) : func {std::forward<C>(c)} { +} + +// Host callback +inline void cudaFlowNode::Host::callback(void* data) { + static_cast<Host*>(data)->func(); +}; + +// Kernel handle constructor +template <typename F> +cudaFlowNode::Kernel::Kernel(F&& f) : + func {std::forward<F>(f)} { +} + +// Capture handle constructor +template <typename C> +cudaFlowNode::Capture::Capture(C&& c) : + work {std::forward<C>(c)} { +} + +// Constructor +template <typename... ArgsT> +cudaFlowNode::cudaFlowNode(cudaFlowGraph& graph, ArgsT&&... 
args) : + _cfg {graph}, + _handle {std::forward<ArgsT>(args)...} { +} + +// Procedure: _precede +inline void cudaFlowNode::_precede(cudaFlowNode* v) { + + _cfg._state |= cudaFlowGraph::CHANGED; + + _successors.push_back(v); + v->_dependents.push_back(this); + + // capture node doesn't have the native graph yet + if(_handle.index() != cudaFlowNode::CAPTURE) { + TF_CHECK_CUDA( + cudaGraphAddDependencies( + _cfg._native_handle, &_native_handle, &v->_native_handle, 1 + ), + "failed to add a preceding link ", this, "->", v + ); + } +} + +// ---------------------------------------------------------------------------- +// cudaGraph definitions +// ---------------------------------------------------------------------------- + +// Function: empty +inline bool cudaFlowGraph::empty() const { + return _nodes.empty(); +} + +// Procedure: clear +inline void cudaFlowGraph::clear() { + _state |= cudaFlowGraph::CHANGED; + _nodes.clear(); + _native_handle.clear(); +} + +// Function: emplace_back +template <typename... ArgsT> +cudaFlowNode* cudaFlowGraph::emplace_back(ArgsT&&... args) { + + _state |= cudaFlowGraph::CHANGED; + + auto node = std::make_unique<cudaFlowNode>(std::forward<ArgsT>(args)...); + _nodes.emplace_back(std::move(node)); + return _nodes.back().get(); + + // TODO: use object pool to save memory + //auto node = new cudaFlowNode(std::forward<ArgsT>(args)...); + //_nodes.push_back(node); + //return node; +} + +// Procedure: dump the graph to a DOT format +inline void cudaFlowGraph::dump( + std::ostream& os, const void* root, const std::string& root_name +) const { + + // recursive dump with stack + std::stack<std::tuple<const cudaFlowGraph*, const cudaFlowNode*, int>> stack; + stack.push(std::make_tuple(this, nullptr, 1)); + + int pl = 0; + + while(!stack.empty()) { + + auto [graph, parent, l] = stack.top(); + stack.pop(); + + for(int i=0; i<pl-l+1; i++) { + os << "}\n"; + } + + if(parent == nullptr) { + if(root) { + os << "subgraph cluster_p" << root << " {\nlabel=\"cudaFlow: "; + if(root_name.empty()) os << 'p' << root; + else os << root_name; + os << "\";\n" << "color=\"purple\"\n"; + } + else { + os << "digraph cudaFlow {\n"; + } + } + else { + os << "subgraph cluster_p" << parent << " {\nlabel=\"cudaSubflow: "; + if(parent->_name.empty()) os << 'p' << parent; + else os << parent->_name; + os << "\";\n" << "color=\"purple\"\n"; + } + + for(auto& node : graph->_nodes) { + + auto v = node.get(); + + os << 'p' << v << "[label=\""; + if(v->_name.empty()) { + os << 'p' << v << "\""; + } + else { + os << v->_name << "\""; + } + + switch(v->_handle.index()) { + case cudaFlowNode::KERNEL: + os << " style=\"filled\"" + << " color=\"white\" fillcolor=\"black\"" + << " fontcolor=\"white\"" + << " shape=\"box3d\""; + break; + + case cudaFlowNode::SUBFLOW: + stack.push(std::make_tuple( + &(std::get_if<cudaFlowNode::Subflow>(&v->_handle)->cfg), v, l+1) + ); + os << " style=\"filled\"" + << " color=\"black\" fillcolor=\"purple\"" + << " fontcolor=\"white\"" + << " shape=\"folder\""; + break; + + default: + break; + } + + os << "];\n"; + + for(const auto s : v->_successors) { + os << 'p' << v << " -> " << 'p' << s << ";\n"; + } + + if(v->_successors.size() == 0) { + if(parent == nullptr) { + if(root) { + os << 'p' << v << " -> p" << root << ";\n"; + } + } + else { + os << 'p' << v << " -> p" << parent << ";\n"; + } + } + } + + // set the previous level + pl = l; + } + + for(int i=0; i<pl; i++) { + os << "}\n"; + } + +} + + +} // end of namespace tf ----------------------------------------------------- + + 
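
The following is a minimal usage sketch of the helpers and RAII wrappers defined in cuda_graph.hpp above; it is illustrative only and not part of the patch. The include path (assuming the compiler is invoked with -I myxpcs/include), the buffer size N, and the host/device buffers are assumptions made for this example.

// sketch: build a two-node native CUDA graph with the tf:: helpers above,
// wrap it in the RAII classes, instantiate it, and launch it once
#include <iostream>
#include <vector>
#include <taskflow_/cuda/cuda_graph.hpp>   // assumed include path for this package

int main() {
  constexpr size_t N = 1024;               // illustrative buffer size

  int* d_buf {nullptr};
  TF_CHECK_CUDA(cudaMalloc(&d_buf, N*sizeof(int)), "cudaMalloc failed");
  std::vector<int> h_buf(N, -1);

  tf::cudaGraph graph;                     // default ctor creates an empty cudaGraph_t

  // node 1: zero the device buffer
  cudaGraphNode_t memset_node;
  auto zero_parms = tf::cuda_get_zero_parms(d_buf, N);
  TF_CHECK_CUDA(
    cudaGraphAddMemsetNode(&memset_node, graph, nullptr, 0, &zero_parms),
    "failed to add memset node"
  );

  // node 2: copy the zeroed buffer back to the host, after the memset
  cudaGraphNode_t memcpy_node;
  auto copy_parms = tf::cuda_get_copy_parms(h_buf.data(), d_buf, N);
  TF_CHECK_CUDA(
    cudaGraphAddMemcpyNode(&memcpy_node, graph, &memset_node, 1, &copy_parms),
    "failed to add memcpy node"
  );

  tf::cuda_dump_graph(std::cout, graph);   // DOT dump of the two-node graph

  // instantiate and launch through the executable-graph wrapper
  tf::cudaGraphExec exec;
  exec.instantiate(graph);

  cudaStream_t stream;
  TF_CHECK_CUDA(cudaStreamCreate(&stream), "failed to create a stream");
  exec.launch(stream);
  TF_CHECK_CUDA(cudaStreamSynchronize(stream), "failed to synchronize the stream");

  std::cout << "h_buf[0] = " << h_buf[0] << '\n';   // expect 0

  TF_CHECK_CUDA(cudaStreamDestroy(stream), "failed to destroy the stream");
  TF_CHECK_CUDA(cudaFree(d_buf), "cudaFree failed");
}

Because tf::cudaGraph and tf::cudaGraphExec are move-only RAII wrappers, the native graph and executable are destroyed automatically when they go out of scope; when only node parameters change and the topology stays the same, exec.update(graph) can refresh the instantiated executable without a full re-instantiation.
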
+ + diff --git a/myxpcs/include/taskflow_/cuda/cuda_memory.hpp b/myxpcs/include/taskflow_/cuda/cuda_memory.hpp new file mode 100644 index 0000000..0740d49 --- /dev/null +++ b/myxpcs/include/taskflow_/cuda/cuda_memory.hpp @@ -0,0 +1,855 @@ +#pragma once + +#include "cuda_device.hpp" + +/** +@file cuda_memory.hpp +@brief CUDA memory utilities include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// memory +// ---------------------------------------------------------------------------- + +/** +@brief queries the free memory (expensive call) +*/ +inline size_t cuda_get_free_mem(int d) { + cudaScopedDevice ctx(d); + size_t free, total; + TF_CHECK_CUDA( + cudaMemGetInfo(&free, &total), "failed to get mem info on device ", d + ); + return free; +} + +/** +@brief queries the total available memory (expensive call) +*/ +inline size_t cuda_get_total_mem(int d) { + cudaScopedDevice ctx(d); + size_t free, total; + TF_CHECK_CUDA( + cudaMemGetInfo(&free, &total), "failed to get mem info on device ", d + ); + return total; +} + +/** +@brief allocates memory on the given device for holding @c N elements of type @c T + +The function calls @c cudaMalloc to allocate <tt>N*sizeof(T)</tt> bytes of memory +on the given device @c d and returns a pointer to the starting address of +the device memory. +*/ +template <typename T> +T* cuda_malloc_device(size_t N, int d) { + cudaScopedDevice ctx(d); + T* ptr {nullptr}; + TF_CHECK_CUDA( + cudaMalloc(&ptr, N*sizeof(T)), + "failed to allocate memory (", N*sizeof(T), "bytes) on device ", d + ) + return ptr; +} + +/** +@brief allocates memory on the current device associated with the caller + +The function calls malloc_device from the current device associated +with the caller. +*/ +template <typename T> +T* cuda_malloc_device(size_t N) { + T* ptr {nullptr}; + TF_CHECK_CUDA( + cudaMalloc(&ptr, N*sizeof(T)), + "failed to allocate memory (", N*sizeof(T), "bytes)" + ) + return ptr; +} + +/** +@brief allocates shared memory for holding @c N elements of type @c T + +The function calls @c cudaMallocManaged to allocate <tt>N*sizeof(T)</tt> bytes +of memory and returns a pointer to the starting address of the shared memory. +*/ +template <typename T> +T* cuda_malloc_shared(size_t N) { + T* ptr {nullptr}; + TF_CHECK_CUDA( + cudaMallocManaged(&ptr, N*sizeof(T)), + "failed to allocate shared memory (", N*sizeof(T), "bytes)" + ) + return ptr; +} + +/** +@brief frees memory on the GPU device + +@tparam T pointer type +@param ptr device pointer to memory to free +@param d device context identifier + +This methods call @c cudaFree to free the memory space pointed to by @c ptr +using the given device context. +*/ +template <typename T> +void cuda_free(T* ptr, int d) { + cudaScopedDevice ctx(d); + TF_CHECK_CUDA(cudaFree(ptr), "failed to free memory ", ptr, " on GPU ", d); +} + +/** +@brief frees memory on the GPU device + +@tparam T pointer type +@param ptr device pointer to memory to free + +This methods call @c cudaFree to free the memory space pointed to by @c ptr +using the current device context of the caller. 
+*/ +template <typename T> +void cuda_free(T* ptr) { + TF_CHECK_CUDA(cudaFree(ptr), "failed to free memory ", ptr); +} + +/** +@brief copies data between host and device asynchronously through a stream + +@param stream stream identifier +@param dst destination memory address +@param src source memory address +@param count size in bytes to copy + +The method calls @c cudaMemcpyAsync with the given @c stream +using @c cudaMemcpyDefault to infer the memory space of the source and +the destination pointers. The memory areas may not overlap. +*/ +inline void cuda_memcpy_async( + cudaStream_t stream, void* dst, const void* src, size_t count +) { + TF_CHECK_CUDA( + cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream), + "failed to perform cudaMemcpyAsync" + ); +} + +/** +@brief initializes or sets GPU memory to the given value byte by byte + +@param stream stream identifier +@param devPtr pointer to GPU mempry +@param value value to set for each byte of the specified memory +@param count size in bytes to set + +The method calls @c cudaMemsetAsync with the given @c stream +to fill the first @c count bytes of the memory area pointed to by @c devPtr +with the constant byte value @c value. +*/ +inline void cuda_memset_async( + cudaStream_t stream, void* devPtr, int value, size_t count +){ + TF_CHECK_CUDA( + cudaMemsetAsync(devPtr, value, count, stream), + "failed to perform cudaMemsetAsync" + ); +} + +// ---------------------------------------------------------------------------- +// Shared Memory +// ---------------------------------------------------------------------------- +// +// Because dynamically sized shared memory arrays are declared "extern", +// we can't templatize them directly. To get around this, we declare a +// simple wrapper struct that will declare the extern array with a different +// name depending on the type. This avoids compiler errors about duplicate +// definitions. +// +// To use dynamically allocated shared memory in a templatized __global__ or +// __device__ function, just replace code like this: +// +// template<class T> +// __global__ void +// foo( T* g_idata, T* g_odata) +// { +// // Shared mem size is determined by the host app at run time +// extern __shared__ T sdata[]; +// ... +// doStuff(sdata); +// ... +// } +// +// With this: +// +// template<class T> +// __global__ void +// foo( T* g_idata, T* g_odata) +// { +// // Shared mem size is determined by the host app at run time +// cudaSharedMemory<T> smem; +// T* sdata = smem.get(); +// ... +// doStuff(sdata); +// ... +// } +// ---------------------------------------------------------------------------- + +// This is the un-specialized struct. Note that we prevent instantiation of this +// struct by putting an undefined symbol in the function body so it won't compile. +/** +@private +*/ +template <typename T> +struct cudaSharedMemory +{ + // Ensure that we won't compile any un-specialized types + __device__ T *get() + { + extern __device__ void error(void); + error(); + return NULL; + } +}; + +// Following are the specializations for the following types. +// int, uint, char, uchar, short, ushort, long, ulong, bool, float, and double +// One could also specialize it for user-defined types. 
+ +/** +@private +*/ +template <> +struct cudaSharedMemory <int> +{ + __device__ int *get() + { + extern __shared__ int s_int[]; + return s_int; + } +}; + +/** +@private +*/ +template <> +struct cudaSharedMemory <unsigned int> +{ + __device__ unsigned int *get() + { + extern __shared__ unsigned int s_uint[]; + return s_uint; + } +}; + +/** +@private +*/ +template <> +struct cudaSharedMemory <char> +{ + __device__ char *get() + { + extern __shared__ char s_char[]; + return s_char; + } +}; + +/** +@private +*/ +template <> +struct cudaSharedMemory <unsigned char> +{ + __device__ unsigned char *get() + { + extern __shared__ unsigned char s_uchar[]; + return s_uchar; + } +}; + +/** +@private +*/ +template <> +struct cudaSharedMemory <short> +{ + __device__ short *get() + { + extern __shared__ short s_short[]; + return s_short; + } +}; + +/** +@private +*/ +template <> +struct cudaSharedMemory <unsigned short> +{ + __device__ unsigned short *get() + { + extern __shared__ unsigned short s_ushort[]; + return s_ushort; + } +}; + +/** +@private +*/ +template <> +struct cudaSharedMemory <long> +{ + __device__ long *get() + { + extern __shared__ long s_long[]; + return s_long; + } +}; + +/** +@private +*/ +template <> +struct cudaSharedMemory <unsigned long> +{ + __device__ unsigned long *get() + { + extern __shared__ unsigned long s_ulong[]; + return s_ulong; + } +}; + +//template <> +//struct cudaSharedMemory <size_t> +//{ +// __device__ size_t *get() +// { +// extern __shared__ size_t s_sizet[]; +// return s_sizet; +// } +//}; + +/** +@private +*/ +template <> +struct cudaSharedMemory <bool> +{ + __device__ bool *get() + { + extern __shared__ bool s_bool[]; + return s_bool; + } +}; + +/** +@private +*/ +template <> +struct cudaSharedMemory <float> +{ + __device__ float *get() + { + extern __shared__ float s_float[]; + return s_float; + } +}; + +/** +@private +*/ +template <> +struct cudaSharedMemory <double> +{ + __device__ double *get() + { + extern __shared__ double s_double[]; + return s_double; + } +}; + + + +// ---------------------------------------------------------------------------- +// cudaDeviceAllocator +// ---------------------------------------------------------------------------- + +/** +@class cudaDeviceAllocator + +@brief class to create a CUDA device allocator + +@tparam T element type + +A %cudaDeviceAllocator enables device-specific allocation for +standard library containers. It is typically passed as template parameter +when declaring standard library containers (e.g. std::vector). +*/ +template<typename T> +class cudaDeviceAllocator { + + public: + + /** + @brief element type + */ + using value_type = T; + + /** + @brief element pointer type + */ + using pointer = T*; + + /** + @brief element reference type + */ + using reference = T&; + + /** + @brief const element pointer type + */ + using const_pointer = const T*; + + /** + @brief constant element reference type + */ + using const_reference = const T&; + + /** + @brief size type + */ + using size_type = std::size_t; + + /** + @brief pointer difference type + */ + using difference_type = std::ptrdiff_t; + + /** + @brief its member type @c U is the equivalent allocator type to allocate elements of type U + */ + template<typename U> + struct rebind { + /** + @brief allocator of a different data type + */ + using other = cudaDeviceAllocator<U>; + }; + + /** + @brief Constructs a device allocator object. 
+ */ + cudaDeviceAllocator() noexcept {} + + /** + @brief Constructs a device allocator object from another device allocator object. + */ + cudaDeviceAllocator( const cudaDeviceAllocator& ) noexcept {} + + /** + @brief Constructs a device allocator object from another device allocator + object with a different element type. + */ + template<typename U> + cudaDeviceAllocator( const cudaDeviceAllocator<U>& ) noexcept {} + + /** + @brief Destructs the device allocator object. + */ + ~cudaDeviceAllocator() noexcept {} + + /** + @brief Returns the address of x. + + This effectively means returning &x. + + @param x reference to an object + @return a pointer to the object + */ + pointer address( reference x ) { return &x; } + + /** + @brief Returns the address of x. + + This effectively means returning &x. + + @param x reference to an object + @return a pointer to the object + */ + const_pointer address( const_reference x ) const { return &x; } + + /** + @brief allocates block of storage. + + Attempts to allocate a block of storage with a size large enough to contain + @c n elements of member type, @c value_type, and returns a pointer + to the first element. + + The storage is aligned appropriately for object of type @c value_type, + but they are not constructed. + + The block of storage is allocated using cudaMalloc and throws std::bad_alloc + if it cannot allocate the total amount of storage requested. + + @param n number of elements (each of size sizeof(value_type)) to be allocated + @return a pointer to the initial element in the block of storage. + */ + pointer allocate( size_type n, std::allocator<void>::const_pointer = 0 ) + { + void* ptr = NULL; + TF_CHECK_CUDA( + cudaMalloc( &ptr, n*sizeof(T) ), + "failed to allocate ", n, " elements (", n*sizeof(T), "bytes)" + ) + return static_cast<pointer>(ptr); + } + + /** + @brief Releases a block of storage previously allocated with member allocate and not yet released + + The elements in the array are not destroyed by a call to this member function. + + @param ptr pointer to a block of storage previously allocated with allocate + */ + void deallocate( pointer ptr, size_type ) + { + if(ptr){ + cudaFree(ptr); + } + } + + /** + @brief returns the maximum number of elements that could potentially + be allocated by this allocator + + A call to member allocate with the value returned by this function + can still fail to allocate the requested storage. + + @return the nubmer of elements that might be allcoated as maximum + by a call to member allocate + */ + size_type max_size() const noexcept { return size_type {-1}; } + + /** + @brief ignored to avoid de-referencing device pointer from the host + */ + void construct( pointer, const_reference) { } + + /** + @brief ignored to avoid de-referencing device pointer from the host + */ + void destroy( pointer) { } + + /** + @brief compares two allocator of different types using @c == + + Device allocators of different types are always equal to each other + because the storage allocated by the allocator @c a1 can be deallocated + through @c a2. + */ + template <typename U> + bool operator == (const cudaDeviceAllocator<U>&) const noexcept { + return true; + } + + /** + @brief compares two allocator of different types using @c != + + Device allocators of different types are always equal to each other + because the storage allocated by the allocator @c a1 can be deallocated + through @c a2. 
+ */ + template <typename U> + bool operator != (const cudaDeviceAllocator<U>&) const noexcept { + return false; + } + +}; + +// ---------------------------------------------------------------------------- +// cudaUSMAllocator +// ---------------------------------------------------------------------------- + +/** +@class cudaUSMAllocator + +@brief class to create a unified shared memory (USM) allocator + +@tparam T element type + +A %cudaUSMAllocator enables using unified shared memory (USM) allocation for +standard library containers. It is typically passed as template parameter +when declaring standard library containers (e.g. std::vector). +*/ +template<typename T> +class cudaUSMAllocator { + + public: + + /** + @brief element type + */ + using value_type = T; + + /** + @brief element pointer type + */ + using pointer = T*; + + /** + @brief element reference type + */ + using reference = T&; + + /** + @brief const element pointer type + */ + using const_pointer = const T*; + + /** + @brief constant element reference type + */ + using const_reference = const T&; + + /** + @brief size type + */ + using size_type = std::size_t; + + /** + @brief pointer difference type + */ + using difference_type = std::ptrdiff_t; + + /** + @brief its member type @c U is the equivalent allocator type to allocate elements of type U + */ + template<typename U> + struct rebind { + /** + @brief allocator of a different data type + */ + using other = cudaUSMAllocator<U>; + }; + + /** + @brief Constructs a device allocator object. + */ + cudaUSMAllocator() noexcept {} + + /** + @brief Constructs a device allocator object from another device allocator object. + */ + cudaUSMAllocator( const cudaUSMAllocator& ) noexcept {} + + /** + @brief Constructs a device allocator object from another device allocator + object with a different element type. + */ + template<typename U> + cudaUSMAllocator( const cudaUSMAllocator<U>& ) noexcept {} + + /** + @brief Destructs the device allocator object. + */ + ~cudaUSMAllocator() noexcept {} + + /** + @brief Returns the address of x. + + This effectively means returning &x. + + @param x reference to an object + @return a pointer to the object + */ + pointer address( reference x ) { return &x; } + + /** + @brief Returns the address of x. + + This effectively means returning &x. + + @param x reference to an object + @return a pointer to the object + */ + const_pointer address( const_reference x ) const { return &x; } + + /** + @brief allocates block of storage. + + Attempts to allocate a block of storage with a size large enough to contain + @c n elements of member type, @c value_type, and returns a pointer + to the first element. + + The storage is aligned appropriately for object of type @c value_type, + but they are not constructed. + + The block of storage is allocated using cudaMalloc and throws std::bad_alloc + if it cannot allocate the total amount of storage requested. + + @param n number of elements (each of size sizeof(value_type)) to be allocated + @return a pointer to the initial element in the block of storage. + */ + pointer allocate( size_type n, std::allocator<void>::const_pointer = 0 ) + { + void* ptr {nullptr}; + TF_CHECK_CUDA( + cudaMallocManaged( &ptr, n*sizeof(T) ), + "failed to allocate ", n, " elements (", n*sizeof(T), "bytes)" + ) + return static_cast<pointer>(ptr); + } + + /** + @brief Releases a block of storage previously allocated with member allocate and not yet released + + The elements in the array are not destroyed by a call to this member function. 
+ + @param ptr pointer to a block of storage previously allocated with allocate + */ + void deallocate( pointer ptr, size_type ) + { + if(ptr){ + cudaFree(ptr); + } + } + + /** + @brief returns the maximum number of elements that could potentially + be allocated by this allocator + + A call to member allocate with the value returned by this function + can still fail to allocate the requested storage. + + @return the nubmer of elements that might be allcoated as maximum + by a call to member allocate + */ + size_type max_size() const noexcept { return size_type {-1}; } + + /** + @brief Constructs an element object on the location pointed by ptr. + @param ptr pointer to a location with enough storage soace to contain + an element of type @c value_type + + @param val value to initialize the constructed element to + */ + void construct( pointer ptr, const_reference val ) { + new ((void*)ptr) value_type(val); + } + + /** + @brief destroys in-place the object pointed by @c ptr + + Notice that this does not deallocate the storage for the element but calls + its destructor. + + @param ptr pointer to the object to be destroye + */ + void destroy( pointer ptr ) { + ptr->~value_type(); + } + + /** + @brief compares two allocator of different types using @c == + + USM allocators of different types are always equal to each other + because the storage allocated by the allocator @c a1 can be deallocated + through @c a2. + */ + template <typename U> + bool operator == (const cudaUSMAllocator<U>&) const noexcept { + return true; + } + + /** + @brief compares two allocator of different types using @c != + + USM allocators of different types are always equal to each other + because the storage allocated by the allocator @c a1 can be deallocated + through @c a2. + */ + template <typename U> + bool operator != (const cudaUSMAllocator<U>&) const noexcept { + return false; + } + +}; + +// ---------------------------------------------------------------------------- +// GPU vector object +// ---------------------------------------------------------------------------- + +//template <typename T> +//using cudaDeviceVector = std::vector<NoInit<T>, cudaDeviceAllocator<NoInit<T>>>; + +//template <typename T> +//using cudaUSMVector = std::vector<T, cudaUSMAllocator<T>>; + +/** +@private +*/ +template <typename T> +class cudaDeviceVector { + + public: + + cudaDeviceVector() = default; + + cudaDeviceVector(size_t N) : _N {N} { + if(N) { + TF_CHECK_CUDA( + cudaMalloc(&_data, N*sizeof(T)), + "failed to allocate device memory (", N*sizeof(T), " bytes)" + ); + } + } + + cudaDeviceVector(cudaDeviceVector&& rhs) : + _data{rhs._data}, _N {rhs._N} { + rhs._data = nullptr; + rhs._N = 0; + } + + ~cudaDeviceVector() { + if(_data) { + cudaFree(_data); + } + } + + cudaDeviceVector& operator = (cudaDeviceVector&& rhs) { + if(_data) { + cudaFree(_data); + } + _data = rhs._data; + _N = rhs._N; + rhs._data = nullptr; + rhs._N = 0; + return *this; + } + + size_t size() const { return _N; } + + T* data() { return _data; } + const T* data() const { return _data; } + + cudaDeviceVector(const cudaDeviceVector&) = delete; + cudaDeviceVector& operator = (const cudaDeviceVector&) = delete; + + private: + + T* _data {nullptr}; + size_t _N {0}; +}; + + +} // end of namespace tf ----------------------------------------------------- + + + + + + diff --git a/myxpcs/include/taskflow_/cuda/cuda_meta.hpp b/myxpcs/include/taskflow_/cuda/cuda_meta.hpp new file mode 100644 index 0000000..b08eb29 --- /dev/null +++ b/myxpcs/include/taskflow_/cuda/cuda_meta.hpp 
@@ -0,0 +1,452 @@ +#pragma once + +#include "cuda_execution_policy.hpp" + +namespace tf { + +// default warp size +inline constexpr unsigned CUDA_WARP_SIZE = 32; + +// empty type +struct cudaEmpty { }; + +// ---------------------------------------------------------------------------- +// iterator unrolling +// ---------------------------------------------------------------------------- + +// Template unrolled looping construct. +template<unsigned i, unsigned count, bool valid = (i < count)> +struct cudaIterate { + template<typename F> + __device__ static void eval(F f) { + f(i); + cudaIterate<i + 1, count>::eval(f); + } +}; + +template<unsigned i, unsigned count> +struct cudaIterate<i, count, false> { + template<typename F> + __device__ static void eval(F) { } +}; + +template<unsigned begin, unsigned end, typename F> +__device__ void cuda_iterate(F f) { + cudaIterate<begin, end>::eval(f); +} + +template<unsigned count, typename F> +__device__ void cuda_iterate(F f) { + cuda_iterate<0, count>(f); +} + +template<unsigned count, typename T> +__device__ T reduce(const T(&x)[count]) { + T y; + cuda_iterate<count>([&](auto i) { y = i ? x[i] + y : x[i]; }); + return y; +} + +template<unsigned count, typename T> +__device__ void fill(T(&x)[count], T val) { + cuda_iterate<count>([&](auto i) { x[i] = val; }); +} + +// Invoke unconditionally. +template<unsigned nt, unsigned vt, typename F> +__device__ void cuda_strided_iterate(F f, unsigned tid) { + cuda_iterate<vt>([=](auto i) { f(i, nt * i + tid); }); +} + +// Check range. +template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename F> +__device__ void cuda_strided_iterate(F f, unsigned tid, unsigned count) { + // Unroll the first vt0 elements of each thread. + if(vt0 > 1 && count >= nt * vt0) { + cuda_strided_iterate<nt, vt0>(f, tid); // No checking + } else { + cuda_iterate<vt0>([=](auto i) { + auto j = nt * i + tid; + if(j < count) f(i, j); + }); + } + + // TODO: seems dummy when vt0 == vt + cuda_iterate<vt0, vt>([=](auto i) { + auto j = nt * i + tid; + if(j < count) f(i, j); + }); +} + +template<unsigned vt, typename F> +__device__ void cuda_thread_iterate(F f, unsigned tid) { + cuda_iterate<vt>([=](auto i) { f(i, vt * tid + i); }); +} + +// ---------------------------------------------------------------------------- +// cudaRange +// ---------------------------------------------------------------------------- + +// cudaRange +struct cudaRange { + unsigned begin, end; + __device__ unsigned size() const { return end - begin; } + __device__ unsigned count() const { return size(); } + __device__ bool valid() const { return end > begin; } +}; + +inline __device__ cudaRange cuda_get_tile(unsigned b, unsigned nv, unsigned count) { + return cudaRange { nv * b, min(count, nv * (b + 1)) }; +} + + +// ---------------------------------------------------------------------------- +// cudaArray +// ---------------------------------------------------------------------------- + +template<typename T, unsigned size> +struct cudaArray { + T data[size]; + + __device__ T operator[](unsigned i) const { return data[i]; } + __device__ T& operator[](unsigned i) { return data[i]; } + + cudaArray() = default; + cudaArray(const cudaArray&) = default; + cudaArray& operator=(const cudaArray&) = default; + + // Fill the array with x. 
+ __device__ cudaArray(T x) { + cuda_iterate<size>([&](unsigned i) { data[i] = x; }); + } +}; + +template<typename T> +struct cudaArray<T, 0> { + __device__ T operator[](unsigned) const { return T(); } + __device__ T& operator[](unsigned) { return *(T*)nullptr; } +}; + +template<typename T, typename V, unsigned size> +struct cudaKVArray { + cudaArray<T, size> keys; + cudaArray<V, size> vals; +}; + +// ---------------------------------------------------------------------------- +// thread reg <-> global mem +// ---------------------------------------------------------------------------- + +template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename I> +__device__ auto cuda_mem_to_reg_strided(I mem, unsigned tid, unsigned count) { + using T = typename std::iterator_traits<I>::value_type; + cudaArray<T, vt> x; + cuda_strided_iterate<nt, vt, vt0>( + [&](auto i, auto j) { x[i] = mem[j]; }, tid, count + ); + return x; +} + +template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename T, typename it_t> +__device__ void cuda_reg_to_mem_strided( + cudaArray<T, vt> x, unsigned tid, unsigned count, it_t mem) { + + cuda_strided_iterate<nt, vt, vt0>( + [=](auto i, auto j) { mem[j] = x[i]; }, tid, count + ); +} + +template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename I, typename O> +__device__ auto cuda_transform_mem_to_reg_strided( + I mem, unsigned tid, unsigned count, O op +) { + using T = std::invoke_result_t<O, typename std::iterator_traits<I>::value_type>; + cudaArray<T, vt> x; + cuda_strided_iterate<nt, vt, vt0>( + [&](auto i, auto j) { x[i] = op(mem[j]); }, tid, count + ); + return x; +} + +// ---------------------------------------------------------------------------- +// thread reg <-> shared +// ---------------------------------------------------------------------------- + +template<unsigned nt, unsigned vt, typename T, unsigned shared_size> +__device__ void cuda_reg_to_shared_thread( + cudaArray<T, vt> x, unsigned tid, T (&shared)[shared_size], bool sync = true +) { + + static_assert(shared_size >= nt * vt, + "reg_to_shared_thread must have at least nt * vt storage"); + + cuda_thread_iterate<vt>([&](auto i, auto j) { shared[j] = x[i]; }, tid); + + if(sync) __syncthreads(); +} + +template<unsigned nt, unsigned vt, typename T, unsigned shared_size> +__device__ auto cuda_shared_to_reg_thread( + const T (&shared)[shared_size], unsigned tid, bool sync = true +) { + + static_assert(shared_size >= nt * vt, + "reg_to_shared_thread must have at least nt * vt storage"); + + cudaArray<T, vt> x; + cuda_thread_iterate<vt>([&](auto i, auto j) { + x[i] = shared[j]; + }, tid); + + if(sync) __syncthreads(); + + return x; +} + +template<unsigned nt, unsigned vt, typename T, unsigned shared_size> +__device__ void cuda_reg_to_shared_strided( + cudaArray<T, vt> x, unsigned tid, T (&shared)[shared_size], bool sync = true +) { + + static_assert(shared_size >= nt * vt, + "reg_to_shared_strided must have at least nt * vt storage"); + + cuda_strided_iterate<nt, vt>( + [&](auto i, auto j) { shared[j] = x[i]; }, tid + ); + + if(sync) __syncthreads(); +} + +template<unsigned nt, unsigned vt, typename T, unsigned shared_size> +__device__ auto cuda_shared_to_reg_strided( + const T (&shared)[shared_size], unsigned tid, bool sync = true +) { + + static_assert(shared_size >= nt * vt, + "shared_to_reg_strided must have at least nt * vt storage"); + + cudaArray<T, vt> x; + cuda_strided_iterate<nt, vt>([&](auto i, auto j) { x[i] = shared[j]; }, tid); + if(sync) __syncthreads(); + + return x; +} + +template< + 
unsigned nt, unsigned vt, unsigned vt0 = vt, typename T, typename it_t, + unsigned shared_size +> +__device__ auto cuda_reg_to_mem_thread( + cudaArray<T, vt> x, unsigned tid, + unsigned count, it_t mem, T (&shared)[shared_size] +) { + cuda_reg_to_shared_thread<nt>(x, tid, shared); + auto y = cuda_shared_to_reg_strided<nt, vt>(shared, tid); + cuda_reg_to_mem_strided<nt, vt, vt0>(y, tid, count, mem); +} + +template< + unsigned nt, unsigned vt, unsigned vt0 = vt, typename T, typename it_t, + unsigned shared_size +> +__device__ auto cuda_mem_to_reg_thread( + it_t mem, unsigned tid, unsigned count, T (&shared)[shared_size] +) { + + auto x = cuda_mem_to_reg_strided<nt, vt, vt0>(mem, tid, count); + cuda_reg_to_shared_strided<nt, vt>(x, tid, shared); + auto y = cuda_shared_to_reg_thread<nt, vt>(shared, tid); + return y; +} + +template<unsigned nt, unsigned vt, typename T, unsigned S> +__device__ auto cuda_shared_gather( + const T(&data)[S], cudaArray<unsigned, vt> indices, bool sync = true +) { + + static_assert(S >= nt * vt, + "shared_gather must have at least nt * vt storage"); + + cudaArray<T, vt> x; + cuda_iterate<vt>([&](auto i) { x[i] = data[indices[i]]; }); + + if(sync) __syncthreads(); + + return x; +} + + + +// ---------------------------------------------------------------------------- +// reg<->reg +// ---------------------------------------------------------------------------- + +template<unsigned nt, unsigned vt, typename T, unsigned S> +__device__ auto cuda_reg_thread_to_strided( + cudaArray<T, vt> x, unsigned tid, T (&shared)[S] +) { + cuda_reg_to_shared_thread<nt>(x, tid, shared); + return cuda_shared_to_reg_strided<nt, vt>(shared, tid); +} + +template<unsigned nt, unsigned vt, typename T, unsigned S> +__device__ auto cuda_reg_strided_to_thread( + cudaArray<T, vt> x, unsigned tid, T (&shared)[S] +) { + cuda_reg_to_shared_strided<nt>(x, tid, shared); + return cuda_shared_to_reg_thread<nt, vt>(shared, tid); +} + +// ---------------------------------------------------------------------------- +// cudaLoadStoreIterator +// ---------------------------------------------------------------------------- + +template<typename L, typename S, typename T, typename I> +struct cudaLoadStoreIterator : std::iterator_traits<const T*> { + + L load; + S store; + I base; + + cudaLoadStoreIterator(L load_, S store_, I base_) : + load(load_), store(store_), base(base_) { } + + struct assign_t { + L load; + S store; + I index; + + __device__ assign_t& operator=(T rhs) { + static_assert(!std::is_same<S, cudaEmpty>::value, + "load_iterator is being stored to."); + store(rhs, index); + return *this; + } + __device__ operator T() const { + static_assert(!std::is_same<L, cudaEmpty>::value, + "store_iterator is being loaded from."); + return load(index); + } + }; + + __device__ assign_t operator[](I index) const { + return assign_t { load, store, base + index }; + } + + __device__ assign_t operator*() const { + return assign_t { load, store, base }; + } + + __device__ cudaLoadStoreIterator operator+(I offset) const { + cudaLoadStoreIterator cp = *this; + cp += offset; + return cp; + } + + __device__ cudaLoadStoreIterator& operator+=(I offset) { + base += offset; + return *this; + } + + __device__ cudaLoadStoreIterator operator-(I offset) const { + cudaLoadStoreIterator cp = *this; + cp -= offset; + return cp; + } + + __device__ cudaLoadStoreIterator& operator-=(I offset) { + base -= offset; + return *this; + } +}; + +//template<typename T> +//struct trivial_load_functor { +// template<typename I> +// __device__ 
T operator()(I index) const { +// return T(); +// } +//}; + +//template<typename T> +//struct trivial_store_functor { +// template<typename I> +// __device__ void operator()(T v, I index) const { } +//}; + +template <typename T, typename I = unsigned, typename L, typename S> +auto cuda_make_load_store_iterator(L load, S store, I base = 0) { + return cudaLoadStoreIterator<L, S, T, I>(load, store, base); +} + +template <typename T, typename I = unsigned, typename L> +auto cuda_make_load_iterator(L load, I base = 0) { + return cuda_make_load_store_iterator<T>(load, cudaEmpty(), base); +} + +template <typename T, typename I = unsigned, typename S> +auto cuda_make_store_iterator(S store, I base = 0) { + return cuda_make_load_store_iterator<T>(cudaEmpty(), store, base); +} + +// ---------------------------------------------------------------------------- +// swap +// ---------------------------------------------------------------------------- + +template<typename T> +__device__ void cuda_swap(T& a, T& b) { + auto c = a; + a = b; + b = c; +} + +// ---------------------------------------------------------------------------- +// launch kernel +// ---------------------------------------------------------------------------- + +template<typename F, typename... args_t> +__global__ void cuda_kernel(F f, args_t... args) { + f(threadIdx.x, blockIdx.x, args...); +} + +// ---------------------------------------------------------------------------- +// operators +// ---------------------------------------------------------------------------- + +template <class T> +struct cuda_plus{ + __device__ T operator()(T a, T b) const { return a + b; } +}; + + template <class T> +struct cuda_minus{ + __device__ T operator()(T a, T b) const { return a - b; } +}; + +template <class T> +struct cuda_multiplies{ + __device__ T operator()(T a, T b) const { return a * b; } +}; + +template <class T> +struct cuda_maximum{ + __device__ T operator()(T a, T b) const { return a > b ? a : b; } +}; + +template <class T> +struct cuda_minimum{ + __device__ T operator()(T a, T b) const { return a < b ? a : b; } +}; + +template <class T> +struct cuda_less{ + __device__ T operator()(T a, T b) const { return a < b; } +}; + +template <class T> +struct cuda_greater{ + __device__ T operator()(T a, T b) const { return a > b; } +}; + +} // end of namespace tf ----------------------------------------------------- diff --git a/myxpcs/include/taskflow_/cuda/cuda_object.hpp b/myxpcs/include/taskflow_/cuda/cuda_object.hpp new file mode 100644 index 0000000..e30d3a5 --- /dev/null +++ b/myxpcs/include/taskflow_/cuda/cuda_object.hpp @@ -0,0 +1,287 @@ +#pragma once + +#include "cuda_error.hpp" + +namespace tf { + +/** +@brief per-thread object pool to manage CUDA device object + +@tparam H object type +@tparam C function object to create a library object +@tparam D function object to delete a library object + +A CUDA device object has a lifetime associated with a device, +for example, @c cudaStream_t, @c cublasHandle_t, etc. +Creating a device object is typically expensive (e.g., 10-200 ms) +and destroying it may trigger implicit device synchronization. +For applications tha intensively make use of device objects, +it is desirable to reuse them as much as possible. + +There exists an one-to-one relationship between CUDA devices in CUDA Runtime API +and CUcontexts in the CUDA Driver API within a process. +The specific context which the CUDA Runtime API uses for a device +is called the device's primary context. 
+From the perspective of the CUDA Runtime API, +a device and its primary context are synonymous. + +We design the device object pool in a decentralized fashion by keeping +(1) a global pool to keep track of potentially usable objects and +(2) a per-thread pool to footprint objects with shared ownership. +The global pool does not own the object and therefore does not destruct any of them. +The per-thread pool keeps the footprints of objects with shared ownership +and will destruct them if the thread holds the last reference count after it joins. +The motivation of this decentralized control is to avoid device objects +from being destroyed while the context had been destroyed due to driver shutdown. + +*/ +template <typename H, typename C, typename D> +class cudaPerThreadDeviceObjectPool { + + public: + + /** + @brief structure to store a context object + */ + struct Object { + + int device; + H value; + + Object(int); + ~Object(); + + Object(const Object&) = delete; + Object(Object&&) = delete; + }; + + private: + + // Master thread hold the storage to the pool. + // Due to some ordering, cuda context may be destroyed when the master + // program thread destroys the cuda object. + // Therefore, we use a decentralized approach to let child thread + // destroy cuda objects while the master thread only keeps a weak reference + // to those objects for reuse. + struct cudaGlobalDeviceObjectPool { + + std::shared_ptr<Object> acquire(int); + void release(int, std::weak_ptr<Object>); + + std::mutex mutex; + std::unordered_map<int, std::vector<std::weak_ptr<Object>>> pool; + }; + + public: + + /** + @brief default constructor + */ + cudaPerThreadDeviceObjectPool() = default; + + /** + @brief acquires a device object with shared ownership + */ + std::shared_ptr<Object> acquire(int); + + /** + @brief releases a device object with moved ownership + */ + void release(std::shared_ptr<Object>&&); + + /** + @brief queries the number of device objects with shared ownership + */ + size_t footprint_size() const; + + private: + + inline static cudaGlobalDeviceObjectPool _shared_pool; + + std::unordered_set<std::shared_ptr<Object>> _footprint; +}; + +// ---------------------------------------------------------------------------- +// cudaPerThreadDeviceObject::cudaHanale definition +// ---------------------------------------------------------------------------- + +template <typename H, typename C, typename D> +cudaPerThreadDeviceObjectPool<H, C, D>::Object::Object(int d) : + device {d} { + cudaScopedDevice ctx(device); + value = C{}(); +} + +template <typename H, typename C, typename D> +cudaPerThreadDeviceObjectPool<H, C, D>::Object::~Object() { + cudaScopedDevice ctx(device); + D{}(value); +} + +// ---------------------------------------------------------------------------- +// cudaPerThreadDeviceObject::cudaHanaldePool definition +// ---------------------------------------------------------------------------- + +template <typename H, typename C, typename D> +std::shared_ptr<typename cudaPerThreadDeviceObjectPool<H, C, D>::Object> +cudaPerThreadDeviceObjectPool<H, C, D>::cudaGlobalDeviceObjectPool::acquire(int d) { + std::scoped_lock<std::mutex> lock(mutex); + if(auto itr = pool.find(d); itr != pool.end()) { + while(!itr->second.empty()) { + auto sptr = itr->second.back().lock(); + itr->second.pop_back(); + if(sptr) { + return sptr; + } + } + } + return nullptr; +} + +template <typename H, typename C, typename D> +void cudaPerThreadDeviceObjectPool<H, C, D>::cudaGlobalDeviceObjectPool::release( + int d, 
std::weak_ptr<Object> ptr +) { + std::scoped_lock<std::mutex> lock(mutex); + pool[d].push_back(ptr); +} + +// ---------------------------------------------------------------------------- +// cudaPerThreadDeviceObject definition +// ---------------------------------------------------------------------------- + +template <typename H, typename C, typename D> +std::shared_ptr<typename cudaPerThreadDeviceObjectPool<H, C, D>::Object> +cudaPerThreadDeviceObjectPool<H, C, D>::acquire(int d) { + + auto ptr = _shared_pool.acquire(d); + + if(!ptr) { + ptr = std::make_shared<Object>(d); + } + + return ptr; +} + +template <typename H, typename C, typename D> +void cudaPerThreadDeviceObjectPool<H, C, D>::release( + std::shared_ptr<Object>&& ptr +) { + _shared_pool.release(ptr->device, ptr); + _footprint.insert(std::move(ptr)); +} + +template <typename H, typename C, typename D> +size_t cudaPerThreadDeviceObjectPool<H, C, D>::footprint_size() const { + return _footprint.size(); +} + +// ---------------------------------------------------------------------------- +// cudaObject +// ---------------------------------------------------------------------------- + +/** +@class cudaObject + +@brief class to create an RAII-styled and move-only wrapper for CUDA objects +*/ +template <typename T, typename C, typename D> +class cudaObject { + + public: + + /** + @brief constructs a CUDA object from the given one + */ + explicit cudaObject(T obj) : object(obj) {} + + /** + @brief constructs a new CUDA object + */ + cudaObject() : object{ C{}() } {} + + /** + @brief disabled copy constructor + */ + cudaObject(const cudaObject&) = delete; + + /** + @brief move constructor + */ + cudaObject(cudaObject&& rhs) : object{rhs.object} { + rhs.object = nullptr; + } + + /** + @brief destructs the CUDA object + */ + ~cudaObject() { D{}(object); } + + /** + @brief disabled copy assignment + */ + cudaObject& operator = (const cudaObject&) = delete; + + /** + @brief move assignment + */ + cudaObject& operator = (cudaObject&& rhs) { + D {} (object); + object = rhs.object; + rhs.object = nullptr; + return *this; + } + + /** + @brief implicit conversion to the native CUDA stream (cudaObject_t) + + Returns the underlying stream of type @c cudaObject_t. 
+ */ + operator T () const { + return object; + } + + /** + @brief deletes the current CUDA object (if any) and creates a new one + */ + void create() { + D {} (object); + object = C{}(); + } + + /** + @brief resets this CUDA object to the given one + */ + void reset(T new_obj) { + D {} (object); + object = new_obj; + } + + /** + @brief deletes the current CUDA object + */ + void clear() { + reset(nullptr); + } + + /** + @brief releases the ownership of the CUDA object + */ + T release() { + auto tmp = object; + object = nullptr; + return tmp; + } + + protected: + + /** + @brief the CUDA object + */ + T object; +}; + +} // end of namespace tf ----------------------------------------------------- + + + diff --git a/myxpcs/include/taskflow_/cuda/cuda_optimizer.hpp b/myxpcs/include/taskflow_/cuda/cuda_optimizer.hpp new file mode 100644 index 0000000..60efed1 --- /dev/null +++ b/myxpcs/include/taskflow_/cuda/cuda_optimizer.hpp @@ -0,0 +1,404 @@ +#pragma once + +#include "cuda_graph.hpp" + +/** +@file cuda_optimizer.hpp +@brief %cudaFlow capturing algorithms include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// cudaFlowOptimizerBase +// ---------------------------------------------------------------------------- + +/** +@private + +@brief class to provide helper common methods for optimization algorithms +*/ +class cudaFlowOptimizerBase { + + protected: + + std::vector<cudaFlowNode*> _toposort(cudaFlowGraph&); + std::vector<std::vector<cudaFlowNode*>> _levelize(cudaFlowGraph&); +}; + +// Function: _toposort +inline std::vector<cudaFlowNode*> cudaFlowOptimizerBase::_toposort(cudaFlowGraph& graph) { + + std::vector<cudaFlowNode*> res; + std::queue<cudaFlowNode*> bfs; + + res.reserve(graph._nodes.size()); + + // insert the first level of nodes into the queue + for(auto& u : graph._nodes) { + + auto hu = std::get_if<cudaFlowNode::Capture>(&u->_handle); + hu->level = u->_dependents.size(); + + if(hu->level == 0) { + bfs.push(u.get()); + } + } + + // levelize the graph using bfs + while(!bfs.empty()) { + + auto u = bfs.front(); + bfs.pop(); + + res.push_back(u); + + for(auto v : u->_successors) { + auto hv = std::get_if<cudaFlowNode::Capture>(&v->_handle); + if(--hv->level == 0) { + bfs.push(v); + } + } + } + + return res; +} + +// Function: _levelize +inline std::vector<std::vector<cudaFlowNode*>> +cudaFlowOptimizerBase::_levelize(cudaFlowGraph& graph) { + + std::queue<cudaFlowNode*> bfs; + + size_t max_level = 0; + + // insert the first level of nodes into the queue + for(auto& u : graph._nodes) { + + auto hu = std::get_if<cudaFlowNode::Capture>(&u->_handle); + hu->level = u->_dependents.size(); + + if(hu->level == 0) { + bfs.push(u.get()); + } + } + + // levelize the graph using bfs + while(!bfs.empty()) { + + auto u = bfs.front(); + bfs.pop(); + + auto hu = std::get_if<cudaFlowNode::Capture>(&u->_handle); + + for(auto v : u->_successors) { + auto hv = std::get_if<cudaFlowNode::Capture>(&v->_handle); + if(--hv->level == 0) { + hv->level = hu->level + 1; + if(hv->level > max_level) { + max_level = hv->level; + } + bfs.push(v); + } + } + } + + // set level_graph and each node's idx + std::vector<std::vector<cudaFlowNode*>> level_graph(max_level+1); + for(auto& u : graph._nodes) { + auto hu = std::get_if<cudaFlowNode::Capture>(&u->_handle); + hu->lid = level_graph[hu->level].size(); + level_graph[hu->level].emplace_back(u.get()); + + //for(auto s : u->_successors) { + // assert(hu.level < 
std::get_if<cudaFlowNode::Capture>(&s->_handle)->level); + //} + } + + return level_graph; +} + +// ---------------------------------------------------------------------------- +// class definition: cudaFlowSequentialOptimizer +// ---------------------------------------------------------------------------- + +/** +@class cudaFlowSequentialOptimizer + +@brief class to capture a CUDA graph using a sequential stream + +A sequential capturing algorithm finds a topological order of +the described graph and captures dependent GPU tasks using a single stream. +All GPU tasks run sequentially without breaking inter dependencies. +*/ +class cudaFlowSequentialOptimizer : public cudaFlowOptimizerBase { + + friend class cudaFlowCapturer; + + public: + + /** + @brief constructs a sequential optimizer + */ + cudaFlowSequentialOptimizer() = default; + + private: + + cudaGraph_t _optimize(cudaFlowGraph& graph); +}; + +inline cudaGraph_t cudaFlowSequentialOptimizer::_optimize(cudaFlowGraph& graph) { + + // acquire per-thread stream and turn it into capture mode + // we must use ThreadLocal mode to avoid clashing with CUDA global states + + cudaStream stream; + + stream.begin_capture(cudaStreamCaptureModeThreadLocal); + + auto ordered = _toposort(graph); + for(auto node : ordered) { + std::get_if<cudaFlowNode::Capture>(&node->_handle)->work(stream); + } + + return stream.end_capture(); +} + +// ---------------------------------------------------------------------------- +// class definition: cudaFlowLinearOptimizer +// ---------------------------------------------------------------------------- + +/** +@class cudaFlowLinearOptimizer + +@brief class to capture a linear CUDA graph using a sequential stream + +A linear capturing algorithm is a special case of tf::cudaFlowSequentialOptimizer +and assumes the input task graph to be a single linear chain of tasks +(i.e., a straight line). +This assumption allows faster optimization during the capturing process. +If the input task graph is not a linear chain, the behavior is undefined. +*/ +class cudaFlowLinearOptimizer : public cudaFlowOptimizerBase { + + friend class cudaFlowCapturer; + + public: + + /** + @brief constructs a linear optimizer + */ + cudaFlowLinearOptimizer() = default; + + private: + + cudaGraph_t _optimize(cudaFlowGraph& graph); +}; + +inline cudaGraph_t cudaFlowLinearOptimizer::_optimize(cudaFlowGraph& graph) { + + // acquire per-thread stream and turn it into capture mode + // we must use ThreadLocal mode to avoid clashing with CUDA global states + cudaStream stream; + + stream.begin_capture(cudaStreamCaptureModeThreadLocal); + + // find the source node + cudaFlowNode* src {nullptr}; + for(auto& u : graph._nodes) { + if(u->_dependents.size() == 0) { + src = u.get(); + while(src) { + std::get_if<cudaFlowNode::Capture>(&src->_handle)->work(stream); + src = src->_successors.empty() ? nullptr : src->_successors[0]; + } + break; + } + // ideally, there should be only one source + } + + return stream.end_capture(); +} + +// ---------------------------------------------------------------------------- +// class definition: cudaFlowRoundRobinOptimizer +// ---------------------------------------------------------------------------- + +/** +@class cudaFlowRoundRobinOptimizer + +@brief class to capture a CUDA graph using a round-robin algorithm + +A round-robin capturing algorithm levelizes the user-described graph +and assign streams to nodes in a round-robin order level by level. 
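+
+A minimal usage sketch (illustrative only; it assumes the
+tf::cudaFlowCapturer::make_optimizer interface from cuda_capturer.hpp and a
+hypothetical kernel launcher @c invoke_custom_kernel_with_stream):
+
+@code{.cpp}
+taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
+  // capture with the round-robin optimizer over 4 streams
+  capturer.make_optimizer<tf::cudaFlowRoundRobinOptimizer>(4);
+  capturer.on([&](cudaStream_t stream){
+    invoke_custom_kernel_with_stream(stream, custom_arguments);
+  });
+});
+@endcode
+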
+The algorithm is based on the following paper published in Euro-Par 2021: + + Dian-Lun Lin and Tsung-Wei Huang, "Efficient GPU Computation using %Task Graph Parallelism," <i>European Conference on Parallel and Distributed Computing (Euro-Par)</i>, 2021 + +The round-robin optimization algorithm is best suited for large %cudaFlow graphs +that compose hundreds of or thousands of GPU operations +(e.g., kernels and memory copies) with many of them being able to run in parallel. +You can configure the number of streams to the optimizer to adjust the +maximum kernel currency in the captured CUDA graph. +*/ +class cudaFlowRoundRobinOptimizer : public cudaFlowOptimizerBase { + + friend class cudaFlowCapturer; + + public: + + /** + @brief constructs a round-robin optimizer with 4 streams by default + */ + cudaFlowRoundRobinOptimizer() = default; + + /** + @brief constructs a round-robin optimizer with the given number of streams + */ + explicit cudaFlowRoundRobinOptimizer(size_t num_streams); + + /** + @brief queries the number of streams used by the optimizer + */ + size_t num_streams() const; + + /** + @brief sets the number of streams used by the optimizer + */ + void num_streams(size_t n); + + private: + + size_t _num_streams {4}; + + cudaGraph_t _optimize(cudaFlowGraph& graph); + + void _reset(std::vector<std::vector<cudaFlowNode*>>& graph); + +}; + +// Constructor +inline cudaFlowRoundRobinOptimizer::cudaFlowRoundRobinOptimizer(size_t num_streams) : + _num_streams {num_streams} { + + if(num_streams == 0) { + TF_THROW("number of streams must be at least one"); + } +} + +// Function: num_streams +inline size_t cudaFlowRoundRobinOptimizer::num_streams() const { + return _num_streams; +} + +// Procedure: num_streams +inline void cudaFlowRoundRobinOptimizer::num_streams(size_t n) { + if(n == 0) { + TF_THROW("number of streams must be at least one"); + } + _num_streams = n; +} + +inline void cudaFlowRoundRobinOptimizer::_reset( + std::vector<std::vector<cudaFlowNode*>>& graph +) { + //level == global id + //idx == stream id we want to skip + size_t id{0}; + for(auto& each_level: graph) { + for(auto& node: each_level) { + auto hn = std::get_if<cudaFlowNode::Capture>(&node->_handle); + hn->level = id++; + hn->idx = _num_streams; + hn->event = nullptr; + } + } +} + +// Function: _optimize +inline cudaGraph_t cudaFlowRoundRobinOptimizer::_optimize(cudaFlowGraph& graph) { + + // levelize the graph + auto levelized = _levelize(graph); + + // initialize the data structure + _reset(levelized); + + // begin to capture + std::vector<cudaStream> streams(_num_streams); + + streams[0].begin_capture(cudaStreamCaptureModeThreadLocal); + + // reserve space for scoped events + std::vector<cudaEvent> events; + events.reserve((_num_streams >> 1) + levelized.size()); + + // fork + cudaEvent_t fork_event = events.emplace_back(); + streams[0].record(fork_event); + + for(size_t i = 1; i < streams.size(); ++i) { + streams[i].wait(fork_event); + } + + // assign streams to levelized nodes in a round-robin manner + for(auto& each_level: levelized) { + for(auto& node: each_level) { + auto hn = std::get_if<cudaFlowNode::Capture>(&node->_handle); + size_t sid = hn->lid % _num_streams; + + //wait events + cudaFlowNode* wait_node{nullptr}; + for(auto& pn: node->_dependents) { + auto phn = std::get_if<cudaFlowNode::Capture>(&pn->_handle); + size_t psid = phn->lid % _num_streams; + + //level == global id + //idx == stream id we want to skip + if(psid == hn->idx) { + if(wait_node == nullptr || + 
std::get_if<cudaFlowNode::Capture>(&wait_node->_handle)->level < phn->level) { + wait_node = pn; + } + } + else if(psid != sid) { + streams[sid].wait(phn->event); + } + } + + if(wait_node != nullptr) { + assert(std::get_if<cudaFlowNode::Capture>(&wait_node->_handle)->event); + streams[sid].wait(std::get_if<cudaFlowNode::Capture>(&wait_node->_handle)->event); + } + + //capture + hn->work(streams[sid]); + + //create/record stream + for(auto& sn: node->_successors) { + auto shn = std::get_if<cudaFlowNode::Capture>(&sn->_handle); + size_t ssid = shn->lid % _num_streams; + if(ssid != sid) { + if(!hn->event) { + hn->event = events.emplace_back(); + streams[sid].record(hn->event); + } + //idx == stream id we want to skip + shn->idx = sid; + } + } + } + } + + // join + for(size_t i=1; i<_num_streams; ++i) { + cudaEvent_t join_event = events.emplace_back(); + streams[i].record(join_event); + streams[0].wait(join_event); + } + + return streams[0].end_capture(); +} + + +} // end of namespace tf ----------------------------------------------------- + diff --git a/myxpcs/include/taskflow_/cuda/cuda_stream.hpp b/myxpcs/include/taskflow_/cuda/cuda_stream.hpp new file mode 100644 index 0000000..f3e48f1 --- /dev/null +++ b/myxpcs/include/taskflow_/cuda/cuda_stream.hpp @@ -0,0 +1,226 @@ +#pragma once + +#include "cuda_object.hpp" + +/** +@file cuda_stream.hpp +@brief CUDA stream utilities include file +*/ + +namespace tf { + + + +// ---------------------------------------------------------------------------- +// cudaStream +// ---------------------------------------------------------------------------- + +/** +@private +*/ +struct cudaStreamCreator { + cudaStream_t operator () () const { + cudaStream_t stream; + TF_CHECK_CUDA(cudaStreamCreate(&stream), "failed to create a CUDA stream"); + return stream; + } +}; + +/** +@private +*/ +struct cudaStreamDeleter { + void operator () (cudaStream_t stream) const { + if(stream) { + cudaStreamDestroy(stream); + } + } +}; + +/** +@class cudaStream + +@brief class to create an RAII-styled wrapper over a native CUDA stream + +A cudaStream object is an RAII-styled wrapper over a native CUDA stream +(@c cudaStream_t). +A cudaStream object is move-only. +*/ +class cudaStream : + + public cudaObject <cudaStream_t, cudaStreamCreator, cudaStreamDeleter> { + + public: + + /** + @brief constructs an RAII-styled object from the given CUDA stream + + Constructs a cudaStream object which owns @c stream. + */ + explicit cudaStream(cudaStream_t stream) : cudaObject(stream) { + } + + /** + @brief default constructor + */ + cudaStream() = default; + + /** + @brief synchronizes the associated stream + + Equivalently calling @c cudaStreamSynchronize to block + until this stream has completed all operations. + */ + void synchronize() const { + TF_CHECK_CUDA( + cudaStreamSynchronize(object), "failed to synchronize a CUDA stream" + ); + } + + /** + @brief begins graph capturing on the stream + + When a stream is in capture mode, all operations pushed into the stream + will not be executed, but will instead be captured into a graph, + which will be returned via cudaStream::end_capture. + + A thread's mode can be one of the following: + + @c cudaStreamCaptureModeGlobal: This is the default mode. 
+ If the local thread has an ongoing capture sequence that was not initiated + with @c cudaStreamCaptureModeRelaxed at @c cuStreamBeginCapture, + or if any other thread has a concurrent capture sequence initiated with + @c cudaStreamCaptureModeGlobal, this thread is prohibited from potentially + unsafe API calls. + + + @c cudaStreamCaptureModeThreadLocal: If the local thread has an ongoing capture + sequence not initiated with @c cudaStreamCaptureModeRelaxed, + it is prohibited from potentially unsafe API calls. + Concurrent capture sequences in other threads are ignored. + + + @c cudaStreamCaptureModeRelaxed: The local thread is not prohibited + from potentially unsafe API calls. Note that the thread is still prohibited + from API calls which necessarily conflict with stream capture, for example, + attempting @c cudaEventQuery on an event that was last recorded + inside a capture sequence. + */ + void begin_capture(cudaStreamCaptureMode m = cudaStreamCaptureModeGlobal) const { + TF_CHECK_CUDA( + cudaStreamBeginCapture(object, m), + "failed to begin capture on stream ", object, " with thread mode ", m + ); + } + + /** + @brief ends graph capturing on the stream + + Equivalently calling @c cudaStreamEndCapture to + end capture on stream and returning the captured graph. + Capture must have been initiated on stream via a call to cudaStream::begin_capture. + If capture was invalidated, due to a violation of the rules of stream capture, + then a NULL graph will be returned. + */ + cudaGraph_t end_capture() const { + cudaGraph_t native_g; + TF_CHECK_CUDA( + cudaStreamEndCapture(object, &native_g), + "failed to end capture on stream ", object + ); + return native_g; + } + + /** + @brief records an event on the stream + + Equivalently calling @c cudaEventRecord to record an event on this stream, + both of which must be on the same CUDA context. + */ + void record(cudaEvent_t event) const { + TF_CHECK_CUDA( + cudaEventRecord(event, object), + "failed to record event ", event, " on stream ", object + ); + } + + /** + @brief waits on an event + + Equivalently calling @c cudaStreamWaitEvent to make all future work + submitted to stream wait for all work captured in event. + */ + void wait(cudaEvent_t event) const { + TF_CHECK_CUDA( + cudaStreamWaitEvent(object, event, 0), + "failed to wait for event ", event, " on stream ", object + ); + } +}; + +// ---------------------------------------------------------------------------- +// cudaEvent +// ---------------------------------------------------------------------------- + +/** +@private +*/ +struct cudaEventCreator { + + cudaEvent_t operator () () const { + cudaEvent_t event; + TF_CHECK_CUDA(cudaEventCreate(&event), "failed to create a CUDA event"); + return event; + } + + cudaEvent_t operator () (unsigned int flag) const { + cudaEvent_t event; + TF_CHECK_CUDA( + cudaEventCreateWithFlags(&event, flag), + "failed to create a CUDA event with flag=", flag + ); + return event; + } +}; + +/** +@private +*/ +struct cudaEventDeleter { + void operator () (cudaEvent_t event) const { + cudaEventDestroy(event); + } +}; + +/** +@class cudaEvent + +@brief class to create an RAII-styled wrapper over a native CUDA event + +A cudaEvent object is an RAII-styled wrapper over a native CUDA event +(@c cudaEvent_t). +A cudaEvent object is move-only. 
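+
+A short illustrative sketch (assuming two existing tf::cudaStream objects
+@c s1 and @c s2):
+
+@code{.cpp}
+tf::cudaEvent e;   // creates a native cudaEvent_t via cudaEventCreator
+s1.record(e);      // record the event on stream s1
+s2.wait(e);        // make future work on stream s2 wait for the event
+@endcode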
+*/ +class cudaEvent : + public cudaObject<cudaEvent_t, cudaEventCreator, cudaEventDeleter> { + + public: + + /** + @brief constructs an RAII-styled CUDA event object from the given CUDA event + */ + explicit cudaEvent(cudaEvent_t event) : cudaObject(event) { } + + /** + @brief constructs an RAII-styled CUDA event object + */ + cudaEvent() = default; + + /** + @brief constructs an RAII-styled CUDA event object with the given flag + */ + explicit cudaEvent(unsigned int flag) : cudaObject(cudaEventCreator{}(flag)) { } +}; + + +} // end of namespace tf ----------------------------------------------------- + + + diff --git a/myxpcs/include/taskflow_/cuda/cuda_task.hpp b/myxpcs/include/taskflow_/cuda/cuda_task.hpp new file mode 100644 index 0000000..92fac9c --- /dev/null +++ b/myxpcs/include/taskflow_/cuda/cuda_task.hpp @@ -0,0 +1,274 @@ +#pragma once + +#include "cuda_graph.hpp" + +/** +@file cuda_task.hpp +@brief cudaTask include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// cudaTask Types +// ---------------------------------------------------------------------------- + +/** +@enum cudaTaskType + +@brief enumeration of all %cudaTask types +*/ +enum class cudaTaskType : int { + /** @brief empty task type */ + EMPTY = 0, + /** @brief host task type */ + HOST, + /** @brief memory set task type */ + MEMSET, + /** @brief memory copy task type */ + MEMCPY, + /** @brief memory copy task type */ + KERNEL, + /** @brief subflow (child graph) task type */ + SUBFLOW, + /** @brief capture task type */ + CAPTURE, + /** @brief undefined task type */ + UNDEFINED +}; + +/** +@brief convert a cuda_task type to a human-readable string +*/ +constexpr const char* to_string(cudaTaskType type) { + switch(type) { + case cudaTaskType::EMPTY: return "empty"; + case cudaTaskType::HOST: return "host"; + case cudaTaskType::MEMSET: return "memset"; + case cudaTaskType::MEMCPY: return "memcpy"; + case cudaTaskType::KERNEL: return "kernel"; + case cudaTaskType::SUBFLOW: return "subflow"; + case cudaTaskType::CAPTURE: return "capture"; + default: return "undefined"; + } +} + +// ---------------------------------------------------------------------------- +// cudaTask +// ---------------------------------------------------------------------------- + +/** +@class cudaTask + +@brief class to create a task handle over an internal node of a %cudaFlow graph +*/ +class cudaTask { + + friend class cudaFlow; + friend class cudaFlowCapturer; + friend class cudaFlowCapturerBase; + + friend std::ostream& operator << (std::ostream&, const cudaTask&); + + public: + + /** + @brief constructs an empty cudaTask + */ + cudaTask() = default; + + /** + @brief copy-constructs a cudaTask + */ + cudaTask(const cudaTask&) = default; + + /** + @brief copy-assigns a cudaTask + */ + cudaTask& operator = (const cudaTask&) = default; + + /** + @brief adds precedence links from this to other tasks + + @tparam Ts parameter pack + + @param tasks one or multiple tasks + + @return @c *this + */ + template <typename... Ts> + cudaTask& precede(Ts&&... tasks); + + /** + @brief adds precedence links from other tasks to this + + @tparam Ts parameter pack + + @param tasks one or multiple tasks + + @return @c *this + */ + template <typename... Ts> + cudaTask& succeed(Ts&&... 
tasks); + + /** + @brief assigns a name to the task + + @param name a @std_string acceptable string + + @return @c *this + */ + cudaTask& name(const std::string& name); + + /** + @brief queries the name of the task + */ + const std::string& name() const; + + /** + @brief queries the number of successors + */ + size_t num_successors() const; + + /** + @brief queries the number of dependents + */ + size_t num_dependents() const; + + /** + @brief queries if the task is associated with a cudaFlowNode + */ + bool empty() const; + + /** + @brief queries the task type + */ + cudaTaskType type() const; + + /** + @brief dumps the task through an output stream + + @tparam T output stream type with insertion operator (<<) defined + @param ostream an output stream target + */ + template <typename T> + void dump(T& ostream) const; + + /** + @brief applies an visitor callable to each successor of the task + */ + template <typename V> + void for_each_successor(V&& visitor) const; + + /** + @brief applies an visitor callable to each dependents of the task + */ + template <typename V> + void for_each_dependent(V&& visitor) const; + + private: + + cudaTask(cudaFlowNode*); + + cudaFlowNode* _node {nullptr}; +}; + +// Constructor +inline cudaTask::cudaTask(cudaFlowNode* node) : _node {node} { +} + +// Function: precede +template <typename... Ts> +cudaTask& cudaTask::precede(Ts&&... tasks) { + (_node->_precede(tasks._node), ...); + return *this; +} + +// Function: succeed +template <typename... Ts> +cudaTask& cudaTask::succeed(Ts&&... tasks) { + (tasks._node->_precede(_node), ...); + return *this; +} + +// Function: empty +inline bool cudaTask::empty() const { + return _node == nullptr; +} + +// Function: name +inline cudaTask& cudaTask::name(const std::string& name) { + _node->_name = name; + return *this; +} + +// Function: name +inline const std::string& cudaTask::name() const { + return _node->_name; +} + +// Function: num_successors +inline size_t cudaTask::num_successors() const { + return _node->_successors.size(); +} + +// Function: num_dependents +inline size_t cudaTask::num_dependents() const { + return _node->_dependents.size(); +} + +// Function: type +inline cudaTaskType cudaTask::type() const { + switch(_node->_handle.index()) { + case cudaFlowNode::EMPTY: return cudaTaskType::EMPTY; + case cudaFlowNode::HOST: return cudaTaskType::HOST; + case cudaFlowNode::MEMSET: return cudaTaskType::MEMSET; + case cudaFlowNode::MEMCPY: return cudaTaskType::MEMCPY; + case cudaFlowNode::KERNEL: return cudaTaskType::KERNEL; + case cudaFlowNode::SUBFLOW: return cudaTaskType::SUBFLOW; + case cudaFlowNode::CAPTURE: return cudaTaskType::CAPTURE; + default: return cudaTaskType::UNDEFINED; + } +} + +// Procedure: dump +template <typename T> +void cudaTask::dump(T& os) const { + os << "cudaTask "; + if(_node->_name.empty()) os << _node; + else os << _node->_name; + os << " [type=" << to_string(type()) << ']'; +} + +// Function: for_each_successor +template <typename V> +void cudaTask::for_each_successor(V&& visitor) const { + for(size_t i=0; i<_node->_successors.size(); ++i) { + visitor(cudaTask(_node->_successors[i])); + } +} + +// Function: for_each_dependent +template <typename V> +void cudaTask::for_each_dependent(V&& visitor) const { + for(size_t i=0; i<_node->_dependents.size(); ++i) { + visitor(cudaTask(_node->_dependents[i])); + } +} + +// ---------------------------------------------------------------------------- +// global ostream +// 
---------------------------------------------------------------------------- + +/** +@brief overload of ostream inserter operator for cudaTask +*/ +inline std::ostream& operator << (std::ostream& os, const cudaTask& ct) { + ct.dump(os); + return os; +} + +} // end of namespace tf ----------------------------------------------------- + + + diff --git a/myxpcs/include/taskflow_/cuda/cudaflow.hpp b/myxpcs/include/taskflow_/cuda/cudaflow.hpp new file mode 100644 index 0000000..61d5c84 --- /dev/null +++ b/myxpcs/include/taskflow_/cuda/cudaflow.hpp @@ -0,0 +1,1024 @@ +#pragma once + +#include "../taskflow.hpp" +#include "cuda_task.hpp" +#include "cuda_capturer.hpp" + +/** +@file taskflow/cuda/cudaflow.hpp +@brief cudaFlow include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// class definition: cudaFlow +// ---------------------------------------------------------------------------- + +/** +@class cudaFlow + +@brief class to create a %cudaFlow task dependency graph + +A %cudaFlow is a high-level interface over CUDA Graph to perform GPU operations +using the task dependency graph model. +The class provides a set of methods for creating and launch different tasks +on one or multiple CUDA devices, +for instance, kernel tasks, data transfer tasks, and memory operation tasks. +The following example creates a %cudaFlow of two kernel tasks, @c task1 and +@c task2, where @c task1 runs before @c task2. + +@code{.cpp} +tf::Taskflow taskflow; +tf::Executor executor; + +taskflow.emplace([&](tf::cudaFlow& cf){ + // create two kernel tasks + tf::cudaTask task1 = cf.kernel(grid1, block1, shm_size1, kernel1, args1); + tf::cudaTask task2 = cf.kernel(grid2, block2, shm_size2, kernel2, args2); + + // kernel1 runs before kernel2 + task1.precede(task2); +}); + +executor.run(taskflow).wait(); +@endcode + +A %cudaFlow is a task (tf::Task) created from tf::Taskflow +and will be run by @em one worker thread in the executor. +That is, the callable that describes a %cudaFlow +will be executed sequentially. +Inside a %cudaFlow task, different GPU tasks (tf::cudaTask) may run +in parallel scheduled by the CUDA runtime. + +Please refer to @ref GPUTaskingcudaFlow for details. +*/ +class cudaFlow { + + public: + + /** + @brief constructs a %cudaFlow + */ + cudaFlow(); + + /** + @brief destroys the %cudaFlow and its associated native CUDA graph + and executable graph + */ + ~cudaFlow() = default; + + /** + @brief default move constructor + */ + cudaFlow(cudaFlow&&) = default; + + /** + @brief default move assignment operator + */ + cudaFlow& operator = (cudaFlow&&) = default; + + /** + @brief queries the emptiness of the graph + */ + bool empty() const; + + /** + @brief queries the number of tasks + */ + size_t num_tasks() const; + + /** + @brief clears the %cudaFlow object + */ + void clear(); + + /** + @brief dumps the %cudaFlow graph into a DOT format through an + output stream + */ + void dump(std::ostream& os) const; + + /** + @brief dumps the native CUDA graph into a DOT format through an + output stream + + The native CUDA graph may be different from the upper-level %cudaFlow + graph when flow capture is involved. 
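+
+  A brief usage sketch (illustrative; @c my_kernel denotes a hypothetical
+  no-argument kernel):
+
+  @code{.cpp}
+  taskflow.emplace([](tf::cudaFlow& cf){
+    cf.kernel(dim3(1), dim3(32), 0, my_kernel);
+    cf.dump_native_graph(std::cout);  // DOT dump of the native CUDA graph
+  });
+  @endcode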
+ */ + void dump_native_graph(std::ostream& os) const; + + // ------------------------------------------------------------------------ + // Graph building routines + // ------------------------------------------------------------------------ + + /** + @brief creates a no-operation task + + @return a tf::cudaTask handle + + An empty node performs no operation during execution, + but can be used for transitive ordering. + For example, a phased execution graph with 2 groups of @c n nodes + with a barrier between them can be represented using an empty node + and @c 2*n dependency edges, + rather than no empty node and @c n^2 dependency edges. + */ + cudaTask noop(); + + /** + @brief creates a host task that runs a callable on the host + + @tparam C callable type + + @param callable a callable object with neither arguments nor return + (i.e., constructible from @c std::function<void()>) + + @return a tf::cudaTask handle + + A host task can only execute CPU-specific functions and cannot do any CUDA calls + (e.g., @c cudaMalloc). + */ + template <typename C> + cudaTask host(C&& callable); + + /** + @brief updates parameters of a host task + + The method is similar to tf::cudaFlow::host but operates on a task + of type tf::cudaTaskType::HOST. + */ + template <typename C> + void host(cudaTask task, C&& callable); + + /** + @brief creates a kernel task + + @tparam F kernel function type + @tparam ArgsT kernel function parameters type + + @param g configured grid + @param b configured block + @param s configured shared memory size in bytes + @param f kernel function + @param args arguments to forward to the kernel function by copy + + @return a tf::cudaTask handle + */ + template <typename F, typename... ArgsT> + cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args); + + /** + @brief updates parameters of a kernel task + + The method is similar to tf::cudaFlow::kernel but operates on a task + of type tf::cudaTaskType::KERNEL. + The kernel function name must NOT change. + */ + template <typename F, typename... ArgsT> + void kernel( + cudaTask task, dim3 g, dim3 b, size_t shm, F f, ArgsT... args + ); + + /** + @brief creates a memset task that fills untyped data with a byte value + + @param dst pointer to the destination device memory area + @param v value to set for each byte of specified memory + @param count size in bytes to set + + @return a tf::cudaTask handle + + A memset task fills the first @c count bytes of device memory area + pointed by @c dst with the byte value @c v. + */ + cudaTask memset(void* dst, int v, size_t count); + + /** + @brief updates parameters of a memset task + + The method is similar to tf::cudaFlow::memset but operates on a task + of type tf::cudaTaskType::MEMSET. + The source/destination memory may have different address values but + must be allocated from the same contexts as the original + source/destination memory. + */ + void memset(cudaTask task, void* dst, int ch, size_t count); + + /** + @brief creates a memcpy task that copies untyped data in bytes + + @param tgt pointer to the target memory block + @param src pointer to the source memory block + @param bytes bytes to copy + + @return a tf::cudaTask handle + + A memcpy task transfers @c bytes of data from a source location + to a target location. Direction can be arbitrary among CPUs and GPUs. 
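+
+  For example (an illustrative sketch; @c d_dst and @c d_src are hypothetical
+  device buffers):
+
+  @code{.cpp}
+  // copy 1024 floats between two device buffers
+  tf::cudaTask t = cf.memcpy(d_dst, d_src, 1024 * sizeof(float));
+  @endcode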
+ */ + cudaTask memcpy(void* tgt, const void* src, size_t bytes); + + /** + @brief updates parameters of a memcpy task + + The method is similar to tf::cudaFlow::memcpy but operates on a task + of type tf::cudaTaskType::MEMCPY. + The source/destination memory may have different address values but + must be allocated from the same contexts as the original + source/destination memory. + */ + void memcpy(cudaTask task, void* tgt, const void* src, size_t bytes); + + /** + @brief creates a memset task that sets a typed memory block to zero + + @tparam T element type (size of @c T must be either 1, 2, or 4) + @param dst pointer to the destination device memory area + @param count number of elements + + @return a tf::cudaTask handle + + A zero task zeroes the first @c count elements of type @c T + in a device memory area pointed by @c dst. + */ + template <typename T, std::enable_if_t< + is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr + > + cudaTask zero(T* dst, size_t count); + + /** + @brief updates parameters of a memset task to a zero task + + The method is similar to tf::cudaFlow::zero but operates on + a task of type tf::cudaTaskType::MEMSET. + + The source/destination memory may have different address values but + must be allocated from the same contexts as the original + source/destination memory. + */ + template <typename T, std::enable_if_t< + is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr + > + void zero(cudaTask task, T* dst, size_t count); + + /** + @brief creates a memset task that fills a typed memory block with a value + + @tparam T element type (size of @c T must be either 1, 2, or 4) + + @param dst pointer to the destination device memory area + @param value value to fill for each element of type @c T + @param count number of elements + + @return a tf::cudaTask handle + + A fill task fills the first @c count elements of type @c T with @c value + in a device memory area pointed by @c dst. + The value to fill is interpreted in type @c T rather than byte. + */ + template <typename T, std::enable_if_t< + is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr + > + cudaTask fill(T* dst, T value, size_t count); + + /** + @brief updates parameters of a memset task to a fill task + + The method is similar to tf::cudaFlow::fill but operates on a task + of type tf::cudaTaskType::MEMSET. + + The source/destination memory may have different address values but + must be allocated from the same contexts as the original + source/destination memory. + */ + template <typename T, std::enable_if_t< + is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr + > + void fill(cudaTask task, T* dst, T value, size_t count); + + /** + @brief creates a memcopy task that copies typed data + + @tparam T element type (non-void) + + @param tgt pointer to the target memory block + @param src pointer to the source memory block + @param num number of elements to copy + + @return a tf::cudaTask handle + + A copy task transfers <tt>num*sizeof(T)</tt> bytes of data from a source location + to a target location. Direction can be arbitrary among CPUs and GPUs. + */ + template <typename T, + std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr + > + cudaTask copy(T* tgt, const T* src, size_t num); + + /** + @brief updates parameters of a memcpy task to a copy task + + The method is similar to tf::cudaFlow::copy but operates on a task + of type tf::cudaTaskType::MEMCPY. 
+ The source/destination memory may have different address values but + must be allocated from the same contexts as the original + source/destination memory. + */ + template <typename T, + std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr + > + void copy(cudaTask task, T* tgt, const T* src, size_t num); + + // ------------------------------------------------------------------------ + // run method + // ------------------------------------------------------------------------ + /** + @brief offloads the %cudaFlow onto a GPU asynchronously via a stream + + @param stream stream for performing this operation + + Offloads the present %cudaFlow onto a GPU asynchronously via + the given stream. + + An offloaded %cudaFlow forces the underlying graph to be instantiated. + After the instantiation, you should not modify the graph topology + but update node parameters. + */ + void run(cudaStream_t stream); + + /** + @brief acquires a reference to the underlying CUDA graph + */ + cudaGraph_t native_graph(); + + /** + @brief acquires a reference to the underlying CUDA graph executable + */ + cudaGraphExec_t native_executable(); + + // ------------------------------------------------------------------------ + // generic algorithms + // ------------------------------------------------------------------------ + + /** + @brief runs a callable with only a single kernel thread + + @tparam C callable type + + @param c callable to run by a single kernel thread + + @return a tf::cudaTask handle + */ + template <typename C> + cudaTask single_task(C c); + + /** + @brief updates a single-threaded kernel task + + This method is similar to cudaFlow::single_task but operates + on an existing task. + */ + template <typename C> + void single_task(cudaTask task, C c); + + /** + @brief applies a callable to each dereferenced element of the data array + + @tparam I iterator type + @tparam C callable type + + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + @param callable a callable object to apply to the dereferenced iterator + + @return a tf::cudaTask handle + + This method is equivalent to the parallel execution of the following loop on a GPU: + + @code{.cpp} + for(auto itr = first; itr != last; itr++) { + callable(*itr); + } + @endcode + */ + template <typename I, typename C> + cudaTask for_each(I first, I last, C callable); + + /** + @brief updates parameters of a kernel task created from + tf::cudaFlow::for_each + + The type of the iterators and the callable must be the same as + the task created from tf::cudaFlow::for_each. 
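+
+  For instance (an illustrative sketch; @c data and @c data2 are hypothetical
+  device arrays of @c N floats):
+
+  @code{.cpp}
+  auto op = [] __device__ (float& x) { x += 1.0f; };
+  tf::cudaTask t = cf.for_each(data, data + N, op);
+  // after offloading, rebind the task to another range of the same types
+  cf.for_each(t, data2, data2 + N, op);
+  @endcode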
+  */
+  template <typename I, typename C>
+  void for_each(cudaTask task, I first, I last, C callable);
+
+  /**
+  @brief applies a callable to each index in the range with the step size
+
+  @tparam I index type
+  @tparam C callable type
+
+  @param first beginning index
+  @param last last index
+  @param step step size
+  @param callable the callable to apply to each element in the data array
+
+  @return a tf::cudaTask handle
+
+  This method is equivalent to the parallel execution of the following loop on a GPU:
+
+  @code{.cpp}
+  // step is positive [first, last)
+  for(auto i=first; i<last; i+=step) {
+    callable(i);
+  }
+
+  // step is negative [first, last)
+  for(auto i=first; i>last; i+=step) {
+    callable(i);
+  }
+  @endcode
+  */
+  template <typename I, typename C>
+  cudaTask for_each_index(I first, I last, I step, C callable);
+
+  /**
+  @brief updates parameters of a kernel task created from
+  tf::cudaFlow::for_each_index
+
+  The type of the iterators and the callable must be the same as
+  the task created from tf::cudaFlow::for_each_index.
+  */
+  template <typename I, typename C>
+  void for_each_index(
+    cudaTask task, I first, I last, I step, C callable
+  );
+
+  /**
+  @brief applies a callable to a source range and stores the result in a target range
+
+  @tparam I input iterator type
+  @tparam O output iterator type
+  @tparam C unary operator type
+
+  @param first iterator to the beginning of the input range
+  @param last iterator to the end of the input range
+  @param output iterator to the beginning of the output range
+  @param op the operator to apply to transform each element in the range
+
+  @return a tf::cudaTask handle
+
+  This method is equivalent to the parallel execution of the following loop on a GPU:
+
+  @code{.cpp}
+  while (first != last) {
+    *output++ = callable(*first++);
+  }
+  @endcode
+  */
+  template <typename I, typename O, typename C>
+  cudaTask transform(I first, I last, O output, C op);
+
+  /**
+  @brief updates parameters of a kernel task created from
+  tf::cudaFlow::transform
+
+  The type of the iterators and the callable must be the same as
+  the task created from tf::cudaFlow::transform.
+  */
+  template <typename I, typename O, typename C>
+  void transform(cudaTask task, I first, I last, O output, C c);
+
+  /**
+  @brief creates a task to perform parallel transforms over two ranges of items
+
+  @tparam I1 first input iterator type
+  @tparam I2 second input iterator type
+  @tparam O output iterator type
+  @tparam C binary operator type
+
+  @param first1 iterator to the beginning of the first input range
+  @param last1 iterator to the end of the first input range
+  @param first2 iterator to the beginning of the second input range
+  @param output iterator to the beginning of the output range
+  @param op binary operator to apply to transform each pair of items in the
+  two input ranges
+
+  @return cudaTask handle
+
+  This method is equivalent to the parallel execution of the following loop on a GPU:
+
+  @code{.cpp}
+  while (first1 != last1) {
+    *output++ = op(*first1++, *first2++);
+  }
+  @endcode
+  */
+  template <typename I1, typename I2, typename O, typename C>
+  cudaTask transform(I1 first1, I1 last1, I2 first2, O output, C op);
+
+  /**
+  @brief updates parameters of a kernel task created from
+  tf::cudaFlow::transform
+
+  The type of the iterators and the callable must be the same as
+  the task created from tf::cudaFlow::transform.
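+
+  An illustrative sketch (with hypothetical device ranges @c a, @c b, and
+  @c out of @c N floats each):
+
+  @code{.cpp}
+  auto op = [] __device__ (float x, float y) { return x + y; };
+  tf::cudaTask t = cf.transform(a, a + N, b, out, op);
+  // after offloading, rebind the task to different buffers of the same types
+  cf.transform(t, a2, a2 + N, b2, out2, op);
+  @endcode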
+ */ + template <typename I1, typename I2, typename O, typename C> + void transform( + cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c + ); + + // ------------------------------------------------------------------------ + // subflow + // ------------------------------------------------------------------------ + + /** + @brief constructs a subflow graph through tf::cudaFlowCapturer + + @tparam C callable type constructible from + @c std::function<void(tf::cudaFlowCapturer&)> + + @param callable the callable to construct a capture flow + + @return a tf::cudaTask handle + + A captured subflow forms a sub-graph to the %cudaFlow and can be used to + capture custom (or third-party) kernels that cannot be directly constructed + from the %cudaFlow. + + Example usage: + + @code{.cpp} + taskflow.emplace([&](tf::cudaFlow& cf){ + + tf::cudaTask my_kernel = cf.kernel(my_arguments); + + // create a flow capturer to capture custom kernels + tf::cudaTask my_subflow = cf.capture([&](tf::cudaFlowCapturer& capturer){ + capturer.on([&](cudaStream_t stream){ + invoke_custom_kernel_with_stream(stream, custom_arguments); + }); + }); + + my_kernel.precede(my_subflow); + }); + @endcode + */ + template <typename C> + cudaTask capture(C&& callable); + + /** + @brief updates the captured child graph + + The method is similar to tf::cudaFlow::capture but operates on a task + of type tf::cudaTaskType::SUBFLOW. + The new captured graph must be topologically identical to the original + captured graph. + */ + template <typename C> + void capture(cudaTask task, C callable); + + private: + + cudaFlowGraph _cfg; + cudaGraphExec _exe {nullptr}; +}; + +// Construct a standalone cudaFlow +inline cudaFlow::cudaFlow() { + _cfg._native_handle.create(); +} + +// Procedure: clear +inline void cudaFlow::clear() { + _exe.clear(); + _cfg.clear(); + _cfg._native_handle.create(); +} + +// Function: empty +inline bool cudaFlow::empty() const { + return _cfg._nodes.empty(); +} + +// Function: num_tasks +inline size_t cudaFlow::num_tasks() const { + return _cfg._nodes.size(); +} + +// Procedure: dump +inline void cudaFlow::dump(std::ostream& os) const { + _cfg.dump(os, nullptr, ""); +} + +// Procedure: dump +inline void cudaFlow::dump_native_graph(std::ostream& os) const { + cuda_dump_graph(os, _cfg._native_handle); +} + +// ---------------------------------------------------------------------------- +// Graph building methods +// ---------------------------------------------------------------------------- + +// Function: noop +inline cudaTask cudaFlow::noop() { + + auto node = _cfg.emplace_back( + _cfg, std::in_place_type_t<cudaFlowNode::Empty>{} + ); + + TF_CHECK_CUDA( + cudaGraphAddEmptyNode( + &node->_native_handle, _cfg._native_handle, nullptr, 0 + ), + "failed to create a no-operation (empty) node" + ); + + return cudaTask(node); +} + +// Function: host +template <typename C> +cudaTask cudaFlow::host(C&& c) { + + auto node = _cfg.emplace_back( + _cfg, std::in_place_type_t<cudaFlowNode::Host>{}, std::forward<C>(c) + ); + + auto h = std::get_if<cudaFlowNode::Host>(&node->_handle); + + cudaHostNodeParams p; + p.fn = cudaFlowNode::Host::callback; + p.userData = h; + + TF_CHECK_CUDA( + cudaGraphAddHostNode( + &node->_native_handle, _cfg._native_handle, nullptr, 0, &p + ), + "failed to create a host node" + ); + + return cudaTask(node); +} + +// Function: kernel +template <typename F, typename... ArgsT> +cudaTask cudaFlow::kernel( + dim3 g, dim3 b, size_t s, F f, ArgsT... 
args +) { + + auto node = _cfg.emplace_back( + _cfg, std::in_place_type_t<cudaFlowNode::Kernel>{}, (void*)f + ); + + cudaKernelNodeParams p; + void* arguments[sizeof...(ArgsT)] = { (void*)(&args)... }; + p.func = (void*)f; + p.gridDim = g; + p.blockDim = b; + p.sharedMemBytes = s; + p.kernelParams = arguments; + p.extra = nullptr; + + TF_CHECK_CUDA( + cudaGraphAddKernelNode( + &node->_native_handle, _cfg._native_handle, nullptr, 0, &p + ), + "failed to create a kernel task" + ); + + return cudaTask(node); +} + +// Function: zero +template <typename T, std::enable_if_t< + is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* +> +cudaTask cudaFlow::zero(T* dst, size_t count) { + + auto node = _cfg.emplace_back( + _cfg, std::in_place_type_t<cudaFlowNode::Memset>{} + ); + + auto p = cuda_get_zero_parms(dst, count); + + TF_CHECK_CUDA( + cudaGraphAddMemsetNode( + &node->_native_handle, _cfg._native_handle, nullptr, 0, &p + ), + "failed to create a memset (zero) task" + ); + + return cudaTask(node); +} + +// Function: fill +template <typename T, std::enable_if_t< + is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* +> +cudaTask cudaFlow::fill(T* dst, T value, size_t count) { + + auto node = _cfg.emplace_back( + _cfg, std::in_place_type_t<cudaFlowNode::Memset>{} + ); + + auto p = cuda_get_fill_parms(dst, value, count); + + TF_CHECK_CUDA( + cudaGraphAddMemsetNode( + &node->_native_handle, _cfg._native_handle, nullptr, 0, &p + ), + "failed to create a memset (fill) task" + ); + + return cudaTask(node); +} + +// Function: copy +template < + typename T, + std::enable_if_t<!std::is_same_v<T, void>, void>* +> +cudaTask cudaFlow::copy(T* tgt, const T* src, size_t num) { + + auto node = _cfg.emplace_back( + _cfg, std::in_place_type_t<cudaFlowNode::Memcpy>{} + ); + + auto p = cuda_get_copy_parms(tgt, src, num); + + TF_CHECK_CUDA( + cudaGraphAddMemcpyNode( + &node->_native_handle, _cfg._native_handle, nullptr, 0, &p + ), + "failed to create a memcpy (copy) task" + ); + + return cudaTask(node); +} + +// Function: memset +inline cudaTask cudaFlow::memset(void* dst, int ch, size_t count) { + + auto node = _cfg.emplace_back( + _cfg, std::in_place_type_t<cudaFlowNode::Memset>{} + ); + + auto p = cuda_get_memset_parms(dst, ch, count); + + TF_CHECK_CUDA( + cudaGraphAddMemsetNode( + &node->_native_handle, _cfg._native_handle, nullptr, 0, &p + ), + "failed to create a memset task" + ); + + return cudaTask(node); +} + +// Function: memcpy +inline cudaTask cudaFlow::memcpy(void* tgt, const void* src, size_t bytes) { + + auto node = _cfg.emplace_back( + _cfg, std::in_place_type_t<cudaFlowNode::Memcpy>{} + ); + + auto p = cuda_get_memcpy_parms(tgt, src, bytes); + + TF_CHECK_CUDA( + cudaGraphAddMemcpyNode( + &node->_native_handle, _cfg._native_handle, nullptr, 0, &p + ), + "failed to create a memcpy task" + ); + + return cudaTask(node); +} + +// ------------------------------------------------------------------------ +// update methods +// ------------------------------------------------------------------------ + +// Function: host +template <typename C> +void cudaFlow::host(cudaTask task, C&& c) { + + if(task.type() != cudaTaskType::HOST) { + TF_THROW(task, " is not a host task"); + } + + auto h = std::get_if<cudaFlowNode::Host>(&task._node->_handle); + + h->func = std::forward<C>(c); +} + +// Function: update kernel parameters +template <typename F, typename... ArgsT> +void cudaFlow::kernel( + cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT... 
args +) { + + if(task.type() != cudaTaskType::KERNEL) { + TF_THROW(task, " is not a kernel task"); + } + + cudaKernelNodeParams p; + + void* arguments[sizeof...(ArgsT)] = { (void*)(&args)... }; + p.func = (void*)f; + p.gridDim = g; + p.blockDim = b; + p.sharedMemBytes = s; + p.kernelParams = arguments; + p.extra = nullptr; + + TF_CHECK_CUDA( + cudaGraphExecKernelNodeSetParams(_exe, task._node->_native_handle, &p), + "failed to update kernel parameters on ", task + ); +} + +// Function: update copy parameters +template <typename T, std::enable_if_t<!std::is_same_v<T, void>, void>*> +void cudaFlow::copy(cudaTask task, T* tgt, const T* src, size_t num) { + + if(task.type() != cudaTaskType::MEMCPY) { + TF_THROW(task, " is not a memcpy task"); + } + + auto p = cuda_get_copy_parms(tgt, src, num); + + TF_CHECK_CUDA( + cudaGraphExecMemcpyNodeSetParams(_exe, task._node->_native_handle, &p), + "failed to update memcpy parameters on ", task + ); +} + +// Function: update memcpy parameters +inline void cudaFlow::memcpy( + cudaTask task, void* tgt, const void* src, size_t bytes +) { + + if(task.type() != cudaTaskType::MEMCPY) { + TF_THROW(task, " is not a memcpy task"); + } + + auto p = cuda_get_memcpy_parms(tgt, src, bytes); + + TF_CHECK_CUDA( + cudaGraphExecMemcpyNodeSetParams(_exe, task._node->_native_handle, &p), + "failed to update memcpy parameters on ", task + ); +} + +// Procedure: memset +inline void cudaFlow::memset(cudaTask task, void* dst, int ch, size_t count) { + + if(task.type() != cudaTaskType::MEMSET) { + TF_THROW(task, " is not a memset task"); + } + + auto p = cuda_get_memset_parms(dst, ch, count); + + TF_CHECK_CUDA( + cudaGraphExecMemsetNodeSetParams(_exe, task._node->_native_handle, &p), + "failed to update memset parameters on ", task + ); +} + +// Procedure: fill +template <typename T, std::enable_if_t< + is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* +> +void cudaFlow::fill(cudaTask task, T* dst, T value, size_t count) { + + if(task.type() != cudaTaskType::MEMSET) { + TF_THROW(task, " is not a memset task"); + } + + auto p = cuda_get_fill_parms(dst, value, count); + + TF_CHECK_CUDA( + cudaGraphExecMemsetNodeSetParams(_exe, task._node->_native_handle, &p), + "failed to update memset parameters on ", task + ); +} + +// Procedure: zero +template <typename T, std::enable_if_t< + is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* +> +void cudaFlow::zero(cudaTask task, T* dst, size_t count) { + + if(task.type() != cudaTaskType::MEMSET) { + TF_THROW(task, " is not a memset task"); + } + + auto p = cuda_get_zero_parms(dst, count); + + TF_CHECK_CUDA( + cudaGraphExecMemsetNodeSetParams(_exe, task._node->_native_handle, &p), + "failed to update memset parameters on ", task + ); +} + +// Function: capture +template <typename C> +void cudaFlow::capture(cudaTask task, C c) { + + if(task.type() != cudaTaskType::SUBFLOW) { + TF_THROW(task, " is not a subflow task"); + } + + // insert a subflow node + // construct a captured flow from the callable + auto node_handle = std::get_if<cudaFlowNode::Subflow>(&task._node->_handle); + //node_handle->graph.clear(); + + cudaFlowCapturer capturer; + c(capturer); + + // obtain the optimized captured graph + capturer._cfg._native_handle.reset(capturer.capture()); + node_handle->cfg = std::move(capturer._cfg); + + TF_CHECK_CUDA( + cudaGraphExecChildGraphNodeSetParams( + _exe, + task._node->_native_handle, + node_handle->cfg._native_handle + ), + "failed to update a captured child graph" + ); +} + +// 
---------------------------------------------------------------------------- +// captured flow +// ---------------------------------------------------------------------------- + +// Function: capture +template <typename C> +cudaTask cudaFlow::capture(C&& c) { + + // insert a subflow node + auto node = _cfg.emplace_back( + _cfg, std::in_place_type_t<cudaFlowNode::Subflow>{} + ); + + // construct a captured flow from the callable + auto node_handle = std::get_if<cudaFlowNode::Subflow>(&node->_handle); + + // perform capturing + cudaFlowCapturer capturer; + c(capturer); + + // obtain the optimized captured graph + capturer._cfg._native_handle.reset(capturer.capture()); + + // move capturer's cudaFlow graph into node + node_handle->cfg = std::move(capturer._cfg); + + TF_CHECK_CUDA( + cudaGraphAddChildGraphNode( + &node->_native_handle, + _cfg._native_handle, + nullptr, + 0, + node_handle->cfg._native_handle + ), + "failed to add a cudaFlow capturer task" + ); + + return cudaTask(node); +} + +// ---------------------------------------------------------------------------- +// run method +// ---------------------------------------------------------------------------- + +// Procedure: run +inline void cudaFlow::run(cudaStream_t stream) { + if(!_exe) { + _exe.instantiate(_cfg._native_handle); + } + _exe.launch(stream); + _cfg._state = cudaFlowGraph::OFFLOADED; +} + +// Function: native_cfg +inline cudaGraph_t cudaFlow::native_graph() { + return _cfg._native_handle; +} + +// Function: native_executable +inline cudaGraphExec_t cudaFlow::native_executable() { + return _exe; +} + +} // end of namespace tf ----------------------------------------------------- + + diff --git a/myxpcs/include/taskflow_/dsl/connection.hpp b/myxpcs/include/taskflow_/dsl/connection.hpp new file mode 100644 index 0000000..e4dad72 --- /dev/null +++ b/myxpcs/include/taskflow_/dsl/connection.hpp @@ -0,0 +1,53 @@ +// 2020/08/28 - Created by netcan: https://github.com/netcan +#pragma once +#include "../core/flow_builder.hpp" +#include "task_trait.hpp" +#include "tuple_utils.hpp" +#include "type_list.hpp" + +namespace tf { +namespace dsl { +template <typename F, typename T> class Connection { + using FROMs = typename TaskTrait<F>::TaskList; + using TOs = typename TaskTrait<T>::TaskList; + +public: + using FromTaskList = Unique_t<Flatten_t<FROMs>>; + using ToTaskList = Unique_t<Flatten_t<TOs>>; +}; + +template <typename T, typename OUT = TypeList<>> struct Chain; + +template <typename F, typename OUT> struct Chain<auto (*)(F)->void, OUT> { + using From = F; + using type = OUT; +}; + +template <typename F, typename T, typename OUT> +struct Chain<auto (*)(F)->T, OUT> { +private: + using To = typename Chain<T, OUT>::From; + +public: + using From = F; + using type = typename Chain< + T, typename OUT::template appendTo<Connection<From, To>>>::type; +}; + +template <typename FROM, typename TO> struct OneToOneLink { + template <typename TasksCB> struct InstanceType { + constexpr void build(TasksCB &tasksCb) { + constexpr size_t TasksCBSize = std::tuple_size<TasksCB>::value; + constexpr size_t FromTaskIndex = + TupleElementByF_v<TasksCB, IsTask<FROM>::template apply>; + constexpr size_t ToTaskIndex = + TupleElementByF_v<TasksCB, IsTask<TO>::template apply>; + static_assert(FromTaskIndex < TasksCBSize && ToTaskIndex < TasksCBSize, + "fatal: not find TaskCb in TasksCB"); + std::get<FromTaskIndex>(tasksCb).task_.precede( + std::get<ToTaskIndex>(tasksCb).task_); + } + }; +}; +} // namespace dsl +}; // namespace tf diff --git 
a/myxpcs/include/taskflow_/dsl/dsl.hpp b/myxpcs/include/taskflow_/dsl/dsl.hpp new file mode 100644 index 0000000..e4130e8 --- /dev/null +++ b/myxpcs/include/taskflow_/dsl/dsl.hpp @@ -0,0 +1,13 @@ +// TaskflowDSL is an experimental project that leverages C++17 to +// provide a dedicated interface for expressive taskflow programming +// +// Created by netcan: https://github.com/netcan + +#pragma once + +#include "dsl/task_dsl.hpp" + +namespace tf { + + +} // end of namespace tf ----------------------------------------------------- diff --git a/myxpcs/include/taskflow_/dsl/meta_macro.hpp b/myxpcs/include/taskflow_/dsl/meta_macro.hpp new file mode 100644 index 0000000..758bf68 --- /dev/null +++ b/myxpcs/include/taskflow_/dsl/meta_macro.hpp @@ -0,0 +1,72 @@ +// 2020/08/30 - Created by netcan: https://github.com/netcan +// ref https://github.com/Erlkoenig90/map-macro/ +#pragma once +#ifdef _MSC_VER +#define TF_EMPTY() +#define TF_GET_ARG_COUNT_(...) \ + TF_PASTE(TF_GET_ARG_COUNT_I(__VA_ARGS__, 64, 63, 62, 61, 60, 59, 58, 57, 56, \ + 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, \ + 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, \ + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, \ + 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, \ + 6, 5, 4, 3, 2, 1, 0, ), \ + TF_EMPTY()) + +#else +#define TF_GET_ARG_COUNT_(...) \ + TF_GET_ARG_COUNT_I(__VA_ARGS__, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, \ + 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, \ + 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, \ + 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, \ + 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, ) +#endif + +#define TF_GET_ARG_COUNT(...) TF_GET_ARG_COUNT_(__dummy__, ##__VA_ARGS__) +#define TF_GET_ARG_COUNT_I( \ + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, \ + e17, e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, \ + e32, e33, e34, e35, e36, e37, e38, e39, e40, e41, e42, e43, e44, e45, e46, \ + e47, e48, e49, e50, e51, e52, e53, e54, e55, e56, e57, e58, e59, e60, e61, \ + e62, e63, e64, size, ...) \ + size + +#define TF_GET_FIRST(a, ...) a +#define TF_GET_SECOND(a, b, ...) b +#define TF_CONCATE(x, y) x##y +#define TF_PASTE(x, y) TF_CONCATE(x, y) + +#define TF_EVAL0(...) __VA_ARGS__ +#define TF_EVAL1(...) TF_EVAL0(TF_EVAL0(TF_EVAL0(__VA_ARGS__))) +#define TF_EVAL2(...) TF_EVAL1(TF_EVAL1(TF_EVAL1(__VA_ARGS__))) +#define TF_EVAL3(...) TF_EVAL2(TF_EVAL2(TF_EVAL2(__VA_ARGS__))) +#define TF_EVAL4(...) TF_EVAL3(TF_EVAL3(TF_EVAL3(__VA_ARGS__))) +#define TF_EVAL5(...) TF_EVAL4(TF_EVAL4(TF_EVAL4(__VA_ARGS__))) + +#ifdef _MSC_VER +// MSVC needs more evaluations +#define TF_EVAL6(...) TF_EVAL5(TF_EVAL5(TF_EVAL5(__VA_ARGS__))) +#define TF_EVAL(...) TF_EVAL6(TF_EVAL6(__VA_ARGS__)) +#else +#define TF_EVAL(...) TF_EVAL5(__VA_ARGS__) +#endif + +#define TF_MAP_END(...) +#define TF_MAP_OUT + +#define EMPTY() +#define DEFER(id) id EMPTY() + +#define TF_MAP_GET_END2() 0, TF_MAP_END +#define TF_MAP_GET_END1(...) TF_MAP_GET_END2 +#define TF_MAP_GET_END(...) TF_MAP_GET_END1 +#define TF_MAP_NEXT0(test, next, ...) next TF_MAP_OUT +#define TF_MAP_NEXT1(test, next) DEFER(TF_MAP_NEXT0)(test, next, 0) +#define TF_MAP_NEXT(test, next) TF_MAP_NEXT1(TF_MAP_GET_END test, next) + +#define TF_MAP0(f, x, peek, ...) \ + f(x) DEFER(TF_MAP_NEXT(peek, TF_MAP1))(f, peek, __VA_ARGS__) +#define TF_MAP1(f, x, peek, ...) \ + f(x) DEFER(TF_MAP_NEXT(peek, TF_MAP0))(f, peek, __VA_ARGS__) + +#define TF_MAP(f, ...) 
\ + TF_EVAL(TF_MAP1(f, __VA_ARGS__, ()()(), ()()(), ()()(), 0)) diff --git a/myxpcs/include/taskflow_/dsl/task_analyzer.hpp b/myxpcs/include/taskflow_/dsl/task_analyzer.hpp new file mode 100644 index 0000000..295c50b --- /dev/null +++ b/myxpcs/include/taskflow_/dsl/task_analyzer.hpp @@ -0,0 +1,40 @@ +// 2020/08/28 - Created by netcan: https://github.com/netcan +#pragma once +#include "connection.hpp" +#include "type_list.hpp" +#include <type_traits> + +namespace tf { +namespace dsl { +template <typename... Links> class TaskAnalyzer { + template <typename FROMs, typename TOs, typename = void> + struct BuildOneToOneLink; + + template <typename... Fs, typename Ts> + struct BuildOneToOneLink<TypeList<Fs...>, Ts> { + using type = Concat_t<typename BuildOneToOneLink<Fs, Ts>::type...>; + }; + + template <typename F, typename... Ts> + struct BuildOneToOneLink<F, TypeList<Ts...>, + std::enable_if_t<!IsTypeList_v<F>>> { + using type = TypeList<OneToOneLink<F, Ts>...>; + }; + + template <typename Link> class OneToOneLinkSetF { + using FromTaskList = typename Link::FromTaskList; + using ToTaskList = typename Link::ToTaskList; + + public: + using type = typename BuildOneToOneLink<FromTaskList, ToTaskList>::type; + }; + +public: + using AllTasks = Unique_t< + Concat_t<typename Links::FromTaskList..., typename Links::ToTaskList...>>; + using OneToOneLinkSet = + Unique_t<Flatten_t<Map_t<TypeList<Links...>, OneToOneLinkSetF>>>; +}; + +} // namespace dsl +} // namespace tf diff --git a/myxpcs/include/taskflow_/dsl/task_dsl.hpp b/myxpcs/include/taskflow_/dsl/task_dsl.hpp new file mode 100644 index 0000000..9b362cf --- /dev/null +++ b/myxpcs/include/taskflow_/dsl/task_dsl.hpp @@ -0,0 +1,104 @@ +// 2020/08/28 - Created by netcan: https://github.com/netcan +#pragma once +#include "../core/flow_builder.hpp" +#include "meta_macro.hpp" +#include "task_analyzer.hpp" +#include "task_trait.hpp" + +namespace tf { +namespace dsl { +struct EmptyContext {}; +template <typename CONTEXT = EmptyContext, typename... Chains> class TaskDsl { + using Links = Unique_t<Flatten_t<TypeList<typename Chain<Chains>::type...>>>; + using Analyzer = typename Links::template exportTo<TaskAnalyzer>; + + using AllTasks = typename Analyzer::AllTasks; + + template <typename TASK> struct TaskCbWithContext { + using type = TaskCb<TASK, CONTEXT>; + }; + using TasksCB = + typename Map_t<AllTasks, + TaskCbWithContext>::template exportTo<std::tuple>; + + using OneToOneLinkSet = typename Analyzer::OneToOneLinkSet; + template <typename OneToOneLink> struct OneToOneLinkInstanceType { + using type = typename OneToOneLink::template InstanceType<TasksCB>; + }; + using OneToOneLinkInstances = + typename Map_t<OneToOneLinkSet, + OneToOneLinkInstanceType>::template exportTo<std::tuple>; + +public: + constexpr TaskDsl(FlowBuilder &flow_builder, const CONTEXT &context = {}) { + build_tasks_cb(flow_builder, context, + std::make_index_sequence<AllTasks::size>{}); + build_links(std::make_index_sequence<OneToOneLinkSet::size>{}); + } + + template <typename TASK> Task &get_task() { + constexpr size_t TasksCBSize = std::tuple_size<TasksCB>::value; + constexpr size_t TaskIndex = + TupleElementByF_v<TasksCB, IsTask<TASK>::template apply>; + static_assert(TaskIndex < TasksCBSize, "fatal: not find TaskCb in TasksCB"); + return std::get<TaskIndex>(tasksCb_).task_; + } + +private: + template <size_t... 
Is> + void build_tasks_cb(FlowBuilder &flow_builder, const CONTEXT &context, + std::index_sequence<Is...>) { + auto _ = {0, (std::get<Is>(tasksCb_).build(flow_builder, context), 0)...}; + (void)_; + } + + template <size_t... Is> void build_links(std::index_sequence<Is...>) { + auto _ = {0, (std::get<Is>(links_).build(tasksCb_), 0)...}; + (void)_; + } + +private: + TasksCB tasksCb_; + OneToOneLinkInstances links_; +}; + +template <typename = void, typename... Chains, typename CONTEXT = EmptyContext> +constexpr TaskDsl<CONTEXT, Chains...> taskDsl(FlowBuilder &flow_builder, + CONTEXT &&context = {}) { + return {flow_builder, context}; +} + +} // namespace dsl +} // namespace tf + +/////////////////////////////////////////////////////////////////////////////// +#define TF_CHAIN(link) , link->void +#define TF_CONTEXT_1(name) tf::dsl::EmptyContext +#define TF_CONTEXT_2(name, context) context +#define TF_CAPTURE_THIS_1 +#define TF_CAPTURE_THIS_2 *this + +/////////////////////////////////////////////////////////////////////////////// +// make_task(TASK_NAME, { return a action lambda }) +#define make_task(name, ...) \ + struct TF_GET_FIRST name : tf::dsl::TaskSignature, \ + TF_PASTE(TF_CONTEXT_, TF_GET_ARG_COUNT name) \ + name { \ + using _ContextType = TF_PASTE(TF_CONTEXT_, TF_GET_ARG_COUNT name) name; \ + TF_GET_FIRST name(const _ContextType &context) : _ContextType(context) {} \ + auto operator()() { \ + return [TF_PASTE(TF_CAPTURE_THIS_, TF_GET_ARG_COUNT name)] __VA_ARGS__; \ + } \ + } + +// some_tasks(A, B, C) means SomeTask +#define some_tasks(...) auto (*)(tf::dsl::SomeTask<__VA_ARGS__>) +// same as some_tasks +#define fork_tasks(...) some_tasks(__VA_ARGS__) +// same as some_tasks +#define merge_tasks(...) some_tasks(__VA_ARGS__) +// task(A) means a task A +#define task(Task) auto (*)(Task) +// taskbuild(...) build a task dsl graph +#define build_taskflow(...) tf::dsl::taskDsl<void TF_MAP(TF_CHAIN, __VA_ARGS__)> + diff --git a/myxpcs/include/taskflow_/dsl/task_trait.hpp b/myxpcs/include/taskflow_/dsl/task_trait.hpp new file mode 100644 index 0000000..bc8eeb6 --- /dev/null +++ b/myxpcs/include/taskflow_/dsl/task_trait.hpp @@ -0,0 +1,46 @@ +// 2020/08/28 - Created by netcan: https://github.com/netcan +#pragma once +#include "../core/flow_builder.hpp" +#include "../core/task.hpp" +#include "type_list.hpp" +#include <type_traits> + +namespace tf { +namespace dsl { +struct TaskSignature {}; + +template <typename TASK, typename CONTEXT> struct TaskCb { + using TaskType = TASK; + void build(FlowBuilder &build, const CONTEXT &context) { + task_ = build.emplace(TaskType{context}()); + } + + Task task_; +}; + +template <typename TASK> struct IsTask { + template <typename TaskCb> struct apply { + constexpr static bool value = + std::is_same<typename TaskCb::TaskType, TASK>::value; + }; +}; + +template <typename TASK, typename = void> struct TaskTrait; + +template <typename... TASK> struct SomeTask { + using TaskList = + Unique_t<Flatten_t<TypeList<typename TaskTrait<TASK>::TaskList...>>>; +}; + +// a task self +template <typename TASK> +struct TaskTrait< + TASK, std::enable_if_t<std::is_base_of<TaskSignature, TASK>::value>> { + using TaskList = TypeList<TASK>; +}; + +template <typename... 
TASK> struct TaskTrait<SomeTask<TASK...>> { + using TaskList = typename SomeTask<TASK...>::TaskList; +}; +} // namespace dsl +} // namespace tf diff --git a/myxpcs/include/taskflow_/dsl/tuple_utils.hpp b/myxpcs/include/taskflow_/dsl/tuple_utils.hpp new file mode 100644 index 0000000..633ba0e --- /dev/null +++ b/myxpcs/include/taskflow_/dsl/tuple_utils.hpp @@ -0,0 +1,43 @@ +// 2020/08/28 - Created by netcan: https://github.com/netcan +#pragma once +#include <cstddef> +#include <tuple> + +namespace tf { +namespace dsl { +namespace detail { +// get tuple element index by f, if not exists then index >= tuple_size +template <typename TUP, template <typename> class F, typename = void> +struct TupleElementByF { + constexpr static size_t Index = 0; +}; + +template <template <typename> class F, typename H, typename... Ts> +struct TupleElementByF<std::tuple<H, Ts...>, F, std::enable_if_t<F<H>::value>> { + constexpr static size_t Index = 0; +}; + +template <template <typename> class F, typename H, typename... Ts> +struct TupleElementByF<std::tuple<H, Ts...>, F, + std::enable_if_t<!F<H>::value>> { + constexpr static size_t Index = + 1 + TupleElementByF<std::tuple<Ts...>, F>::Index; +}; + +template <typename T, typename TUP, size_t... Is> +constexpr inline T AggregationByTupImpl(TUP &&tup, std::index_sequence<Is...>) { + return T{std::get<Is>(tup)...}; +} +} // namespace detail + +template <typename TUP, template <typename> class F> +constexpr size_t TupleElementByF_v = detail::TupleElementByF<TUP, F>::Index; + +template <typename T, typename TUP> +constexpr inline T AggregationByTup(TUP &&tup) { + return detail::AggregationByTupImpl<T>( + std::forward<TUP>(tup), + std::make_index_sequence<std::tuple_size<std::decay_t<TUP>>::size>{}); +} +} // namespace dsl +} // namespace tf diff --git a/myxpcs/include/taskflow_/dsl/type_list.hpp b/myxpcs/include/taskflow_/dsl/type_list.hpp new file mode 100644 index 0000000..c4af4a4 --- /dev/null +++ b/myxpcs/include/taskflow_/dsl/type_list.hpp @@ -0,0 +1,136 @@ +// 2020/08/28 - Created by netcan: https://github.com/netcan +#pragma once +#include <cstddef> + +namespace tf { +namespace dsl { +template <typename...> using void_t = void; + +template <typename... Ts> struct TypeList { + using type = TypeList<Ts...>; + static constexpr size_t size = 0; + + template <typename... T> struct append { using type = TypeList<T...>; }; + template <typename... T> using appendTo = typename append<T...>::type; + + template <typename T> using prepend = typename TypeList<T>::type; + + template <template <typename...> class T> using exportTo = T<Ts...>; +}; + +template <typename Head, typename... Tails> struct TypeList<Head, Tails...> { + using type = TypeList<Head, Tails...>; + using head = Head; + using tails = TypeList<Tails...>; + static constexpr size_t size = sizeof...(Tails) + 1; + + template <typename... Ts> struct append { + using type = TypeList<Head, Tails..., Ts...>; + }; + template <typename... Ts> using appendTo = typename append<Ts...>::type; + + template <typename T> + using prepend = typename TypeList<T, Head, Tails...>::type; + + template <template <typename...> class T> using exportTo = T<Head, Tails...>; +}; + +template <typename IN> struct IsTypeList { + constexpr static bool value = false; +}; + +template <typename IN> constexpr bool IsTypeList_v = IsTypeList<IN>::value; + +template <typename... Ts> struct IsTypeList<TypeList<Ts...>> { + constexpr static bool value = true; +}; + +template <typename... IN> struct Concat; + +template <typename... 
IN> using Concat_t = typename Concat<IN...>::type; + +template <> struct Concat<> { using type = TypeList<>; }; +template <typename IN> struct Concat<IN> { using type = IN; }; + +template <typename IN, typename IN2> struct Concat<IN, IN2> { + using type = typename IN2::template exportTo<IN::template append>::type; +}; + +template <typename IN, typename IN2, typename... Rest> +struct Concat<IN, IN2, Rest...> { + using type = Concat_t<Concat_t<IN, IN2>, Rest...>; +}; + +template <typename IN, typename OUT = TypeList<>, typename = void> +struct Flatten { + using type = OUT; +}; + +template <typename IN> using Flatten_t = typename Flatten<IN>::type; + +template <typename IN, typename OUT> +struct Flatten<IN, OUT, std::enable_if_t<IsTypeList_v<typename IN::head>>> { + using type = + typename Flatten<typename IN::tails, + Concat_t<OUT, Flatten_t<typename IN::head>>>::type; +}; + +template <typename IN, typename OUT> +struct Flatten<IN, OUT, std::enable_if_t<!IsTypeList_v<typename IN::head>>> { + using type = typename Flatten< + typename IN::tails, + typename OUT::template appendTo<typename IN::head>>::type; +}; + +template <typename IN, template <typename> class F> struct Map { + using type = TypeList<>; +}; + +template <typename IN, template <typename> class F> +using Map_t = typename Map<IN, F>::type; + +template <template <typename> class F, typename... Ts> +struct Map<TypeList<Ts...>, F> { + using type = TypeList<typename F<Ts>::type...>; +}; + +template <typename IN, template <typename> class F, typename OUT = TypeList<>, + typename = void> +struct Filter { + using type = OUT; +}; + +template <typename IN, template <typename> class F> +using Filter_t = typename Filter<IN, F>::type; + +template <typename IN, template <typename> class F, typename OUT> +class Filter<IN, F, OUT, void_t<typename IN::head>> { + using H = typename IN::head; + +public: + using type = typename std::conditional_t< + F<H>::value, + Filter<typename IN::tails, F, typename OUT::template appendTo<H>>, + Filter<typename IN::tails, F, OUT>>::type; +}; + +template <typename IN, typename = void> struct Unique { using type = IN; }; + +template <typename IN> using Unique_t = typename Unique<IN>::type; + +template <typename IN> class Unique<IN, void_t<typename IN::head>> { + template <typename T> struct IsDifferR { + template <typename R> struct apply { + static constexpr bool value = !std::is_same<T, R>::value; + }; + }; + + using tails = Unique_t<typename IN::tails>; + using eraseHead = + Filter_t<tails, IsDifferR<typename IN::head>::template apply>; + +public: + using type = typename eraseHead::template prepend<typename IN::head>; +}; +} // namespace dsl +} // namespace tf diff --git a/myxpcs/include/taskflow_/sycl/algorithm/reduce.hpp b/myxpcs/include/taskflow_/sycl/algorithm/reduce.hpp new file mode 100644 index 0000000..17dfa98 --- /dev/null +++ b/myxpcs/include/taskflow_/sycl/algorithm/reduce.hpp @@ -0,0 +1,487 @@ +#pragma once + +#include "../syclflow.hpp" + +namespace tf::detail { + +// ---------------------------------------------------------------------------- +// reduction helper functions +// ---------------------------------------------------------------------------- + +/** @private */ +template<unsigned nt, typename T> +struct syclBlockReduce { + + static const unsigned group_size = std::min(nt, SYCL_WARP_SIZE); + static const unsigned shm_size = std::max(nt, 2* group_size); + static const unsigned num_passes = log2(group_size); + static const unsigned num_items = nt / group_size; + + static_assert( + nt && (0 == 
nt % SYCL_WARP_SIZE), + "syclBlockReduce requires num threads to be a multiple of warp_size (32)" + ); + + using shm_t = sycl::accessor< + T, 1, sycl::access::mode::read_write, sycl::access::target::local + >; + + template<typename op_t> + T operator()( + sycl::nd_item<1>&, T, const shm_t&, unsigned, op_t, bool = true + ) const; +}; + +// function: reduce to be called from a block +template<unsigned nt, typename T> +template<typename op_t> +T syclBlockReduce<nt, T>::operator ()( + sycl::nd_item<1>& item, + T x, + const shm_t& shm, + unsigned count, + op_t op, + bool ret +) const { + + auto tid = item.get_local_id(0); + + // Store your data into shared memory. + shm[tid] = x; + item.barrier(sycl::access::fence_space::local_space); + + if(tid < group_size) { + // Each thread scans within its lane. + sycl_strided_iterate<group_size, num_items>([&](auto i, auto j) { + if(i > 0) { + x = op(x, shm[j]); + } + }, tid, count); + shm[tid] = x; + } + item.barrier(sycl::access::fence_space::local_space); + + auto count2 = count < group_size ? count : group_size; + auto first = (1 & num_passes) ? group_size : 0; + if(tid < group_size) { + shm[first + tid] = x; + } + item.barrier(sycl::access::fence_space::local_space); + + sycl_iterate<num_passes>([&](auto pass) { + if(tid < group_size) { + if(auto offset = 1 << pass; tid + offset < count2) { + x = op(x, shm[first + offset + tid]); + } + first = group_size - first; + shm[first + tid] = x; + } + item.barrier(sycl::access::fence_space::local_space); + }); + + if(ret) { + x = shm[0]; + item.barrier(sycl::access::fence_space::local_space); + } + return x; +} + +/** @private */ +template <typename P, typename I, typename T, typename O> +sycl::event sycl_reduce_loop( + P&& p, + I input, + unsigned count, + T* res, + O op, + bool incl, + void* ptr, + std::vector<sycl::event> evs +) { + + using E = std::decay_t<P>; + using R = syclBlockReduce<E::nt, T>; + + auto buf = static_cast<T*>(ptr); + auto B = (count + E::nv - 1) / E::nv; + + auto e = p.queue().submit([=, evs=std::move(evs)](sycl::handler& h) { + + h.depends_on(evs); + + // create a shared memory + typename R::shm_t shm(sycl::range<1>(R::shm_size), h); + + h.parallel_for( + sycl::nd_range<1>{sycl::range<1>(B*E::nt), sycl::range<1>(E::nt)}, + [=](sycl::nd_item<1> item) { + + auto tid = item.get_local_id(0); + auto bid = item.get_group(0); + + // get the tile of this group + auto tile = sycl_get_tile(bid, E::nv, count); + + // load data from input to register + auto x = sycl_mem_to_reg_strided<E::nt, E::vt>( + input + tile.begin, tid, tile.count() + ); + // reduce multiple values per thread into a scalar. + T s; + sycl_strided_iterate<E::nt, E::vt>( + [&] (auto i, auto) { s = i ? op(s, x[i]) : x[0]; }, tid, tile.count() + ); + // reduce to a scalar per block. + s = R()( + item, s, shm, (tile.count()<E::nt ? tile.count() : E::nt), op, false + ); + if(!tid) { + (1 == B) ? *res = (incl ? op(*res, s) : s) : buf[bid] = s; + } + } + ); + }); + + if(B > 1) { + return sycl_reduce_loop(p, buf, B, res, op, incl, buf+B, {e}); + } + else { + return e; + } +} + +} // end of namespace detail ------------------------------------------------- + +namespace tf { + +/** +@brief queries the buffer size in bytes needed to call reduce kernels + +@tparam P execution policy type +@tparam T value type + +@param count number of elements to reduce + +The function is used to allocate a buffer for calling asynchronous reduce. +Please refer to @ref SYCLSTDReduce for details. 
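+
+A minimal sketch of the intended use (the names @c q and @c N are
+assumptions for illustration, not part of this header):
+
+@code{.cpp}
+sycl::queue q;
+// scratch space needed to reduce N ints with the default execution policy
+auto bytes = tf::sycl_reduce_buffer_size<tf::syclDefaultExecutionPolicy, int>(N);
+void* buf = sycl::malloc_device(bytes, q);
+// ... pass buf to tf::sycl_reduce_async or tf::sycl_uninitialized_reduce_async ...
+sycl::free(buf, q);
+@endcode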
+*/ +template <typename P, typename T> +unsigned sycl_reduce_buffer_size(unsigned count) { + using E = std::decay_t<P>; + unsigned B = (count + E::nv - 1) / E::nv; + unsigned n = 0; + for(auto b=B; b>1; n += (b=(b+E::nv-1)/E::nv)); + return n*sizeof(T); +} + +//// sycl reduction +//template <typename I, typename T, typename C, bool uninitialized> +//auto syclFlow::_reduce_cgh(I first, I last, T* res, C&& op) { +// +// // TODO: special case N == 0? +// size_t N = std::distance(first, last); +// size_t B = _default_group_size(N); +// +// return [=, op=std::forward<C>(op)](sycl::handler& handler) mutable { +// +// // create a shared memory +// sycl::accessor< +// T, 1, sycl::access::mode::read_write, sycl::access::target::local +// > shm(sycl::range<1>(B), handler); +// +// // perform parallel reduction +// handler.parallel_for( +// sycl::nd_range<1>{sycl::range<1>(B), sycl::range<1>(B)}, +// [=] (sycl::nd_item<1> item) { +// +// size_t tid = item.get_global_id(0); +// +// if(tid >= N) { +// return; +// } +// +// shm[tid] = *(first+tid); +// +// for(size_t i=tid+B; i<N; i+=B) { +// shm[tid] = op(shm[tid], *(first+i)); +// } +// +// item.barrier(sycl::access::fence_space::local_space); +// +// for(size_t s = B / 2; s > 0; s >>= 1) { +// if(tid < s && tid + s < N) { +// shm[tid] = op(shm[tid], shm[tid+s]); +// } +// item.barrier(sycl::access::fence_space::local_space); +// } +// +// if(tid == 0) { +// if constexpr (uninitialized) { +// *res = shm[0]; +// } +// else { +// *res = op(*res, shm[0]); +// } +// } +// }); +// }; +//} + +// ---------------------------------------------------------------------------- +// SYCL standard reduce algorithms +// ---------------------------------------------------------------------------- + +/** +@brief performs parallel reduction over a range of items + +@tparam P execution policy type +@tparam I input iterator type +@tparam T value type +@tparam O binary operator type + +@param p execution policy +@param first iterator to the beginning of the range +@param last iterator to the end of the range +@param res pointer to the result +@param op binary operator to apply to reduce elements + +This method is equivalent to the parallel execution of the following loop +on a SYCL device: + +@code{.cpp} +while (first != last) { + *result = op(*result, *first++); +} +@endcode + */ +template<typename P, typename I, typename T, typename O> +void sycl_reduce(P&& p, I first, I last, T* res, O op) { + + unsigned count = std::distance(first, last); + + if(count == 0) { + return; + } + + // allocate temporary buffer + auto tmp = sycl::malloc_device( + sycl_reduce_buffer_size<P, T>(count), p.queue() + ); + + // reduction loop + detail::sycl_reduce_loop(p, first, count, res, op, true, tmp, {}).wait(); + + // deallocate the temporary buffer + sycl::free(tmp, p.queue()); +} + +/** +@brief performs asynchronous parallel reduction over a range of items + +@tparam P execution policy type +@tparam I input iterator type +@tparam T value type +@tparam O binary operator type + +@param p execution policy +@param first iterator to the beginning of the range +@param last iterator to the end of the range +@param res pointer to the result +@param op binary operator to apply to reduce elements +@param buf pointer to the temporary buffer + +@return an SYCL event + +Please refer to @ref SYCLSTDReduce for details. 
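+
+For illustration, a possible call sequence (the names @c q, @c data, @c res,
+@c N, and @c buf are assumptions; @c buf must point to a device buffer of at
+least tf::sycl_reduce_buffer_size bytes):
+
+@code{.cpp}
+tf::syclDefaultExecutionPolicy p(q);
+// *res holds the initial value and receives the final result
+auto event = tf::sycl_reduce_async(
+  p, data, data + N, res, tf::sycl_plus<int>{}, buf, {}
+);
+event.wait();
+@endcode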
+ */ +template<typename P, typename I, typename T, typename O> +sycl::event sycl_reduce_async( + P&& p, I first, I last, T* res, O op, void* buf, std::vector<sycl::event> dep +) { + + unsigned count = std::distance(first, last); + + if(count == 0) { + return {}; + } + + // reduction loop + return detail::sycl_reduce_loop( + p, first, count, res, op, true, buf, std::move(dep) + ); +} + +/** +@brief performs parallel reduction over a range of items + without an initial value + +@tparam P execution policy type +@tparam I input iterator type +@tparam T value type +@tparam O binary operator type + +@param p execution policy +@param first iterator to the beginning of the range +@param last iterator to the end of the range +@param res pointer to the result +@param op binary operator to apply to reduce elements + +This method is equivalent to the parallel execution of the following loop +on a SYCL device: + +@code{.cpp} +*result = *first++; // no initial values partitipcate in the loop +while (first != last) { + *result = op(*result, *first++); +} +@endcode +*/ +template<typename P, typename I, typename T, typename O> +void sycl_uninitialized_reduce(P&& p, I first, I last, T* res, O op) { + + unsigned count = std::distance(first, last); + + if(count == 0) { + return; + } + + // allocate temporary buffer + auto tmp = sycl::malloc_device( + sycl_reduce_buffer_size<P, T>(count), p.queue() + ); + + // reduction loop + detail::sycl_reduce_loop(p, first, count, res, op, false, tmp, {}).wait(); + + // deallocate the temporary buffer + sycl::free(tmp, p.queue()); +} + +/** +@brief performs asynchronous parallel reduction over a range of items + without an initial value + +@tparam P execution policy type +@tparam I input iterator type +@tparam T value type +@tparam O binary operator type + +@param p execution policy +@param first iterator to the beginning of the range +@param last iterator to the end of the range +@param res pointer to the result +@param op binary operator to apply to reduce elements +@param buf pointer to the temporary buffer + +@return an SYCL event + +Please refer to @ref SYCLSTDReduce for details. 
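+
+A sketch of chaining this call after a previously recorded event (the names
+@c q, @c data, @c res, @c N, @c buf, and @c prev are assumptions):
+
+@code{.cpp}
+tf::syclDefaultExecutionPolicy p(q);
+// *res is overwritten with the result; no initial value participates
+auto event = tf::sycl_uninitialized_reduce_async(
+  p, data, data + N, res, tf::sycl_plus<int>{}, buf, {prev}
+);
+event.wait();
+@endcode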
+*/ +template<typename P, typename I, typename T, typename O> +sycl::event sycl_uninitialized_reduce_async( + P&& p, I first, I last, T* res, O op, void* buf, std::vector<sycl::event> dep +) { + + unsigned count = std::distance(first, last); + + if(count == 0) { + return {}; + } + + // reduction loop + return detail::sycl_reduce_loop( + p, first, count, res, op, false, buf, std::move(dep) + ); +} + +// ---------------------------------------------------------------------------- +// syclFlow reduce +// ---------------------------------------------------------------------------- + +// Function: reduce +template <typename I, typename T, typename C> +syclTask syclFlow::reduce(I first, I last, T* res, C&& op) { + + //return on(_reduce_cgh<I, T, C, false>(first, last, res, std::forward<C>(op))); + + auto bufsz = sycl_reduce_buffer_size<syclDefaultExecutionPolicy, T>( + std::distance(first, last) + ); + + return on([=, buf=MoC{syclScopedDeviceMemory<std::byte>(bufsz, _queue)}] + (sycl::queue& queue, std::vector<sycl::event> events) mutable { + syclDefaultExecutionPolicy p(queue); + return sycl_reduce_async( + p, first, last, res, op, buf.get().data(), std::move(events) + ); + }); +} + +// Function: uninitialized_reduce +template <typename I, typename T, typename C> +syclTask syclFlow::uninitialized_reduce(I first, I last, T* res, C&& op) { + //return on(_reduce_cgh<I, T, C, true>(first, last, res, std::forward<C>(op))); + + auto bufsz = sycl_reduce_buffer_size<syclDefaultExecutionPolicy, T>( + std::distance(first, last) + ); + + return on([=, buf=MoC{syclScopedDeviceMemory<std::byte>(bufsz, _queue)}] + (sycl::queue& queue, std::vector<sycl::event> events) mutable { + syclDefaultExecutionPolicy p(queue); + return sycl_uninitialized_reduce_async( + p, first, last, res, op, buf.get().data(), std::move(events) + ); + }); + +} + +// ---------------------------------------------------------------------------- +// rebind methods +// ---------------------------------------------------------------------------- + +//// Function: reduce +//template <typename I, typename T, typename C> +//void syclFlow::reduce(syclTask task, I first, I last, T* res, C&& op) { +// //on(task, _reduce_cgh<I, T, C, false>( +// // first, last, res, std::forward<C>(op) +// //)); +// +// auto bufsz = sycl_reduce_buffer_size<syclDefaultExecutionPolicy, T>( +// std::distance(first, last) +// ); +// +// on(task, [=, buf=MoC{syclScopedDeviceMemory<std::byte>(bufsz, _queue)}] +// (sycl::queue& queue, std::vector<sycl::event> events) mutable { +// syclDefaultExecutionPolicy p(queue); +// return sycl_reduce_async( +// p, first, last, res, op, buf.get().data(), std::move(events) +// ); +// }); +//} +// +//// Function: uninitialized_reduce +//template <typename I, typename T, typename C> +//void syclFlow::uninitialized_reduce( +// syclTask task, I first, I last, T* res, C&& op +//) { +// //on(task, _reduce_cgh<I, T, C, true>( +// // first, last, res, std::forward<C>(op) +// //)); +// auto bufsz = sycl_reduce_buffer_size<syclDefaultExecutionPolicy, T>( +// std::distance(first, last) +// ); +// +// on(task, [=, buf=MoC{syclScopedDeviceMemory<std::byte>(bufsz, _queue)}] +// (sycl::queue& queue, std::vector<sycl::event> events) mutable { +// syclDefaultExecutionPolicy p(queue); +// return sycl_uninitialized_reduce_async( +// p, first, last, res, op, buf.get().data(), std::move(events) +// ); +// }); +//} + + +} // end of namespace tf ----------------------------------------------------- + + diff --git 
a/myxpcs/include/taskflow_/sycl/algorithm/sycl_for_each.hpp b/myxpcs/include/taskflow_/sycl/algorithm/sycl_for_each.hpp new file mode 100644 index 0000000..e61fa62 --- /dev/null +++ b/myxpcs/include/taskflow_/sycl/algorithm/sycl_for_each.hpp @@ -0,0 +1,88 @@ +#pragma once + +#include "../sycl_flow.hpp" + +namespace tf { + +// command group function object of for_each +template <typename I, typename C> +auto syclFlow::_for_each_cgh(I first, I last, C&& op) { + + // TODO: special case N == 0? + size_t N = std::distance(first, last); + size_t B = _default_group_size(N); + + return [=, op=std::forward<C>(op)] (sycl::handler& handler) mutable { + size_t _N = (N % B == 0) ? N : (N + B - N % B); + handler.parallel_for( + sycl::nd_range<1>{sycl::range<1>(_N), sycl::range<1>(B)}, + [=] (sycl::nd_item<1> item) { + size_t i = item.get_global_id(0); + if(i < N) { + op(*(first + i)); + } + } + ); + }; +} + +// command group function object of for_each_index +template <typename I, typename C> +auto syclFlow::_for_each_index_cgh(I first, I last, I step, C&& op) { + + if(is_range_invalid(first, last, step)) { + TF_THROW("invalid range [", first, ", ", last, ") with step size ", step); + } + + // TODO: special case when N is 0? + size_t N = distance(first, last, step); + size_t B = _default_group_size(N); + + return [=, op=std::forward<C>(op)] (sycl::handler& handler) mutable { + size_t _N = (N % B == 0) ? N : (N + B - N % B); + handler.parallel_for( + sycl::nd_range<1>{sycl::range<1>(_N), sycl::range<1>(B)}, + [=] (sycl::nd_item<1> item) { + size_t i = item.get_global_id(0); + if(i < N) { + op(static_cast<I>(i)*step + first); + } + } + ); + }; +} + +// ---------------------------------------------------------------------------- +// for_each and for_each_index algorithms +// ---------------------------------------------------------------------------- + +// Function: for_each +template <typename I, typename C> +syclTask syclFlow::for_each(I first, I last, C&& op) { + return on(_for_each_cgh(first, last, std::forward<C>(op))); +} + +// Function: for_each_index +template <typename I, typename C> +syclTask syclFlow::for_each_index(I beg, I end, I inc, C&& op) { + return on(_for_each_index_cgh(beg, end, inc, std::forward<C>(op))); +} + +// ---------------------------------------------------------------------------- +// rebind +// ---------------------------------------------------------------------------- + +// Function: for_each +template <typename I, typename C> +void syclFlow::for_each(syclTask task, I first, I last, C&& op) { + on(task, _for_each_cgh(first, last, std::forward<C>(op))); +} + +// Function: for_each_index +template <typename I, typename C> +void syclFlow::for_each_index(syclTask task, I beg, I end, I inc, C&& op) { + on(task, _for_each_index_cgh(beg, end, inc, std::forward<C>(op))); +} + + +} // end of namespace tf ----------------------------------------------------- diff --git a/myxpcs/include/taskflow_/sycl/algorithm/sycl_transform.hpp b/myxpcs/include/taskflow_/sycl/algorithm/sycl_transform.hpp new file mode 100644 index 0000000..b4372e2 --- /dev/null +++ b/myxpcs/include/taskflow_/sycl/algorithm/sycl_transform.hpp @@ -0,0 +1,46 @@ +#pragma once + +#include "../sycl_flow.hpp" + +namespace tf { + +// Function: _transform_cgh +template <typename I, typename C, typename... S> +auto syclFlow::_transform_cgh(I first, I last, C&& op, S... srcs) { + + // TODO: special case N == 0? 
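+  // N is the number of output elements and B is the work-group size chosen
+  // for N; the global range below is rounded up to the next multiple of B,
+  // and the kernel guards with (i < N) so the padded work-items do nothing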
+ size_t N = std::distance(first, last); + size_t B = _default_group_size(N); + + return [=, op=std::forward<C>(op)] (sycl::handler& handler) mutable { + + size_t _N = (N % B == 0) ? N : (N + B - N % B); + + handler.parallel_for( + sycl::nd_range<1>{sycl::range<1>(_N), sycl::range<1>(B)}, + [=] (sycl::nd_item<1> item) { + size_t i = item.get_global_id(0); + if(i < N) { + *(first + i) = op(*(srcs + i)...); + } + } + ); + }; +} + +// Function: transform +template <typename I, typename C, typename... S> +syclTask syclFlow::transform(I first, I last, C&& op, S... srcs) { + return on(_transform_cgh(first, last, std::forward<C>(op), srcs...)); +} + +// Procedure: transform +template <typename I, typename C, typename... S> +void syclFlow::transform( + syclTask task, I first, I last, C&& op, S... srcs +) { + on(task, _transform_cgh(first, last, std::forward<C>(op), srcs...)); +} + + +} // end of namespace tf ----------------------------------------------------- diff --git a/myxpcs/include/taskflow_/sycl/sycl_execution_policy.hpp b/myxpcs/include/taskflow_/sycl/sycl_execution_policy.hpp new file mode 100644 index 0000000..ceee08a --- /dev/null +++ b/myxpcs/include/taskflow_/sycl/sycl_execution_policy.hpp @@ -0,0 +1,70 @@ +#pragma once + +/** +@file sycl_execution_policy.hpp +@brief SYCL execution policy include file +*/ + +namespace tf { + +/** +@class syclExecutionPolicy + +@brief class to define execution policy for SYCL standard algorithms + +@tparam NT number of threads per block +@tparam VT number of work units per thread + +Execution policy configures the kernel execution parameters in SYCL algorithms. +The first template argument, @c NT, the number of threads per block should +always be a power-of-two number. +The second template argument, @c VT, the number of work units per thread +is recommended to be an odd number to avoid bank conflict. + +Details can be referred to @ref SYCLSTDExecutionPolicy. 
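+
+A brief sketch (the queue name @c q is an assumption):
+
+@code{.cpp}
+sycl::queue q;
+tf::syclExecutionPolicy<256, 7> policy(q);     // 256 threads/block, 7 units/thread
+tf::syclDefaultExecutionPolicy def_policy(q);  // 512 threads/block, 9 units/thread
+@endcode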
+*/ +template<unsigned NT, unsigned VT> +class syclExecutionPolicy { + + static_assert(is_pow2(NT), "max # threads per block must be a power of two"); + + public: + + /** @brief static constant for getting the number of threads per block */ + const static unsigned nt = NT; + + /** @brief static constant for getting the number of work units per thread */ + const static unsigned vt = VT; + + /** @brief static constant for getting the number of elements to process per block */ + const static unsigned nv = NT*VT; + + /** + @brief constructs an execution policy object with the given queue + */ + syclExecutionPolicy(sycl::queue& queue) : _queue{queue} {} + + /** + @brief returns an mutable reference to the associated queue + */ + sycl::queue& queue() noexcept { return _queue; }; + + /** + @brief returns an immutable reference to the associated queue + */ + const sycl::queue& queue() const noexcept { return _queue; } + + private: + + sycl::queue& _queue; +}; + +/** +@brief default execution policy + */ +using syclDefaultExecutionPolicy = syclExecutionPolicy<512, 9>; + +} // end of namespace tf ----------------------------------------------------- + + + diff --git a/myxpcs/include/taskflow_/sycl/sycl_graph.hpp b/myxpcs/include/taskflow_/sycl/sycl_graph.hpp new file mode 100644 index 0000000..3a6f786 --- /dev/null +++ b/myxpcs/include/taskflow_/sycl/sycl_graph.hpp @@ -0,0 +1,255 @@ +#pragma once + +#include <CL/sycl.hpp> + +#include "sycl_meta.hpp" + +namespace tf { + +// ---------------------------------------------------------------------------- +// syclGraph class +// ---------------------------------------------------------------------------- + +// class: syclGraph +class syclGraph : public CustomGraphBase { + + friend class syclNode; + friend class syclTask; + friend class syclFlow; + friend class Taskflow; + friend class Executor; + + constexpr static int OFFLOADED = 0x01; + constexpr static int TOPOLOGY_CHANGED = 0x02; + + public: + + syclGraph() = default; + ~syclGraph() = default; + + syclGraph(const syclGraph&) = delete; + syclGraph(syclGraph&&); + + syclGraph& operator = (const syclGraph&) = delete; + syclGraph& operator = (syclGraph&&); + + template <typename... ArgsT> + syclNode* emplace_back(ArgsT&&...); + + bool empty() const; + + void clear(); + void dump(std::ostream&, const void*, const std::string&) const override final; + + private: + + int _state {0}; + + std::vector<std::unique_ptr<syclNode>> _nodes; +}; + +// ---------------------------------------------------------------------------- +// syclNode definitions +// ---------------------------------------------------------------------------- + +// class: syclNode +class syclNode { + + friend class syclGraph; + friend class syclTask; + friend class syclFlow; + friend class Taskflow; + friend class Executor; + + struct Empty { + }; + + struct CGH { + + std::function<void(sycl::handler&)> work; + + template <typename F> + CGH(F&& func) : work {std::forward<F>(func)} {} + }; + + using handle_t = std::variant< + Empty, + CGH + >; + + public: + + // variant index + constexpr static auto EMPTY = get_index_v<Empty, handle_t>; + constexpr static auto COMMAND_GROUP_HANDLER = get_index_v<CGH, handle_t>; + + syclNode() = delete; + + template <typename... 
ArgsT> + syclNode(syclGraph&, ArgsT&&...); + + private: + + syclGraph& _graph; + + std::string _name; + + int _level; + + sycl::event _event; + + handle_t _handle; + + SmallVector<syclNode*> _successors; + SmallVector<syclNode*> _dependents; + + void _precede(syclNode*); +}; + +// ---------------------------------------------------------------------------- +// syclNode definitions +// ---------------------------------------------------------------------------- + +// Constructor +template <typename... ArgsT> +syclNode::syclNode(syclGraph& g, ArgsT&&... args) : + _graph {g}, + _handle {std::forward<ArgsT>(args)...} { +} + +// Procedure: _precede +inline void syclNode::_precede(syclNode* v) { + _graph._state |= syclGraph::TOPOLOGY_CHANGED; + _successors.push_back(v); + v->_dependents.push_back(this); +} + +// ---------------------------------------------------------------------------- +// syclGraph definitions +// ---------------------------------------------------------------------------- + +// Move constructor +inline syclGraph::syclGraph(syclGraph&& g) : + _nodes {std::move(g._nodes)} { + + assert(g._nodes.empty()); +} + +// Move assignment +inline syclGraph& syclGraph::operator = (syclGraph&& rhs) { + + // lhs + _nodes = std::move(rhs._nodes); + + assert(rhs._nodes.empty()); + + return *this; +} + +// Function: empty +inline bool syclGraph::empty() const { + return _nodes.empty(); +} + +// Procedure: clear +inline void syclGraph::clear() { + _state = syclGraph::TOPOLOGY_CHANGED; + _nodes.clear(); +} + +// Function: emplace_back +template <typename... ArgsT> +syclNode* syclGraph::emplace_back(ArgsT&&... args) { + + _state |= syclGraph::TOPOLOGY_CHANGED; + + auto node = std::make_unique<syclNode>(std::forward<ArgsT>(args)...); + _nodes.emplace_back(std::move(node)); + return _nodes.back().get(); + + // TODO: object pool + + //auto node = new syclNode(std::forward<ArgsT>(args)...); + //_nodes.push_back(node); + //return node; +} + +// Procedure: dump the graph to a DOT format +inline void syclGraph::dump( + std::ostream& os, const void* root, const std::string& root_name +) const { + + // recursive dump with stack + std::stack<std::tuple<const syclGraph*, const syclNode*, int>> stack; + stack.push(std::make_tuple(this, nullptr, 1)); + + int pl = 0; + + while(!stack.empty()) { + + auto [graph, parent, l] = stack.top(); + stack.pop(); + + for(int i=0; i<pl-l+1; i++) { + os << "}\n"; + } + + if(parent == nullptr) { + if(root) { + os << "subgraph cluster_p" << root << " {\nlabel=\"syclFlow: "; + if(root_name.empty()) os << 'p' << root; + else os << root_name; + os << "\";\n" << "color=\"red\"\n"; + } + else { + os << "digraph syclFlow {\n"; + } + } + else { + os << "subgraph cluster_p" << parent << " {\nlabel=\"syclSubflow: "; + if(parent->_name.empty()) os << 'p' << parent; + else os << parent->_name; + os << "\";\n" << "color=\"purple\"\n"; + } + + for(auto& v : graph->_nodes) { + + os << 'p' << v.get() << "[label=\""; + if(v->_name.empty()) { + os << 'p' << v.get() << "\""; + } + else { + os << v->_name << "\""; + } + os << "];\n"; + + for(const auto s : v->_successors) { + os << 'p' << v.get() << " -> " << 'p' << s << ";\n"; + } + + if(v->_successors.size() == 0) { + if(parent == nullptr) { + if(root) { + os << 'p' << v.get() << " -> p" << root << ";\n"; + } + } + else { + os << 'p' << v.get() << " -> p" << parent << ";\n"; + } + } + } + + // set the previous level + pl = l; + } + + for(int i=0; i<pl; i++) { + os << "}\n"; + } + +} + + +} // end of namespace tf 
----------------------------------------------------- + + diff --git a/myxpcs/include/taskflow_/sycl/sycl_meta.hpp b/myxpcs/include/taskflow_/sycl/sycl_meta.hpp new file mode 100644 index 0000000..b3c4af1 --- /dev/null +++ b/myxpcs/include/taskflow_/sycl/sycl_meta.hpp @@ -0,0 +1,517 @@ +#pragma once + +#include "sycl_execution_policy.hpp" + +namespace tf { + +// default warp size +inline constexpr unsigned SYCL_WARP_SIZE = 32; + +// empty type +struct syclEmpty { }; + +// ---------------------------------------------------------------------------- +// iterator unrolling +// ---------------------------------------------------------------------------- + +// Template unrolled looping construct. +template<unsigned i, unsigned count, bool valid = (i < count)> +struct syclIterate { + template<typename F> + static void eval(F f) { + f(i); + syclIterate<i + 1, count>::eval(f); + } +}; + +template<unsigned i, unsigned count> +struct syclIterate<i, count, false> { + template<typename F> + static void eval(F) { } +}; + +template<unsigned begin, unsigned end, typename F> +void sycl_iterate(F f) { + syclIterate<begin, end>::eval(f); +} + +template<unsigned count, typename F> +void sycl_iterate(F f) { + sycl_iterate<0, count>(f); +} + +template<unsigned count, typename T> +T reduce(const T(&x)[count]) { + T y; + sycl_iterate<count>([&](auto i) { y = i ? x[i] + y : x[i]; }); + return y; +} + +template<unsigned count, typename T> +void fill(T(&x)[count], T val) { + sycl_iterate<count>([&](auto i) { x[i] = val; }); +} + +// Invoke unconditionally. +template<unsigned nt, unsigned vt, typename F> +void sycl_strided_iterate(F f, unsigned tid) { + sycl_iterate<vt>([=](auto i) { f(i, nt * i + tid); }); +} + +// Check range. +template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename F> +void sycl_strided_iterate(F f, unsigned tid, unsigned count) { + // Unroll the first vt0 elements of each thread. + if(vt0 > 1 && count >= nt * vt0) { + sycl_strided_iterate<nt, vt0>(f, tid); // No checking + } else { + sycl_iterate<vt0>([=](auto i) { + auto j = nt * i + tid; + if(j < count) f(i, j); + }); + } + + // TODO: seems dummy when vt0 == vt + sycl_iterate<vt0, vt>([=](auto i) { + auto j = nt * i + tid; + if(j < count) f(i, j); + }); +} + +template<unsigned vt, typename F> +void sycl_thread_iterate(F f, unsigned tid) { + sycl_iterate<vt>([=](auto i) { f(i, vt * tid + i); }); +} + +// ---------------------------------------------------------------------------- +// syclRange +// ---------------------------------------------------------------------------- + +// syclRange +struct syclRange { + unsigned begin, end; + unsigned size() const { return end - begin; } + unsigned count() const { return size(); } + bool valid() const { return end > begin; } +}; + +inline syclRange sycl_get_tile(unsigned b, unsigned nv, unsigned count) { + return syclRange { nv * b, std::min(count, nv * (b + 1)) }; +} + + +// ---------------------------------------------------------------------------- +// syclArray +// ---------------------------------------------------------------------------- + +template<typename T, unsigned size> +struct syclArray { + T data[size]; + + T operator[](unsigned i) const { return data[i]; } + T& operator[](unsigned i) { return data[i]; } + + syclArray() = default; + syclArray(const syclArray&) = default; + syclArray& operator=(const syclArray&) = default; + + // Fill the array with x. 
+ syclArray(T x) { + sycl_iterate<size>([&](unsigned i) { data[i] = x; }); + } +}; + +template<typename T> +struct syclArray<T, 0> { + T operator[](unsigned) const { return T(); } + T& operator[](unsigned) { return *(T*)nullptr; } +}; + +template<typename T, typename V, unsigned size> +struct syclKVArray { + syclArray<T, size> keys; + syclArray<V, size> vals; +}; + +// ---------------------------------------------------------------------------- +// thread reg <-> global mem +// ---------------------------------------------------------------------------- + +template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename I> +auto sycl_mem_to_reg_strided(I mem, unsigned tid, unsigned count) { + using T = typename std::iterator_traits<I>::value_type; + syclArray<T, vt> x; + sycl_strided_iterate<nt, vt, vt0>( + [&](auto i, auto j) { x[i] = mem[j]; }, tid, count + ); + return x; +} + +template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename T, typename it_t> +void sycl_reg_to_mem_strided( + syclArray<T, vt> x, unsigned tid, unsigned count, it_t mem) { + + sycl_strided_iterate<nt, vt, vt0>( + [=](auto i, auto j) { mem[j] = x[i]; }, tid, count + ); +} + +template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename I, typename O> +auto sycl_transform_mem_to_reg_strided( + I mem, unsigned tid, unsigned count, O op +) { + using T = std::invoke_result_t<O, typename std::iterator_traits<I>::value_type>; + syclArray<T, vt> x; + sycl_strided_iterate<nt, vt, vt0>( + [&](auto i, auto j) { x[i] = op(mem[j]); }, tid, count + ); + return x; +} + +// ---------------------------------------------------------------------------- +// thread reg <-> shared +// ---------------------------------------------------------------------------- + +//template<unsigned nt, unsigned vt, typename T, unsigned shared_size> +//void sycl_reg_to_shared_thread( +// syclArray<T, vt> x, unsigned tid, T (&shared)[shared_size], bool sync = true +//) { +// +// static_assert(shared_size >= nt * vt, +// "reg_to_shared_thread must have at least nt * vt storage"); +// +// sycl_thread_iterate<vt>([&](auto i, auto j) { shared[j] = x[i]; }, tid); +// +// if(sync) __syncthreads(); +//} +// +//template<unsigned nt, unsigned vt, typename T, unsigned shared_size> +//auto sycl_shared_to_reg_thread( +// const T (&shared)[shared_size], unsigned tid, bool sync = true +//) { +// +// static_assert(shared_size >= nt * vt, +// "reg_to_shared_thread must have at least nt * vt storage"); +// +// syclArray<T, vt> x; +// sycl_thread_iterate<vt>([&](auto i, auto j) { +// x[i] = shared[j]; +// }, tid); +// +// if(sync) __syncthreads(); +// +// return x; +//} +// +//template<unsigned nt, unsigned vt, typename T, unsigned shared_size> +//void sycl_reg_to_shared_strided( +// syclArray<T, vt> x, unsigned tid, T (&shared)[shared_size], bool sync = true +//) { +// +// static_assert(shared_size >= nt * vt, +// "reg_to_shared_strided must have at least nt * vt storage"); +// +// sycl_strided_iterate<nt, vt>( +// [&](auto i, auto j) { shared[j] = x[i]; }, tid +// ); +// +// if(sync) __syncthreads(); +//} +// +//template<unsigned nt, unsigned vt, typename T, unsigned shared_size> +//auto sycl_shared_to_reg_strided( +// const T (&shared)[shared_size], unsigned tid, bool sync = true +//) { +// +// static_assert(shared_size >= nt * vt, +// "shared_to_reg_strided must have at least nt * vt storage"); +// +// syclArray<T, vt> x; +// sycl_strided_iterate<nt, vt>([&](auto i, auto j) { x[i] = shared[j]; }, tid); +// if(sync) __syncthreads(); +// +// return x; +//} +// 
+//template< +// unsigned nt, unsigned vt, unsigned vt0 = vt, typename T, typename it_t, +// unsigned shared_size +//> +//auto sycl_reg_to_mem_thread( +// syclArray<T, vt> x, unsigned tid, +// unsigned count, it_t mem, T (&shared)[shared_size] +//) { +// sycl_reg_to_shared_thread<nt>(x, tid, shared); +// auto y = sycl_shared_to_reg_strided<nt, vt>(shared, tid); +// sycl_reg_to_mem_strided<nt, vt, vt0>(y, tid, count, mem); +//} +// +//template< +// unsigned nt, unsigned vt, unsigned vt0 = vt, typename T, typename it_t, +// unsigned shared_size +//> +//auto sycl_mem_to_reg_thread( +// it_t mem, unsigned tid, unsigned count, T (&shared)[shared_size] +//) { +// +// auto x = sycl_mem_to_reg_strided<nt, vt, vt0>(mem, tid, count); +// sycl_reg_to_shared_strided<nt, vt>(x, tid, shared); +// auto y = sycl_shared_to_reg_thread<nt, vt>(shared, tid); +// return y; +//} +// +//template<unsigned nt, unsigned vt, typename T, unsigned S> +//auto sycl_shared_gather( +// const T(&data)[S], syclArray<unsigned, vt> indices, bool sync = true +//) { +// +// static_assert(S >= nt * vt, +// "shared_gather must have at least nt * vt storage"); +// +// syclArray<T, vt> x; +// sycl_iterate<vt>([&](auto i) { x[i] = data[indices[i]]; }); +// +// if(sync) __syncthreads(); +// +// return x; +//} +// +// +// +//// ---------------------------------------------------------------------------- +//// reg<->reg +//// ---------------------------------------------------------------------------- +// +//template<unsigned nt, unsigned vt, typename T, unsigned S> +//auto sycl_reg_thread_to_strided( +// syclArray<T, vt> x, unsigned tid, T (&shared)[S] +//) { +// sycl_reg_to_shared_thread<nt>(x, tid, shared); +// return sycl_shared_to_reg_strided<nt, vt>(shared, tid); +//} +// +//template<unsigned nt, unsigned vt, typename T, unsigned S> +//auto sycl_reg_strided_to_thread( +// syclArray<T, vt> x, unsigned tid, T (&shared)[S] +//) { +// sycl_reg_to_shared_strided<nt>(x, tid, shared); +// return sycl_shared_to_reg_thread<nt, vt>(shared, tid); +//} + +// ---------------------------------------------------------------------------- +// syclLoadStoreIterator +// ---------------------------------------------------------------------------- + +template<typename L, typename S, typename T, typename I> +struct syclLoadStoreIterator : std::iterator_traits<const T*> { + + L load; + S store; + I base; + + syclLoadStoreIterator(L load_, S store_, I base_) : + load(load_), store(store_), base(base_) { } + + struct assign_t { + L load; + S store; + I index; + + assign_t& operator=(T rhs) { + static_assert(!std::is_same<S, syclEmpty>::value, + "load_iterator is being stored to."); + store(rhs, index); + return *this; + } + operator T() const { + static_assert(!std::is_same<L, syclEmpty>::value, + "store_iterator is being loaded from."); + return load(index); + } + }; + + assign_t operator[](I index) const { + return assign_t { load, store, base + index }; + } + assign_t operator*() const { + return assign_t { load, store, base }; + } + + syclLoadStoreIterator operator+(I offset) const { + syclLoadStoreIterator cp = *this; + cp += offset; + return cp; + } + + syclLoadStoreIterator& operator+=(I offset) { + base += offset; + return *this; + } + + syclLoadStoreIterator operator-(I offset) const { + syclLoadStoreIterator cp = *this; + cp -= offset; + return cp; + } + + syclLoadStoreIterator& operator-=(I offset) { + base -= offset; + return *this; + } +}; + +//template<typename T> +//struct trivial_load_functor { +// template<typename I> +// T operator()(I 
index) const { +// return T(); +// } +//}; + +//template<typename T> +//struct trivial_store_functor { +// template<typename I> +// void operator()(T v, I index) const { } +//}; + +template <typename T, typename I = int, typename L, typename S> +auto sycl_make_load_store_iterator(L load, S store, I base = 0) { + return syclLoadStoreIterator<L, S, T, I>(load, store, base); +} + +template <typename T, typename I = int, typename L> +auto sycl_make_load_iterator(L load, I base = 0) { + return sycl_make_load_store_iterator<T>(load, syclEmpty(), base); +} + +template <typename T, typename I = int, typename S> +auto sycl_make_store_iterator(S store, I base = 0) { + return sycl_make_load_store_iterator<T>(syclEmpty(), store, base); +} + +// ---------------------------------------------------------------------------- +// swap +// ---------------------------------------------------------------------------- + +template<typename T> +void sycl_swap(T& a, T& b) { + auto c = a; + a = b; + b = c; +} + +// ---------------------------------------------------------------------------- +// launch kernel +// ---------------------------------------------------------------------------- + +//template<typename F, typename... args_t> +//__global__ void sycl_kernel(F f, args_t... args) { +// f(threadIdx.x, blockIdx.x, args...); +//} + +// ---------------------------------------------------------------------------- +// operators +// ---------------------------------------------------------------------------- + +template <typename T> +struct sycl_plus : public std::binary_function<T, T, T> { + T operator()(T a, T b) const { return a + b; } +}; + +template <typename T> +struct sycl_minus : public std::binary_function<T, T, T> { + T operator()(T a, T b) const { return a - b; } +}; + +template <typename T> +struct sycl_multiplies : public std::binary_function<T, T, T> { + T operator()(T a, T b) const { return a * b; } +}; + +template <typename T> +struct sycl_maximum : public std::binary_function<T, T, T> { + T operator()(T a, T b) const { return a > b ? a : b; } +}; + +template <typename T> +struct sycl_minimum : public std::binary_function<T, T, T> { + T operator()(T a, T b) const { return a < b ? 
a : b; } +}; + +template <typename T> +struct sycl_less : public std::binary_function<T, T, T> { + T operator()(T a, T b) const { return a < b; } +}; + +template <typename T> +struct sycl_greater : public std::binary_function<T, T, T> { + T operator()(T a, T b) const { return a > b; } +}; + +// ---------------------------------------------------------------------------- +// Memory Object +// ---------------------------------------------------------------------------- + +/** +@private +*/ +template <typename T> +class syclScopedDeviceMemory { + + public: + + syclScopedDeviceMemory() = delete; + + syclScopedDeviceMemory(size_t N, sycl::queue& queue) : + _queue {queue}, + _N {N} { + if(N) { + _data = sycl::malloc_device<T>(N, _queue); + } + } + + syclScopedDeviceMemory(syclScopedDeviceMemory&& rhs) : + _queue{std::move(rhs._queue)}, _data{rhs._data}, _N {rhs._N} { + rhs._data = nullptr; + rhs._N = 0; + } + + ~syclScopedDeviceMemory() { + if(_data) { + sycl::free(_data, _queue); + } + } + + syclScopedDeviceMemory& operator = (syclScopedDeviceMemory&& rhs) { + if(_data) { + sycl::free(_data, _queue); + } + _queue = std::move(rhs._queue); + _data = rhs._data; + _N = rhs._N; + rhs._data = nullptr; + rhs._N = 0; + return *this; + } + + size_t size() const { return _N; } + + T* data() { return _data; } + const T* data() const { return _data; } + + syclScopedDeviceMemory(const syclScopedDeviceMemory&) = delete; + syclScopedDeviceMemory& operator = (const syclScopedDeviceMemory&) = delete; + + private: + + sycl::queue& _queue; + + T* _data {nullptr}; + size_t _N {0}; +}; + + +} // end of namespace tf ----------------------------------------------------- + + + diff --git a/myxpcs/include/taskflow_/sycl/sycl_task.hpp b/myxpcs/include/taskflow_/sycl/sycl_task.hpp new file mode 100644 index 0000000..ed83ef4 --- /dev/null +++ b/myxpcs/include/taskflow_/sycl/sycl_task.hpp @@ -0,0 +1,209 @@ +#pragma once + +#include "sycl_graph.hpp" + +/** +@file sycl_task.hpp +@brief syclTask include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// syclTask +// ---------------------------------------------------------------------------- + +/** +@class syclTask + +@brief handle to a node of the internal CUDA graph +*/ +class syclTask { + + friend class syclFlow; + + friend std::ostream& operator << (std::ostream&, const syclTask&); + + public: + + /** + @brief constructs an empty syclTask + */ + syclTask() = default; + + /** + @brief copy-constructs a syclTask + */ + syclTask(const syclTask&) = default; + + /** + @brief copy-assigns a syclTask + */ + syclTask& operator = (const syclTask&) = default; + + /** + @brief adds precedence links from this to other tasks + + @tparam Ts parameter pack + + @param tasks one or multiple tasks + + @return @c *this + */ + template <typename... Ts> + syclTask& precede(Ts&&... tasks); + + /** + @brief adds precedence links from other tasks to this + + @tparam Ts parameter pack + + @param tasks one or multiple tasks + + @return @c *this + */ + template <typename... Ts> + syclTask& succeed(Ts&&... 
tasks); + + /** + @brief assigns a name to the task + + @param name a @std_string acceptable string + + @return @c *this + */ + syclTask& name(const std::string& name); + + /** + @brief queries the name of the task + */ + const std::string& name() const; + + /** + @brief queries the number of successors + */ + size_t num_successors() const; + + /** + @brief queries the number of dependents + */ + size_t num_dependents() const; + + /** + @brief queries if the task is associated with a syclNode + */ + bool empty() const; + + /** + @brief dumps the task through an output stream + + @tparam T output stream type with insertion operator (<<) defined + @param ostream an output stream target + */ + template <typename T> + void dump(T& ostream) const; + + /** + @brief applies an visitor callable to each successor of the task + */ + template <typename V> + void for_each_successor(V&& visitor) const; + + /** + @brief applies an visitor callable to each dependents of the task + */ + template <typename V> + void for_each_dependent(V&& visitor) const; + + private: + + syclTask(syclNode*); + + syclNode* _node {nullptr}; +}; + +// Constructor +inline syclTask::syclTask(syclNode* node) : _node {node} { +} + +// Function: precede +template <typename... Ts> +syclTask& syclTask::precede(Ts&&... tasks) { + (_node->_precede(tasks._node), ...); + return *this; +} + +// Function: succeed +template <typename... Ts> +syclTask& syclTask::succeed(Ts&&... tasks) { + (tasks._node->_precede(_node), ...); + return *this; +} + +// Function: empty +inline bool syclTask::empty() const { + return _node == nullptr; +} + +// Function: name +inline syclTask& syclTask::name(const std::string& name) { + _node->_name = name; + return *this; +} + +// Function: name +inline const std::string& syclTask::name() const { + return _node->_name; +} + +// Function: num_successors +inline size_t syclTask::num_successors() const { + return _node->_successors.size(); +} + +// Function: num_dependents +inline size_t syclTask::num_dependents() const { + return _node->_dependents.size(); +} + +// Procedure: dump +template <typename T> +void syclTask::dump(T& os) const { + os << "syclTask "; + if(_node->_name.empty()) os << _node; + else os << _node->_name; +} + +// Function: for_each_successor +template <typename V> +void syclTask::for_each_successor(V&& visitor) const { + for(size_t i=0; i<_node->_successors.size(); ++i) { + visitor(syclTask(_node->_successors[i])); + } +} + +// Function: for_each_dependent +template <typename V> +void syclTask::for_each_dependent(V&& visitor) const { + for(size_t i=0; i<_node->_dependents.size(); ++i) { + visitor(syclTask(_node->_dependents[i])); + } +} + + +// ---------------------------------------------------------------------------- +// global ostream +// ---------------------------------------------------------------------------- + +/** +@brief overload of ostream inserter operator for syclTask +*/ +inline std::ostream& operator << (std::ostream& os, const syclTask& ct) { + ct.dump(os); + return os; +} + +} // end of namespace tf ----------------------------------------------------- + + + diff --git a/myxpcs/include/taskflow_/sycl/syclflow.hpp b/myxpcs/include/taskflow_/sycl/syclflow.hpp new file mode 100644 index 0000000..a2a0976 --- /dev/null +++ b/myxpcs/include/taskflow_/sycl/syclflow.hpp @@ -0,0 +1,684 @@ +#pragma once + +#include "../taskflow.hpp" +#include "sycl_task.hpp" + +/** +@file syclflow.hpp +@brief main syclFlow include file +*/ + +namespace tf { + +// 
---------------------------------------------------------------------------- +// class definition: syclFlow +// ---------------------------------------------------------------------------- + +/** +@class syclFlow + +@brief class for building a SYCL task dependency graph + +*/ +class syclFlow { + + friend class Executor; + + struct External { + syclGraph graph; + }; + + struct Internal { + Executor& executor; + Internal(Executor& e) : executor {e} {} + }; + + using handle_t = std::variant<External, Internal>; + + public: + + /** + @brief constructs a standalone %syclFlow from the given queue + + A standalone %syclFlow does not go through any taskflow and + can be run by the caller thread using explicit offload methods + (e.g., tf::syclFlow::offload). + */ + syclFlow(sycl::queue& queue); + + /** + @brief destroys the %syclFlow + */ + ~syclFlow() = default; + + /** + @brief queries the emptiness of the graph + */ + bool empty() const; + + /** + @brief queries the number of tasks + */ + size_t num_tasks() const; + + /** + @brief dumps the %syclFlow graph into a DOT format through an + output stream + */ + void dump(std::ostream& os) const; + + /** + @brief clear the associated graph + */ + void clear(); + + // ------------------------------------------------------------------------ + // Generic device operations + // ------------------------------------------------------------------------ + + /** + @brief creates a task that launches the given command group function object + + @tparam F type of command group function object + @param func function object that is constructible from + std::function<void(sycl::handler&)> + + Creates a task that is associated from the given command group. + In SYCL, each command group function object is given a unique + command group handler object to perform all the necessary work + required to correctly process data on a device using a kernel. + */ + template <typename F, std::enable_if_t< + std::is_invocable_r_v<void, F, sycl::handler&>, void>* = nullptr + > + syclTask on(F&& func); + + /** + @brief updates the task to the given command group function object + + Similar to tf::syclFlow::on but operates on an existing task. + */ + template <typename F, std::enable_if_t< + std::is_invocable_r_v<void, F, sycl::handler&>, void>* = nullptr + > + void on(syclTask task, F&& func); + + /** + @brief creates a memcpy task that copies untyped data in bytes + + @param tgt pointer to the target memory block + @param src pointer to the source memory block + @param bytes bytes to copy + + @return a tf::syclTask handle + + A memcpy task transfers @c bytes of data from a source locationA @c src + to a target location @c tgt. Both @c src and @c tgt may be either host + or USM pointers. + */ + syclTask memcpy(void* tgt, const void* src, size_t bytes); + + /** + @brief creates a memset task that fills untyped data with a byte value + + @param ptr pointer to the destination device memory area + @param value value to set for each byte of specified memory + @param bytes number of bytes to set + + @return a tf::syclTask handle + + Fills @c bytes of memory beginning at address @c ptr with @c value. + @c ptr must be a USM allocation. + @c value is interpreted as an unsigned char. 
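+
+  A short sketch (the names @c q and @c data are assumptions):
+
+  @code{.cpp}
+  int* data = sycl::malloc_device<int>(1000, q);
+  tf::syclFlow cf(q);
+  tf::syclTask t = cf.memset(data, 0, 1000 * sizeof(int));  // zero all bytes
+  @endcode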
+ */ + syclTask memset(void* ptr, int value, size_t bytes); + + /** + @brief creates a fill task that fills typed data with the given value + + @tparam T trivially copyable value type + + @param ptr pointer to the memory to fill + @param pattern pattern value to fill into the memory + @param count number of items to fill the value + + Creates a task that fills the specified memory with the + specified value. + */ + template <typename T> + syclTask fill(void* ptr, const T& pattern, size_t count); + + /** + @brief creates a copy task that copies typed data from a source to a target + memory block + + @tparam T trivially copyable value type + + @param target pointer to the memory to fill + @param source pointer to the pattern value to fill into the memory + @param count number of items to fill the value + + Creates a task that copies @c count items of type @c T from a source memory + location to a target memory location. + */ + template <typename T, + std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr + > + syclTask copy(T* target, const T* source, size_t count); + + /** + @brief creates a kernel task + + @tparam ArgsT arguments types + + @param args arguments to forward to the parallel_for methods defined + in the handler object + + Creates a kernel task from a parallel_for method through the handler + object associated with a command group. + */ + template <typename...ArgsT> + syclTask parallel_for(ArgsT&&... args); + + // ------------------------------------------------------------------------ + // algorithms + // ------------------------------------------------------------------------ + + /** + @brief invokes a SYCL kernel function using only one thread + + @tparam F kernel function type + @param func kernel function + + Creates a task that launches the given function object using only one + kernel thread. 
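  A minimal sketch (added for illustration; @c flow is assumed to be a
  constructed %syclFlow and @c counter a device-accessible USM pointer):

  @code{.cpp}
  tf::syclTask init = flow.single_task([counter] () {
    *counter = 0;   // executed by exactly one kernel thread
  });
  @endcode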
+ */ + template <typename F> + syclTask single_task(F&& func); + + /** + @brief applies a callable to each dereferenced element of the data array + + @tparam I iterator type + @tparam C callable type + + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + @param callable a callable object to apply to the dereferenced iterator + + @return a tf::syclTask handle + + This method is equivalent to the parallel execution of the following loop on a GPU: + + @code{.cpp} + for(auto itr = first; itr != last; itr++) { + callable(*itr); + } + @endcode + */ + template <typename I, typename C> + syclTask for_each(I first, I last, C&& callable); + + /** + @brief applies a callable to each index in the range with the step size + + @tparam I index type + @tparam C callable type + + @param first beginning index + @param last last index + @param step step size + @param callable the callable to apply to each element in the data array + + @return a tf::syclTask handle + + This method is equivalent to the parallel execution of the following loop on a GPU: + + @code{.cpp} + // step is positive [first, last) + for(auto i=first; i<last; i+=step) { + callable(i); + } + + // step is negative [first, last) + for(auto i=first; i>last; i+=step) { + callable(i); + } + @endcode + */ + template <typename I, typename C> + syclTask for_each_index(I first, I last, I step, C&& callable); + + /** + @brief applies a callable to a source range and stores the result in a target range + + @tparam I iterator type + @tparam C callable type + @tparam S source types + + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + @param callable the callable to apply to each element in the range + @param srcs iterators to the source ranges + + @return a tf::syclTask handle + + This method is equivalent to the parallel execution of the following + loop on a SYCL device: + + @code{.cpp} + while (first != last) { + *first++ = callable(*src1++, *src2++, *src3++, ...); + } + @endcode + */ + template <typename I, typename C, typename... S> + syclTask transform(I first, I last, C&& callable, S... 
srcs); + + /** + @brief performs parallel reduction over a range of items + + @tparam I input iterator type + @tparam T value type + @tparam C callable type + + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + @param result pointer to the result with an initialized value + @param op binary reduction operator + + @return a tf::syclTask handle + + This method is equivalent to the parallel execution of the following loop + on a SYCL device: + + @code{.cpp} + while (first != last) { + *result = op(*result, *first++); + } + @endcode + */ + template <typename I, typename T, typename C> + syclTask reduce(I first, I last, T* result, C&& op); + + /** + @brief similar to tf::syclFlow::reduce but does not assume any initial + value to reduce + + This method is equivalent to the parallel execution of the following loop + on a SYCL device: + + @code{.cpp} + *result = *first++; // no initial values partitipcate in the loop + while (first != last) { + *result = op(*result, *first++); + } + @endcode + */ + template <typename I, typename T, typename C> + syclTask uninitialized_reduce(I first, I last, T* result, C&& op); + + // ------------------------------------------------------------------------ + // offload methods + // ------------------------------------------------------------------------ + + /** + @brief offloads the %syclFlow onto a GPU and repeatedly runs it until + the predicate becomes true + + @tparam P predicate type (a binary callable) + + @param predicate a binary predicate (returns @c true for stop) + + Repetitively executes the present %syclFlow through the given queue object + until the predicate returns @c true. + + By default, if users do not offload the %syclFlow, + the executor will offload it once. + */ + template <typename P> + void offload_until(P&& predicate); + + /** + @brief offloads the %syclFlow and executes it by the given times + + @param N number of executions + */ + void offload_n(size_t N); + + /** + @brief offloads the %syclFlow and executes it once + */ + void offload(); + + // ------------------------------------------------------------------------ + // update methods + // ------------------------------------------------------------------------ + + + /** + @brief rebinds the task to a memcpy task + + Similar to tf::syclFlow::memcpy but operates on an existing task. + */ + void memcpy(syclTask task, void* tgt, const void* src, size_t bytes); + + /** + @brief rebinds the task to a memset task + + Similar to tf::syclFlow::memset but operates on an existing task. + */ + void memset(syclTask task, void* ptr, int value, size_t bytes); + + /** + @brief rebinds the task to a fill task + + Similar to tf::syclFlow::fill but operates on an existing task. + */ + template <typename T> + void fill(syclTask task, void* ptr, const T& pattern, size_t count); + + /** + @brief rebinds the task to a copy task + + Similar to tf::syclFlow::copy but operates on an existing task. + */ + template <typename T, + std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr + > + void copy(syclTask task, T* target, const T* source, size_t count); + + /** + @brief rebinds the task to a parallel-for kernel task + + Similar to tf::syclFlow::parallel_for but operates on an existing task. + */ + template <typename...ArgsT> + void parallel_for(syclTask task, ArgsT&&... args); + + /** + @brief rebinds the task to a single-threaded kernel task + + Similar to tf::syclFlow::single_task but operates on an existing task. 
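  An illustrative rebind sketch (added for clarity; @c flow and the USM
  pointer @c ptr are assumed to exist):

  @code{.cpp}
  tf::syclTask task = flow.single_task([ptr] () { *ptr = 1; });
  flow.offload();                                   // run the graph once
  flow.single_task(task, [ptr] () { *ptr = 2; });   // rebind to a new kernel
  flow.offload();                                   // run it again with the new work
  @endcode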
+ */ + template <typename F> + void single_task(syclTask task, F&& func); + + private: + + syclFlow(Executor&, syclGraph&, sycl::queue&); + + sycl::queue& _queue; + + handle_t _handle; + + syclGraph& _graph; + + std::vector<syclNode*> _tpg; + std::queue<syclNode*> _bfs; +}; + +// constructor +inline syclFlow::syclFlow(sycl::queue& queue) : + _queue {queue}, + _handle {std::in_place_type_t<External>{}}, + _graph {std::get_if<External>(&_handle)->graph} { +} + +// Construct the syclFlow from executor (internal graph) +inline syclFlow::syclFlow(Executor& e, syclGraph& g, sycl::queue& queue) : + _queue {queue}, + _handle {std::in_place_type_t<Internal>{}, e}, + _graph {g} { +} + +// Function: empty +inline bool syclFlow::empty() const { + return _graph._nodes.empty(); +} + +// Function: num_tasks +inline size_t syclFlow::num_tasks() const { + return _graph._nodes.size(); +} + +// Procedure: dump +inline void syclFlow::dump(std::ostream& os) const { + _graph.dump(os, nullptr, ""); +} + +// Procedure: clear +inline void syclFlow::clear() { + _graph.clear(); +} + +// Function: memcpy +inline syclTask syclFlow::memcpy(void* tgt, const void* src, size_t bytes) { + return on([=](sycl::handler& h){ h.memcpy(tgt, src, bytes); }); +} + +// Function: memset +inline syclTask syclFlow::memset(void* ptr, int value, size_t bytes) { + return on([=](sycl::handler& h){ h.memset(ptr, value, bytes); }); +} + +// Function: fill +template <typename T> +syclTask syclFlow::fill(void* ptr, const T& pattern, size_t count) { + return on([=](sycl::handler& h){ h.fill(ptr, pattern, count); }); +} + +// Function: copy +template <typename T, + std::enable_if_t<!std::is_same_v<T, void>, void>* +> +syclTask syclFlow::copy(T* target, const T* source, size_t count) { + return on([=](sycl::handler& h){ h.memcpy(target, source, count*sizeof(T)); }); +} + +// Function: on +template <typename F, std::enable_if_t< + std::is_invocable_r_v<void, F, sycl::handler&>, void>* +> +syclTask syclFlow::on(F&& f) { + auto node = _graph.emplace_back(_graph, + std::in_place_type_t<syclNode::CGH>{}, std::forward<F>(f) + ); + return syclTask(node); +} + +// Function: single_task +template <typename F> +syclTask syclFlow::single_task(F&& func) { + return on([f=std::forward<F>(func)] (sycl::handler& h) { + h.single_task(f); + }); +} + +// Function: parallel_for +template <typename...ArgsT> +syclTask syclFlow::parallel_for(ArgsT&&... args) { + return on([args...] 
(sycl::handler& h) { h.parallel_for(args...); }); +} + +// Procedure: offload_until +template <typename P> +void syclFlow::offload_until(P&& predicate) { + + if(!(_graph._state & syclGraph::TOPOLOGY_CHANGED)) { + goto offload; + } + + // levelize the graph + _tpg.clear(); + + // insert the first level of nodes into the queue + for(auto& u : _graph._nodes) { + u->_level = u->_dependents.size(); + if(u->_level == 0) { + _bfs.push(u.get()); + } + } + + while(!_bfs.empty()) { + auto u = _bfs.front(); + _bfs.pop(); + _tpg.push_back(u); + for(auto v : u->_successors) { + if(--(v->_level) == 0) { + v->_level = u->_level + 1; + _bfs.push(v); + } + } + } + + offload: + + // offload the syclFlow graph + bool in_order = _queue.is_in_order(); + + while(!predicate()) { + + // traverse node in a topological order + for(auto u : _tpg) { + + switch(u->_handle.index()) { + // task type 1: command group handler + case syclNode::COMMAND_GROUP_HANDLER: + u->_event = _queue.submit([u, in_order](sycl::handler& h){ + // wait on all predecessors + if(!in_order) { + for(auto p : u->_dependents) { + h.depends_on(p->_event); + } + } + std::get_if<syclNode::CGH>(&u->_handle)->work(h); + }); + break; + } + } + + // synchronize the execution + _queue.wait(); + } + + _graph._state = syclGraph::OFFLOADED; +} + +// Procedure: offload_n +inline void syclFlow::offload_n(size_t n) { + offload_until([repeat=n] () mutable { return repeat-- == 0; }); +} + +// Procedure: offload +inline void syclFlow::offload() { + offload_until([repeat=1] () mutable { return repeat-- == 0; }); +} + +// Function: on +template <typename F, std::enable_if_t< + std::is_invocable_r_v<void, F, sycl::handler&>, void>* +> +void syclFlow::on(syclTask task, F&& f) { + std::get_if<syclNode::CGH>(&task._node->_handle)->work = + std::forward<F>(f); +} + +// Function: memcpy +inline void syclFlow::memcpy( + syclTask task, void* tgt, const void* src, size_t bytes +) { + on(task, [=](sycl::handler& h){ h.memcpy(tgt, src, bytes); }); +} + +// Function: memset +inline void syclFlow::memset( + syclTask task, void* ptr, int value, size_t bytes +) { + on(task, [=](sycl::handler& h){ h.memset(ptr, value, bytes); }); +} + +// Function: fill +template <typename T> +void syclFlow::fill( + syclTask task, void* ptr, const T& pattern, size_t count +) { + on(task, [=](sycl::handler& h){ h.fill(ptr, pattern, count); }); +} + +// Function: copy +template <typename T, + std::enable_if_t<!std::is_same_v<T, void>, void>* +> +void syclFlow::copy( + syclTask task, T* target, const T* source, size_t count +) { + on(task, [=](sycl::handler& h){ + h.memcpy(target, source, count*sizeof(T));} + ); +} + +// Function: parallel_for +template <typename...ArgsT> +void syclFlow::parallel_for(syclTask task, ArgsT&&... args) { + on(task, [args...] 
(sycl::handler& h) { h.parallel_for(args...); }); +} + +// Function: single_task +template <typename F> +void syclFlow::single_task(syclTask task, F&& func) { + on(task, [f=std::forward<F>(func)] (sycl::handler& h) { h.single_task(f); }); +} + +// ############################################################################ +// Forward declaration: FlowBuilder +// ############################################################################ + +// FlowBuilder::emplace_on +template <typename C, typename Q, std::enable_if_t<is_syclflow_task_v<C>, void>*> +Task FlowBuilder::emplace_on(C&& callable, Q&& q) { + auto n = _graph._emplace_back( + std::in_place_type_t<Node::syclFlow>{}, + [c=std::forward<C>(callable), queue=std::forward<Q>(q)] + (Executor& e, Node* p) mutable { + e._invoke_syclflow_task_entry(p, c, queue); + }, + std::make_unique<syclGraph>() + ); + return Task(n); +} + +// FlowBuilder::emplace +template <typename C, std::enable_if_t<is_syclflow_task_v<C>, void>*> +Task FlowBuilder::emplace(C&& callable) { + return emplace_on(std::forward<C>(callable), sycl::queue{}); +} + +// ############################################################################ +// Forward declaration: Executor +// ############################################################################ + +// Procedure: _invoke_syclflow_task_entry (syclFlow) +template <typename C, typename Q, + std::enable_if_t<is_syclflow_task_v<C>, void>* +> +void Executor::_invoke_syclflow_task_entry(Node* node, C&& c, Q& queue) { + + auto h = std::get_if<Node::syclFlow>(&node->_handle); + + syclGraph* g = dynamic_cast<syclGraph*>(h->graph.get()); + + g->clear(); + + syclFlow sf(*this, *g, queue); + + c(sf); + + if(!(g->_state & syclGraph::OFFLOADED)) { + sf.offload(); + } +} + +} // end of namespace tf ----------------------------------------------------- + + diff --git a/myxpcs/include/taskflow_/taskflow.hpp b/myxpcs/include/taskflow_/taskflow.hpp new file mode 100644 index 0000000..c2403f8 --- /dev/null +++ b/myxpcs/include/taskflow_/taskflow.hpp @@ -0,0 +1,69 @@ +#pragma once + +#include "core/executor.hpp" +#include "core/async.hpp" +#include "algorithm/critical.hpp" + +/** +@dir taskflow +@brief root taskflow include dir +*/ + +/** +@dir taskflow/core +@brief taskflow core include dir +*/ + +/** +@dir taskflow/algorithm +@brief taskflow algorithms include dir +*/ + +/** +@dir taskflow/cuda +@brief taskflow CUDA include dir +*/ + +/** +@file taskflow/taskflow.hpp +@brief main taskflow include file +*/ + +// TF_VERSION % 100 is the patch level +// TF_VERSION / 100 % 1000 is the minor version +// TF_VERSION / 100000 is the major version + +// current version: 3.7.0 +#define TF_VERSION 300700 + +#define TF_MAJOR_VERSION TF_VERSION/100000 +#define TF_MINOR_VERSION TF_VERSION/100%1000 +#define TF_PATCH_VERSION TF_VERSION%100 + +/** +@brief taskflow namespace +*/ +namespace tf { + +/** +@private +*/ +namespace detail { } + + +/** +@brief queries the version information in a string format @c major.minor.patch + +Release notes are available here: https://taskflow.github.io/taskflow/Releases.html +*/ +constexpr const char* version() { + return "3.7.0"; +} + + +} // end of namespace tf ----------------------------------------------------- + + + + + diff --git a/myxpcs/include/taskflow_/utility/iterator.hpp b/myxpcs/include/taskflow_/utility/iterator.hpp new file mode 100644 index 0000000..8636a3b --- /dev/null +++ b/myxpcs/include/taskflow_/utility/iterator.hpp @@ -0,0 +1,22 @@ +#pragma once + +#include <cstddef> +#include <type_traits> + 
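// The two helpers below validate and measure integral index ranges of the
// form [beg, end) with a positive or negative step.  Illustrative values
// (added for clarity, not part of the original file):
//
//   is_range_invalid(0, 10,  3)  -> false   // visits 0, 3, 6, 9
//   is_range_invalid(0, 10, -1)  -> true    // ascending range, negative step
//   distance(0, 10,  3)          -> 4       // four steps: 0, 3, 6, 9
//   distance(10, 0, -3)          -> 4       // four steps: 10, 7, 4, 1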
+namespace tf { + +template <typename T> +constexpr std::enable_if_t<std::is_integral<std::decay_t<T>>::value, bool> +is_range_invalid(T beg, T end, T step) { + return ((step == 0 && beg != end) || + (beg < end && step <= 0) || // positive range + (beg > end && step >= 0)); // negative range +} + +template <typename T> +constexpr std::enable_if_t<std::is_integral<std::decay_t<T>>::value, size_t> +distance(T beg, T end, T step) { + return (end - beg + step + (step > 0 ? -1 : 1)) / step; +} + +} // end of namespace tf ----------------------------------------------------- diff --git a/myxpcs/include/taskflow_/utility/macros.hpp b/myxpcs/include/taskflow_/utility/macros.hpp new file mode 100644 index 0000000..e7598cf --- /dev/null +++ b/myxpcs/include/taskflow_/utility/macros.hpp @@ -0,0 +1,17 @@ +#pragma once + +#if defined(_MSC_VER) + #define TF_FORCE_INLINE __forceinline +#elif defined(__GNUC__) && __GNUC__ > 3 + #define TF_FORCE_INLINE __attribute__((__always_inline__)) inline +#else + #define TF_FORCE_INLINE inline +#endif + +#if defined(_MSC_VER) + #define TF_NO_INLINE __declspec(noinline) +#elif defined(__GNUC__) && __GNUC__ > 3 + #define TF_NO_INLINE __attribute__((__noinline__)) +#else + #define TF_NO_INLINE +#endif diff --git a/myxpcs/include/taskflow_/utility/math.hpp b/myxpcs/include/taskflow_/utility/math.hpp new file mode 100644 index 0000000..f80053e --- /dev/null +++ b/myxpcs/include/taskflow_/utility/math.hpp @@ -0,0 +1,151 @@ +#pragma once + +#include <atomic> + +namespace tf { + +// rounds the given 64-bit unsigned integer to the nearest power of 2 +template <typename T, std::enable_if_t< + (std::is_unsigned_v<std::decay_t<T>> && sizeof(T) == 8) , void +>* = nullptr> +constexpr T next_pow2(T x) { + if(x == 0) return 1; + x--; + x |= x>>1; + x |= x>>2; + x |= x>>4; + x |= x>>8; + x |= x>>16; + x |= x>>32; + x++; + return x; +} + +// rounds the given 32-bit unsigned integer to the nearest power of 2 +template <typename T, std::enable_if_t< + (std::is_unsigned_v<std::decay_t<T>> && sizeof(T) == 4), void +>* = nullptr> +constexpr T next_pow2(T x) { + if(x == 0) return 1; + x--; + x |= x>>1; + x |= x>>2; + x |= x>>4; + x |= x>>8; + x |= x>>16; + x++; + return x; +} + +// checks if the given number if a power of 2 +template <typename T, std::enable_if_t< + std::is_integral_v<std::decay_t<T>>, void>* = nullptr +> +constexpr bool is_pow2(const T& x) { + return x && (!(x&(x-1))); +} + +//// finds the ceil of x divided by b +//template <typename T, std::enable_if_t< +// std::is_integral_v<std::decay_t<T>>, void>* = nullptr +//> +//constexpr T ceil(const T& x, const T& y) { +// //return (x + y - 1) / y; +// return (x-1) / y + 1; +//} + +/** +@brief returns floor(log2(n)), assumes n > 0 +*/ +template<typename T> +constexpr int log2(T n) { + int log = 0; + while (n >>= 1) { + ++log; + } + return log; +} + +/** +@brief finds the median of three numbers of dereferenced iterators using + the given comparator +*/ +template <typename RandItr, typename C> +RandItr median_of_three(RandItr l, RandItr m, RandItr r, C cmp) { + return cmp(*l, *m) ? (cmp(*m, *r) ? m : (cmp(*l, *r) ? r : l )) + : (cmp(*r, *m) ? m : (cmp(*r, *l) ? 
r : l )); +} + +/** +@brief finds the pseudo median of a range of items using spreaded + nine numbers + */ +template <typename RandItr, typename C> +RandItr pseudo_median_of_nine(RandItr beg, RandItr end, C cmp) { + size_t N = std::distance(beg, end); + size_t offset = N >> 3; + return median_of_three( + median_of_three(beg, beg+offset, beg+(offset*2), cmp), + median_of_three(beg+(offset*3), beg+(offset*4), beg+(offset*5), cmp), + median_of_three(beg+(offset*6), beg+(offset*7), end-1, cmp), + cmp + ); +} + +/** +@brief sorts two elements of dereferenced iterators using the given + comparison function +*/ +template<typename Iter, typename Compare> +void sort2(Iter a, Iter b, Compare comp) { + if (comp(*b, *a)) std::iter_swap(a, b); +} + +/** +@brief sorts three elements of dereferenced iterators using the given + comparison function +*/ +template<typename Iter, typename Compare> +void sort3(Iter a, Iter b, Iter c, Compare comp) { + sort2(a, b, comp); + sort2(b, c, comp); + sort2(a, b, comp); +} + +/** +@brief generates a program-wise unique id of the give type (thread-safe) +*/ +template <typename T, std::enable_if_t<std::is_integral_v<T>, void>* = nullptr> +T unique_id() { + static std::atomic<T> counter{0}; + return counter.fetch_add(1, std::memory_order_relaxed); +} + +/** +@brief updates an atomic variable with a maximum value +*/ +template <typename T> +inline void atomic_max(std::atomic<T>& v, const T& max_v) noexcept { + T prev = v.load(std::memory_order_relaxed); + while(prev < max_v && + !v.compare_exchange_weak(prev, max_v, std::memory_order_relaxed, + std::memory_order_relaxed)) { + } +} + +/** +@brief updates an atomic variable with a minimum value +*/ +template <typename T> +inline void atomic_min(std::atomic<T>& v, const T& min_v) noexcept { + T prev = v.load(std::memory_order_relaxed); + while(prev > min_v && + !v.compare_exchange_weak(prev, min_v, std::memory_order_relaxed, + std::memory_order_relaxed)) { + } +} + +} // end of namespace tf ----------------------------------------------------- + + + diff --git a/myxpcs/include/taskflow_/utility/object_pool.hpp b/myxpcs/include/taskflow_/utility/object_pool.hpp new file mode 100644 index 0000000..34d60fb --- /dev/null +++ b/myxpcs/include/taskflow_/utility/object_pool.hpp @@ -0,0 +1,778 @@ +// 2020/03/13 - modified by Tsung-Wei Huang +// - fixed bug in aligning memory +// +// 2020/02/02 - modified by Tsung-Wei Huang +// - new implementation motivated by Hoard +// +// 2019/07/10 - modified by Tsung-Wei Huang +// - replace raw pointer with smart pointer +// +// 2019/06/13 - created by Tsung-Wei Huang +// - implemented an object pool class + +#pragma once + +#include <thread> +#include <atomic> +#include <mutex> +#include <vector> +#include <cassert> +#include <cstddef> + +namespace tf { + +#define TF_ENABLE_POOLABLE_ON_THIS \ + template <typename T, size_t S> friend class ObjectPool; \ + void* _object_pool_block + +// Class: ObjectPool +// +// The class implements an efficient thread-safe object pool motivated +// by the Hoard memory allocator algorithm. +// Different from the normal memory allocator, object pool allocates +// only one object at a time. 
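//
// A minimal usage sketch (added for illustration; the type Foo and its field
// are hypothetical).  A poolable type embeds TF_ENABLE_POOLABLE_ON_THIS so
// the pool can record which block each object was taken from:
//
//   struct Foo {
//     TF_ENABLE_POOLABLE_ON_THIS;
//     int value;
//     explicit Foo(int v) : value{v} {}
//   };
//
//   tf::ObjectPool<Foo> pool;
//   Foo* f = pool.animate(42);   // constructs a Foo(42) in pooled storage
//   pool.recycle(f);             // destroys *f and returns its slot to the pool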
+// +// Internall, we use the following variables to maintain blocks and heaps: +// X: size in byte of a item slot +// M: number of items per block +// F: emptiness threshold +// B: number of bins per local heap (bin[B-1] is the full list) +// W: number of items per bin +// K: shrinkness constant +// +// Example scenario 1: +// M = 30 +// F = 4 +// W = (30+4-1)/4 = 8 +// +// b0: 0, 1, 2, 3, 4, 5, 6, 7 +// b1: 8, 9, 10, 11, 12, 13, 14, 15 +// b2: 16, 17, 18, 19, 20, 21, 22, 23 +// b3: 24, 25, 26, 27, 28, 29 +// b4: 30 (anything equal to M) +// +// Example scenario 2: +// M = 32 +// F = 4 +// W = (32+4-1)/4 = 8 +// b0: 0, 1, 2, 3, 4, 5, 6, 7 +// b1: 8, 9, 10, 11, 12, 13, 14, 15 +// b2: 16, 17, 18, 19, 20, 21, 22, 23 +// b3: 24, 25, 26, 27, 28, 29, 30, 31 +// b4: 32 (anything equal to M) +// +template <typename T, size_t S = 65536> +class ObjectPool { + + // the data column must be sufficient to hold the pointer in freelist + constexpr static size_t X = (std::max)(sizeof(T*), sizeof(T)); + //constexpr static size_t X = sizeof(long double) + std::max(sizeof(T*), sizeof(T)); + //constexpr static size_t M = (S - offsetof(Block, data)) / X; + constexpr static size_t M = S / X; + constexpr static size_t F = 4; + constexpr static size_t B = F + 1; + constexpr static size_t W = (M + F - 1) / F; + constexpr static size_t K = 4; + + static_assert( + S && (!(S & (S-1))), "block size S must be a power of two" + ); + + static_assert( + M >= 128, "block size S must be larger enough to pool at least 128 objects" + ); + + struct Blocklist { + Blocklist* prev; + Blocklist* next; + }; + + struct GlobalHeap { + std::mutex mutex; + Blocklist list; + }; + + struct LocalHeap { + std::mutex mutex; + Blocklist lists[B]; + size_t u {0}; + size_t a {0}; + }; + + struct Block { + std::atomic<LocalHeap*> heap; + Blocklist list_node; + size_t i; + size_t u; + T* top; + // long double padding; + char data[S]; + }; + + public: + + /** + @brief constructs an object pool from a number of anticipated threads + */ + explicit ObjectPool(unsigned = std::thread::hardware_concurrency()); + + /** + @brief destructs the object pool + */ + ~ObjectPool(); + + /** + @brief acquires a pointer to a object constructed from a given argument list + */ + template <typename... ArgsT> + T* animate(ArgsT&&... 
args); + + /** + @brief recycles a object pointed by @c ptr and destroys it + */ + void recycle(T* ptr); + + size_t num_bins_per_local_heap() const; + size_t num_objects_per_bin() const; + size_t num_objects_per_block() const; + size_t num_available_objects() const; + size_t num_allocated_objects() const; + size_t capacity() const; + size_t num_local_heaps() const; + size_t num_global_heaps() const; + size_t num_heaps() const; + + float emptiness_threshold() const; + + private: + + const size_t _lheap_mask; + + GlobalHeap _gheap; + + std::vector<LocalHeap> _lheaps; + + LocalHeap& _this_heap(); + + constexpr unsigned _next_pow2(unsigned n) const; + + template <class P, class Q> + constexpr size_t _offset_in_class(const Q P::*member) const; + + template <class P, class Q> + constexpr P* _parent_class_of(Q*, const Q P::*member); + + template <class P, class Q> + constexpr P* _parent_class_of(const Q*, const Q P::*member) const; + + constexpr Block* _block_of(Blocklist*); + constexpr Block* _block_of(const Blocklist*) const; + + size_t _bin(size_t) const; + + T* _allocate(Block*); + + void _deallocate(Block*, T*); + void _blocklist_init_head(Blocklist*); + void _blocklist_add_impl(Blocklist*, Blocklist*, Blocklist*); + void _blocklist_push_front(Blocklist*, Blocklist*); + void _blocklist_push_back(Blocklist*, Blocklist*); + void _blocklist_del_impl(Blocklist*, Blocklist*); + void _blocklist_del(Blocklist*); + void _blocklist_replace(Blocklist*, Blocklist*); + void _blocklist_move_front(Blocklist*, Blocklist*); + void _blocklist_move_back(Blocklist*, Blocklist*); + bool _blocklist_is_first(const Blocklist*, const Blocklist*); + bool _blocklist_is_last(const Blocklist*, const Blocklist*); + bool _blocklist_is_empty(const Blocklist*); + bool _blocklist_is_singular(const Blocklist*); + + template <typename C> + void _for_each_block_safe(Blocklist*, C&&); + + template <typename C> + void _for_each_block(Blocklist*, C&&); + +}; + +// ---------------------------------------------------------------------------- +// ObjectPool definition +// ---------------------------------------------------------------------------- + +// Constructor +template <typename T, size_t S> +ObjectPool<T, S>::ObjectPool(unsigned t) : + //_heap_mask {(_next_pow2(t) << 1) - 1u}, + //_heap_mask { _next_pow2(t<<1) - 1u }, + //_heap_mask {(t << 1) - 1}, + _lheap_mask { _next_pow2((t+1) << 1) - 1 }, + _lheaps { _lheap_mask + 1 } { + + _blocklist_init_head(&_gheap.list); + + for(auto& h : _lheaps) { + for(size_t i=0; i<B; ++i) { + _blocklist_init_head(&h.lists[i]); + } + } +} + +// Destructor +template <typename T, size_t S> +ObjectPool<T, S>::~ObjectPool() { + + // clear local heaps + for(auto& h : _lheaps) { + for(size_t i=0; i<B; ++i) { + _for_each_block_safe(&h.lists[i], [] (Block* b) { + //std::free(b); + delete b; + }); + } + } + + // clear global heap + _for_each_block_safe(&_gheap.list, [] (Block* b) { + //std::free(b); + delete b; + }); +} + +// Function: num_bins_per_local_heap +template <typename T, size_t S> +size_t ObjectPool<T, S>::num_bins_per_local_heap() const { + return B; +} + +// Function: num_objects_per_bin +template <typename T, size_t S> +size_t ObjectPool<T, S>::num_objects_per_bin() const { + return W; +} + +// Function: num_objects_per_block +template <typename T, size_t S> +size_t ObjectPool<T, S>::num_objects_per_block() const { + return M; +} + +// Function: emptiness_threshold +template <typename T, size_t S> +float ObjectPool<T, S>::emptiness_threshold() const { + return 1.0f/F; +} + +// Function: 
num_global_heaps +template <typename T, size_t S> +size_t ObjectPool<T, S>::num_global_heaps() const { + return 1; +} + +// Function: num_lheaps +template <typename T, size_t S> +size_t ObjectPool<T, S>::num_local_heaps() const { + return _lheaps.size(); +} + +// Function: num_heaps +template <typename T, size_t S> +size_t ObjectPool<T, S>::num_heaps() const { + return _lheaps.size() + 1; +} + +// Function: capacity +template <typename T, size_t S> +size_t ObjectPool<T, S>::capacity() const { + + size_t n = 0; + + // global heap + for(auto p=_gheap.list.next; p!=&_gheap.list; p=p->next) { + n += M; + }; + + // local heap + for(auto& h : _lheaps) { + n += h.a; + } + + return n; +} + +// Function: num_available_objects +template <typename T, size_t S> +size_t ObjectPool<T, S>::num_available_objects() const { + + size_t n = 0; + + // global heap + for(auto p=_gheap.list.next; p!=&_gheap.list; p=p->next) { + n += (M - _block_of(p)->u); + }; + + // local heap + for(auto& h : _lheaps) { + n += (h.a - h.u); + } + return n; +} + +// Function: num_allocated_objects +template <typename T, size_t S> +size_t ObjectPool<T, S>::num_allocated_objects() const { + + size_t n = 0; + + // global heap + for(auto p=_gheap.list.next; p!=&_gheap.list; p=p->next) { + n += _block_of(p)->u; + }; + + // local heap + for(auto& h : _lheaps) { + n += h.u; + } + return n; +} + +// Function: _bin +template <typename T, size_t S> +size_t ObjectPool<T, S>::_bin(size_t u) const { + return u == M ? F : u/W; +} + +// Function: _offset_in_class +template <typename T, size_t S> +template <class P, class Q> +constexpr size_t ObjectPool<T, S>::_offset_in_class( + const Q P::*member) const { + return (size_t) &( reinterpret_cast<P*>(0)->*member); +} + +// C macro: parent_class_of(list_pointer, Block, list) +// C++: parent_class_of(list_pointer, &Block::list) +template <typename T, size_t S> +template <class P, class Q> +constexpr P* ObjectPool<T, S>::_parent_class_of( + Q* ptr, const Q P::*member +) { + return (P*)( (char*)ptr - _offset_in_class(member)); +} + +// Function: _parent_class_of +template <typename T, size_t S> +template <class P, class Q> +constexpr P* ObjectPool<T, S>::_parent_class_of( + const Q* ptr, const Q P::*member +) const { + return (P*)( (char*)ptr - _offset_in_class(member)); +} + +// Function: _block_of +template <typename T, size_t S> +constexpr typename ObjectPool<T, S>::Block* +ObjectPool<T, S>::_block_of(Blocklist* list) { + return _parent_class_of(list, &Block::list_node); +} + +// Function: _block_of +template <typename T, size_t S> +constexpr typename ObjectPool<T, S>::Block* +ObjectPool<T, S>::_block_of(const Blocklist* list) const { + return _parent_class_of(list, &Block::list_node); +} + +// Procedure: initialize a list head +template <typename T, size_t S> +void ObjectPool<T, S>::_blocklist_init_head(Blocklist *list) { + list->next = list; + list->prev = list; +} + +// Procedure: _blocklist_add_impl +// Insert a new entry between two known consecutive entries. +// +// This is only for internal list manipulation where we know +// the prev/next entries already! +template <typename T, size_t S> +void ObjectPool<T, S>::_blocklist_add_impl( + Blocklist *curr, Blocklist *prev, Blocklist *next +) { + next->prev = curr; + curr->next = next; + curr->prev = prev; + prev->next = curr; +} + +// list_push_front - add a new entry +// @curr: curr entry to be added +// @head: list head to add it after +// +// Insert a new entry after the specified head. +// This is good for implementing stacks. 
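//
// Illustrative state transitions (added for clarity): after
// _blocklist_init_head(&head) the list is circular and empty, i.e.
// head.next == head.prev == &head.  Pushing a node n to the front yields
// head.next == &n, n.prev == &head, n.next == &head, head.prev == &n, so
// _blocklist_is_empty(&head) becomes false and
// _blocklist_is_singular(&head) becomes true.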
+// +template <typename T, size_t S> +void ObjectPool<T, S>::_blocklist_push_front( + Blocklist *curr, Blocklist *head +) { + _blocklist_add_impl(curr, head, head->next); +} + +// list_add_tail - add a new entry +// @curr: curr entry to be added +// @head: list head to add it before +// +// Insert a new entry before the specified head. +// This is useful for implementing queues. +// +template <typename T, size_t S> +void ObjectPool<T, S>::_blocklist_push_back( + Blocklist *curr, Blocklist *head +) { + _blocklist_add_impl(curr, head->prev, head); +} + +// Delete a list entry by making the prev/next entries +// point to each other. +// +// This is only for internal list manipulation where we know +// the prev/next entries already! +// +template <typename T, size_t S> +void ObjectPool<T, S>::_blocklist_del_impl( + Blocklist * prev, Blocklist * next +) { + next->prev = prev; + prev->next = next; +} + +// _blocklist_del - deletes entry from list. +// @entry: the element to delete from the list. +// Note: list_empty() on entry does not return true after this, the entry is +// in an undefined state. +template <typename T, size_t S> +void ObjectPool<T, S>::_blocklist_del(Blocklist *entry) { + _blocklist_del_impl(entry->prev, entry->next); + entry->next = nullptr; + entry->prev = nullptr; +} + +// list_replace - replace old entry by new one +// @old : the element to be replaced +// @curr : the new element to insert +// +// If @old was empty, it will be overwritten. +template <typename T, size_t S> +void ObjectPool<T, S>::_blocklist_replace( + Blocklist *old, Blocklist *curr +) { + curr->next = old->next; + curr->next->prev = curr; + curr->prev = old->prev; + curr->prev->next = curr; +} + +// list_move - delete from one list and add as another's head +// @list: the entry to move +// @head: the head that will precede our entry +template <typename T, size_t S> +void ObjectPool<T, S>::_blocklist_move_front( + Blocklist *list, Blocklist *head +) { + _blocklist_del_impl(list->prev, list->next); + _blocklist_push_front(list, head); +} + +// list_move_tail - delete from one list and add as another's tail +// @list: the entry to move +// @head: the head that will follow our entry +template <typename T, size_t S> +void ObjectPool<T, S>::_blocklist_move_back( + Blocklist *list, Blocklist *head +) { + _blocklist_del_impl(list->prev, list->next); + _blocklist_push_back(list, head); +} + +// list_is_first - tests whether @list is the last entry in list @head +// @list: the entry to test +// @head: the head of the list +template <typename T, size_t S> +bool ObjectPool<T, S>::_blocklist_is_first( + const Blocklist *list, const Blocklist *head +) { + return list->prev == head; +} + +// list_is_last - tests whether @list is the last entry in list @head +// @list: the entry to test +// @head: the head of the list +template <typename T, size_t S> +bool ObjectPool<T, S>::_blocklist_is_last( + const Blocklist *list, const Blocklist *head +) { + return list->next == head; +} + +// list_empty - tests whether a list is empty +// @head: the list to test. +template <typename T, size_t S> +bool ObjectPool<T, S>::_blocklist_is_empty(const Blocklist *head) { + return head->next == head; +} + +// list_is_singular - tests whether a list has just one entry. +// @head: the list to test. 
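//
// Note (added for clarity): these per-bin lists are keyed by a block's usage
// count u through _bin(u) = (u == M) ? F : u / W.  With the first example
// scenario in the header (M = 30, F = 4, W = 8), a block whose usage grows
// from 7 to 8 migrates from bin 0 to bin 1, while a completely full block
// (u == 30) lives in the dedicated full list, bin 4.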
+template <typename T, size_t S> +bool ObjectPool<T, S>::_blocklist_is_singular( + const Blocklist *head +) { + return !_blocklist_is_empty(head) && (head->next == head->prev); +} + +// Procedure: _for_each_block +template <typename T, size_t S> +template <typename C> +void ObjectPool<T, S>::_for_each_block(Blocklist* head, C&& c) { + Blocklist* p; + for(p=head->next; p!=head; p=p->next) { + c(_block_of(p)); + } +} + +// Procedure: _for_each_block_safe +// Iterate each item of a list - safe to free +template <typename T, size_t S> +template <typename C> +void ObjectPool<T, S>::_for_each_block_safe(Blocklist* head, C&& c) { + Blocklist* p; + Blocklist* t; + for(p=head->next, t=p->next; p!=head; p=t, t=p->next) { + c(_block_of(p)); + } +} + +// Function: _allocate +// allocate a spot from the block +template <typename T, size_t S> +T* ObjectPool<T, S>::_allocate(Block* s) { + if(s->top == nullptr) { + return reinterpret_cast<T*>(s->data + s->i++ * X); + } + else { + T* retval = s->top; + s->top = *(reinterpret_cast<T**>(s->top)); + return retval; + } +} + +// Procedure: _deallocate +template <typename T, size_t S> +void ObjectPool<T, S>::_deallocate(Block* s, T* ptr) { + *(reinterpret_cast<T**>(ptr)) = s->top; + s->top = ptr; +} + +// Function: allocate +template <typename T, size_t S> +template <typename... ArgsT> +T* ObjectPool<T, S>::animate(ArgsT&&... args) { + + //std::cout << "construct a new item\n"; + + // my logically mapped heap + LocalHeap& h = _this_heap(); + + Block* s {nullptr}; + + h.mutex.lock(); + + // scan the list of superblocks from the most full to the least full + int f = static_cast<int>(F-1); + for(; f>=0; f--) { + if(!_blocklist_is_empty(&h.lists[f])) { + s = _block_of(h.lists[f].next); + break; + } + } + + // no superblock found + if(f == -1) { + + // check heap 0 for a superblock + _gheap.mutex.lock(); + if(!_blocklist_is_empty(&_gheap.list)) { + + s = _block_of(_gheap.list.next); + + //printf("get a superblock from global heap %lu\n", s->u); + assert(s->u < M && s->heap == nullptr); + f = static_cast<int>(_bin(s->u + 1)); + + _blocklist_move_front(&s->list_node, &h.lists[f]); + + s->heap = &h; // must be within the global heap lock + _gheap.mutex.unlock(); + + h.u = h.u + s->u; + h.a = h.a + M; + } + // create a new block + else { + //printf("create a new superblock\n"); + _gheap.mutex.unlock(); + f = 0; + //s = static_cast<Block*>(std::malloc(sizeof(Block))); + s = new Block(); + + if(s == nullptr) { + throw std::bad_alloc(); + } + + s->heap = &h; + s->i = 0; + s->u = 0; + s->top = nullptr; + + _blocklist_push_front(&s->list_node, &h.lists[f]); + + h.a = h.a + M; + } + } + + // the superblock must have at least one space + //assert(s->u < M); + //printf("%lu %lu %lu\n", h.u, h.a, s->u); + //assert(h.u < h.a); + + h.u = h.u + 1; + s->u = s->u + 1; + + // take one item from the superblock + T* mem = _allocate(s); + + int b = static_cast<int>(_bin(s->u)); + + if(b != f) { + //printf("move superblock from list[%d] to list[%d]\n", f, b); + _blocklist_move_front(&s->list_node, &h.lists[b]); + } + + //std::cout << "s.i " << s->i << '\n' + // << "s.u " << s->u << '\n' + // << "h.u " << h.u << '\n' + // << "h.a " << h.a << '\n'; + + h.mutex.unlock(); + + //printf("allocate %p (s=%p)\n", mem, s); + + new (mem) T(std::forward<ArgsT>(args)...); + + mem->_object_pool_block = s; + + return mem; +} + +// Function: destruct +template <typename T, size_t S> +void ObjectPool<T, S>::recycle(T* mem) { + + //Block* s = *reinterpret_cast<Block**>( + // reinterpret_cast<char*>(mem) - 
sizeof(Block**) + //); + + //Block* s= *(reinterpret_cast<Block**>(mem) - O); // (mem) - 1 + + Block* s = static_cast<Block*>(mem->_object_pool_block); + + mem->~T(); + + //printf("deallocate %p (s=%p) M=%lu W=%lu X=%lu\n", mem, s, M, W, X); + + // here we need a loop because when we lock the heap, + // other threads may have removed the superblock to another heap + bool sync = false; + + do { + LocalHeap* h = s->heap.load(std::memory_order_relaxed); + + // the block is in global heap + if(h == nullptr) { + std::lock_guard<std::mutex> glock(_gheap.mutex); + if(s->heap == h) { + sync = true; + _deallocate(s, mem); + s->u = s->u - 1; + } + } + else { + std::lock_guard<std::mutex> llock(h->mutex); + if(s->heap == h) { + sync = true; + // deallocate the item from the superblock + size_t f = _bin(s->u); + _deallocate(s, mem); + s->u = s->u - 1; + h->u = h->u - 1; + + size_t b = _bin(s->u); + + if(b != f) { + //printf("move superblock from list[%d] to list[%d]\n", f, b); + _blocklist_move_front(&s->list_node, &h->lists[b]); + } + + // transfer a mostly-empty superblock to global heap + if((h->u + K*M < h->a) && (h->u < ((F-1) * h->a / F))) { + for(size_t i=0; i<F; i++) { + if(!_blocklist_is_empty(&h->lists[i])) { + Block* x = _block_of(h->lists[i].next); + //printf("transfer a block (x.u=%lu/x.i=%lu) to the global heap\n", x->u, x->i); + assert(h->u > x->u && h->a > M); + h->u = h->u - x->u; + h->a = h->a - M; + x->heap = nullptr; + std::lock_guard<std::mutex> glock(_gheap.mutex); + _blocklist_move_front(&x->list_node, &_gheap.list); + break; + } + } + } + } + } + } while(!sync); + + //std::cout << "s.i " << s->i << '\n' + // << "s.u " << s->u << '\n'; +} + +// Function: _this_heap +template <typename T, size_t S> +typename ObjectPool<T, S>::LocalHeap& +ObjectPool<T, S>::_this_heap() { + // here we don't use thread local since object pool might be + // created and destroyed multiple times + //thread_local auto hv = std::hash<std::thread::id>()(std::this_thread::get_id()); + //return _lheaps[hv & _lheap_mask]; + + return _lheaps[ + std::hash<std::thread::id>()(std::this_thread::get_id()) & _lheap_mask + ]; +} + +// Function: _next_pow2 +template <typename T, size_t S> +constexpr unsigned ObjectPool<T, S>::_next_pow2(unsigned n) const { + if(n == 0) return 1; + n--; + n |= n >> 1; + n |= n >> 2; + n |= n >> 4; + n |= n >> 8; + n |= n >> 16; + n++; + return n; +} + +} // end namespace tf -------------------------------------------------------- diff --git a/myxpcs/include/taskflow_/utility/os.hpp b/myxpcs/include/taskflow_/utility/os.hpp new file mode 100644 index 0000000..23ac301 --- /dev/null +++ b/myxpcs/include/taskflow_/utility/os.hpp @@ -0,0 +1,196 @@ +#pragma once + +#include <cstdlib> +#include <cstdio> +#include <string> + +#define TF_OS_LINUX 0 +#define TF_OS_DRAGONFLY 0 +#define TF_OS_FREEBSD 0 +#define TF_OS_NETBSD 0 +#define TF_OS_OPENBSD 0 +#define TF_OS_DARWIN 0 +#define TF_OS_WINDOWS 0 +#define TF_OS_CNK 0 +#define TF_OS_HURD 0 +#define TF_OS_SOLARIS 0 +#define TF_OS_UNIX 0 + +#ifdef _WIN32 +#undef TF_OS_WINDOWS +#define TF_OS_WINDOWS 1 +#endif + +#ifdef __CYGWIN__ +#undef TF_OS_WINDOWS +#define TF_OS_WINDOWS 1 +#endif + +#if (defined __APPLE__ && defined __MACH__) +#undef TF_OS_DARWIN +#define TF_OS_DARWIN 1 +#endif + +// in some ppc64 linux installations, only the second condition is met +#if (defined __linux) +#undef TF_OS_LINUX +#define TF_OS_LINUX 1 +#elif (defined __linux__) +#undef TF_OS_LINUX +#define TF_OS_LINUX 1 +#else +#endif + +#if (defined __DragonFly__) +#undef 
TF_OS_DRAGONFLY +#define TF_OS_DRAGONFLY 1 +#endif + +#if (defined __FreeBSD__) +#undef TF_OS_FREEBSD +#define TF_OS_FREEBSD 1 +#endif + +#if (defined __NetBSD__) +#undef TF_OS_NETBSD +#define TF_OS_NETBSD 1 +#endif + +#if (defined __OpenBSD__) +#undef TF_OS_OPENBSD +#define TF_OS_OPENBSD 1 +#endif + +#if (defined __bgq__) +#undef TF_OS_CNK +#define TF_OS_CNK 1 +#endif + +#if (defined __GNU__) +#undef TF_OS_HURD +#define TF_OS_HURD 1 +#endif + +#if (defined __sun) +#undef TF_OS_SOLARIS +#define TF_OS_SOLARIS 1 +#endif + +#if (1 != \ + TF_OS_LINUX + TF_OS_DRAGONFLY + TF_OS_FREEBSD + TF_OS_NETBSD + \ + TF_OS_OPENBSD + TF_OS_DARWIN + TF_OS_WINDOWS + TF_OS_HURD + \ + TF_OS_SOLARIS) +#define TF_OS_UNKNOWN 1 +#endif + +#if TF_OS_LINUX || TF_OS_DRAGONFLY || TF_OS_FREEBSD || TF_OS_NETBSD || \ + TF_OS_OPENBSD || TF_OS_DARWIN || TF_OS_HURD || TF_OS_SOLARIS +#undef TF_OS_UNIX +#define TF_OS_UNIX 1 +#endif + + +//----------------------------------------------------------------------------- +// Cache line alignment +//----------------------------------------------------------------------------- +#if defined(__i386__) || defined(__x86_64__) + #define TF_CACHELINE_SIZE 64 +#elif defined(__powerpc64__) + // TODO + // This is the L1 D-cache line size of our Power7 machines. + // Need to check if this is appropriate for other PowerPC64 systems. + #define TF_CACHELINE_SIZE 128 +#elif defined(__arm__) + // Cache line sizes for ARM: These values are not strictly correct since + // cache line sizes depend on implementations, not architectures. + // There are even implementations with cache line sizes configurable + // at boot time. + #if defined(__ARM_ARCH_5T__) + #define TF_CACHELINE_SIZE 32 + #elif defined(__ARM_ARCH_7A__) + #define TF_CACHELINE_SIZE 64 + #endif +#endif + +#ifndef TF_CACHELINE_SIZE +// A reasonable default guess. Note that overestimates tend to waste more +// space, while underestimates tend to waste more time. + #define TF_CACHELINE_SIZE 64 +#endif + + + +//----------------------------------------------------------------------------- +// pause +//----------------------------------------------------------------------------- +//#if __has_include (<immintrin.h>) +// #define TF_HAS_MM_PAUSE 1 +// #include <immintrin.h> +//#endif + +namespace tf { + +// Struct: CachelineAligned +// Due to prefetch, we typically do 2x cacheline for the alignment. +template <typename T> +struct CachelineAligned { + alignas (2*TF_CACHELINE_SIZE) T data; +}; + +// Function: get_env +inline std::string get_env(const std::string& str) { +#ifdef _MSC_VER + char *ptr = nullptr; + size_t len = 0; + + if(_dupenv_s(&ptr, &len, str.c_str()) == 0 && ptr != nullptr) { + std::string res(ptr, len); + std::free(ptr); + return res; + } + return ""; + +#else + auto ptr = std::getenv(str.c_str()); + return ptr ? ptr : ""; +#endif +} + +// Function: has_env +inline bool has_env(const std::string& str) { +#ifdef _MSC_VER + char *ptr = nullptr; + size_t len = 0; + + if(_dupenv_s(&ptr, &len, str.c_str()) == 0 && ptr != nullptr) { + std::string res(ptr, len); + std::free(ptr); + return true; + } + return false; + +#else + auto ptr = std::getenv(str.c_str()); + return ptr ? 
true : false; +#endif +} + +// Procedure: relax_cpu +//inline void relax_cpu() { +//#ifdef TF_HAS_MM_PAUSE +// _mm_pause(); +//#endif +//} + + + +} // end of namespace tf ----------------------------------------------------- + + + + + + + + + diff --git a/myxpcs/include/taskflow_/utility/serializer.hpp b/myxpcs/include/taskflow_/utility/serializer.hpp new file mode 100644 index 0000000..aab00f2 --- /dev/null +++ b/myxpcs/include/taskflow_/utility/serializer.hpp @@ -0,0 +1,1135 @@ +#pragma once + +#include <type_traits> +#include <iterator> +#include <iostream> +#include <fstream> +#include <stack> +#include <queue> +#include <vector> +#include <algorithm> +#include <memory> +#include <functional> +#include <map> +#include <set> +#include <unordered_map> +#include <unordered_set> +#include <sstream> +#include <list> +#include <forward_list> +#include <numeric> +#include <iomanip> +#include <cassert> +#include <cmath> +#include <array> +#include <string> +#include <variant> +#include <optional> + +namespace tf { + +// ---------------------------------------------------------------------------- +// Supported C++ STL type +// ---------------------------------------------------------------------------- + +// std::basic_string +template <typename T> +struct is_std_basic_string : std::false_type {}; + +template <typename... ArgsT> +struct is_std_basic_string <std::basic_string<ArgsT...>> : std::true_type {}; + +template <typename T> +constexpr bool is_std_basic_string_v = is_std_basic_string<T>::value; + +// std::array +template <typename T> +struct is_std_array : std::false_type {}; + +template <typename T, size_t N> +struct is_std_array <std::array<T, N>> : std::true_type {}; + +template <typename T> +constexpr bool is_std_array_v = is_std_array<T>::value; + +// std::vector +template <typename T> +struct is_std_vector : std::false_type {}; + +template <typename... ArgsT> +struct is_std_vector <std::vector<ArgsT...>> : std::true_type {}; + +template <typename T> +constexpr bool is_std_vector_v = is_std_vector<T>::value; + +// std::deque +template <typename T> +struct is_std_deque : std::false_type {}; + +template <typename... ArgsT> +struct is_std_deque <std::deque<ArgsT...>> : std::true_type {}; + +template <typename T> +constexpr bool is_std_deque_v = is_std_deque<T>::value; + +// std::list +template <typename T> +struct is_std_list : std::false_type {}; + +template <typename... ArgsT> +struct is_std_list <std::list<ArgsT...>> : std::true_type {}; + +template <typename T> +constexpr bool is_std_list_v = is_std_list<T>::value; + +// std::forward_list +template <typename T> +struct is_std_forward_list : std::false_type {}; + +template <typename... ArgsT> +struct is_std_forward_list <std::forward_list<ArgsT...>> : std::true_type {}; + +template <typename T> +constexpr bool is_std_forward_list_v = is_std_forward_list<T>::value; + +// std::map +template <typename T> +struct is_std_map : std::false_type {}; + +template <typename... ArgsT> +struct is_std_map <std::map<ArgsT...>> : std::true_type {}; + +template <typename T> +constexpr bool is_std_map_v = is_std_map<T>::value; + +// std::unordered_map +template <typename T> +struct is_std_unordered_map : std::false_type {}; + +template <typename... ArgsT> +struct is_std_unordered_map <std::unordered_map<ArgsT...>> : std::true_type {}; + +template <typename T> +constexpr bool is_std_unordered_map_v = is_std_unordered_map<T>::value; + +// std::set +template <typename T> +struct is_std_set : std::false_type {}; + +template <typename... 
ArgsT> +struct is_std_set <std::set<ArgsT...>> : std::true_type {}; + +template <typename T> +constexpr bool is_std_set_v = is_std_set<T>::value; + +// std::unordered_set +template <typename T> +struct is_std_unordered_set : std::false_type {}; + +template <typename... ArgsT> +struct is_std_unordered_set <std::unordered_set<ArgsT...>> : std::true_type {}; + +template <typename T> +constexpr bool is_std_unordered_set_v = is_std_unordered_set<T>::value; + +// std::variant +template <typename T> +struct is_std_variant : std::false_type {}; + +template <typename... ArgsT> +struct is_std_variant <std::variant<ArgsT...>> : std::true_type {}; + +template <typename T> +constexpr bool is_std_variant_v = is_std_variant<T>::value; + +// std::optional +template <typename T> +struct is_std_optional : std::false_type {}; + +template <typename... ArgsT> +struct is_std_optional <std::optional<ArgsT...>> : std::true_type {}; + +template <typename T> +constexpr bool is_std_optional_v = is_std_optional<T>::value; + +// std::unique_ptr +template <typename T> +struct is_std_unique_ptr : std::false_type {}; + +template <typename... ArgsT> +struct is_std_unique_ptr <std::unique_ptr<ArgsT...>> : std::true_type {}; + +template <typename T> +constexpr bool is_std_unique_ptr_v = is_std_unique_ptr<T>::value; + +// std::shared_ptr +template <typename T> +struct is_std_shared_ptr : std::false_type {}; + +template <typename... ArgsT> +struct is_std_shared_ptr <std::shared_ptr<ArgsT...>> : std::true_type {}; + +template <typename T> +constexpr bool is_std_shared_ptr_v = is_std_shared_ptr<T>::value; + +// std::duration +template <typename T> struct is_std_duration : std::false_type {}; + +template <typename... ArgsT> +struct is_std_duration<std::chrono::duration<ArgsT...>> : std::true_type {}; + +template <typename T> +constexpr bool is_std_duration_v = is_std_duration<T>::value; + +// std::time_point +template <typename T> +struct is_std_time_point : std::false_type {}; + +template <typename... ArgsT> +struct is_std_time_point<std::chrono::time_point<ArgsT...>> : std::true_type {}; + +template <typename T> +constexpr bool is_std_time_point_v = is_std_time_point<T>::value; + +// std::tuple +template <typename T> +struct is_std_tuple : std::false_type {}; + +template <typename... ArgsT> +struct is_std_tuple<std::tuple<ArgsT...>> : std::true_type {}; + +template <typename T> +constexpr bool is_std_tuple_v = is_std_tuple<T>::value; + +//----------------------------------------------------------------------------- +// Type extraction. +//----------------------------------------------------------------------------- + +// ExtractType: forward declaration +template <size_t, typename> +struct ExtractType; + +// ExtractType_t: alias interface +template <size_t idx, typename C> +using ExtractType_t = typename ExtractType<idx, C>::type; + +// ExtractType: base +template <template <typename...> typename C, typename T, typename... RestT> +struct ExtractType <0, C<T, RestT...>> { + using type = T; +}; + +// ExtractType: base +template <typename T> +struct ExtractType <0, T> { + using type = T; +}; + +// ExtractType: recursive definition. +template <size_t idx, template <typename...> typename C, typename T, typename... 
RestT> +struct ExtractType <idx, C<T, RestT...>> : ExtractType<idx-1, C<RestT...>> { +}; + +// ---------------------------------------------------------------------------- +// Size Wrapper +// ---------------------------------------------------------------------------- + +// Struct: SizeTag +// Class that wraps a given size item which can be customized. +template <typename T> +class SizeTag { + + public: + + using type = std::conditional_t<std::is_lvalue_reference_v<T>, T, std::decay_t<T>>; + + SizeTag(T&& item) : _item(std::forward<T>(item)) {} + + SizeTag& operator = (const SizeTag&) = delete; + + inline const T& get() const {return _item;} + + template <typename ArchiverT> + auto save(ArchiverT & ar) const { return ar(_item); } + + template <typename ArchiverT> + auto load(ArchiverT & ar) { return ar(_item); } + + private: + + type _item; +}; + +// Function: make_size_tag +template <typename T> +SizeTag<T> make_size_tag(T&& t) { + return { std::forward<T>(t) }; +} + +// ---------------------------------------------------------------------------- +// Size Wrapper +// ---------------------------------------------------------------------------- + +// Class: MapItem +template <typename KeyT, typename ValueT> +class MapItem { + + public: + + using KeyType = std::conditional_t <std::is_lvalue_reference_v<KeyT>, KeyT, std::decay_t<KeyT>>; + using ValueType = std::conditional_t <std::is_lvalue_reference_v<ValueT>, ValueT, std::decay_t<ValueT>>; + + MapItem(KeyT&& k, ValueT&& v) : _key(std::forward<KeyT>(k)), _value(std::forward<ValueT>(v)) {} + MapItem& operator = (const MapItem&) = delete; + + inline const KeyT& key() const { return _key; } + inline const ValueT& value() const { return _value; } + + template <typename ArchiverT> + auto save(ArchiverT & ar) const { return ar(_key, _value); } + + template <typename ArchiverT> + auto load(ArchiverT & ar) { return ar(_key, _value); } + + private: + + KeyType _key; + ValueType _value; +}; + +// Function: make_kv_pair +template <typename KeyT, typename ValueT> +MapItem<KeyT, ValueT> make_kv_pair(KeyT&& k, ValueT&& v) { + return { std::forward<KeyT>(k), std::forward<ValueT>(v) }; +} + +// ---------------------------------------------------------------------------- +// Serializer Definition +// ---------------------------------------------------------------------------- + +template <typename T> +constexpr auto is_default_serializable_v = ( + std::is_arithmetic_v<T> || + std::is_enum_v<T> || + is_std_basic_string_v<T> || + is_std_vector_v<T> || + is_std_deque_v<T> || + is_std_list_v<T> || + is_std_forward_list_v<T> || + is_std_map_v<T> || + is_std_unordered_map_v<T> || + is_std_set_v<T> || + is_std_unordered_set_v<T> || + is_std_duration_v<T> || + is_std_time_point_v<T> || + is_std_variant_v<T> || + is_std_optional_v<T> || + is_std_tuple_v<T> || + is_std_array_v<T> +); + + +// Class: Serializer +template <typename Stream, typename SizeType = std::streamsize> +class Serializer { + + public: + + Serializer(Stream& stream); + + template <typename... T> + SizeType operator()(T&&... 
items); + + private: + + Stream& _stream; + + template <typename T, + std::enable_if_t<!is_default_serializable_v<std::decay_t<T>>, void>* = nullptr + > + SizeType _save(T&&); + + template <typename T, + std::enable_if_t<std::is_arithmetic_v<std::decay_t<T>>, void>* = nullptr + > + SizeType _save(T&&); + + template <typename T, + std::enable_if_t<is_std_basic_string_v<std::decay_t<T>>, void>* = nullptr + > + SizeType _save(T&&); + + template <typename T, + std::enable_if_t<is_std_vector_v<std::decay_t<T>>, void>* = nullptr + > + SizeType _save(T&&); + + template <typename T, + std::enable_if_t< + is_std_deque_v<std::decay_t<T>> || + is_std_list_v<std::decay_t<T>>, + void + >* = nullptr + > + SizeType _save(T&&); + + template <typename T, + std::enable_if_t< + is_std_forward_list_v<std::decay_t<T>>, + void + >* = nullptr + > + SizeType _save(T&&); + + template <typename T, + std::enable_if_t< + is_std_map_v<std::decay_t<T>> || + is_std_unordered_map_v<std::decay_t<T>>, + void + >* = nullptr + > + SizeType _save(T&&); + + template <typename T, + std::enable_if_t< + is_std_set_v<std::decay_t<T>> || + is_std_unordered_set_v<std::decay_t<T>>, + void + >* = nullptr + > + SizeType _save(T&&); + + template <typename T, + std::enable_if_t<std::is_enum_v<std::decay_t<T>>, void>* = nullptr + > + SizeType _save(T&&); + + template <typename T, + std::enable_if_t<is_std_duration_v<std::decay_t<T>>, void>* = nullptr + > + SizeType _save(T&&); + + template <typename T, + std::enable_if_t<is_std_time_point_v<std::decay_t<T>>, void>* = nullptr + > + SizeType _save(T&&); + + template <typename T, + std::enable_if_t<is_std_optional_v<std::decay_t<T>>, void>* = nullptr + > + SizeType _save(T&&); + + template <typename T, + std::enable_if_t<is_std_variant_v<std::decay_t<T>>, void>* = nullptr + > + SizeType _save(T&&); + + template <typename T, + std::enable_if_t<is_std_tuple_v<std::decay_t<T>>, void>* = nullptr + > + SizeType _save(T&&); + + template <typename T, + std::enable_if_t<is_std_array_v<std::decay_t<T>>, void>* = nullptr + > + SizeType _save(T&&); + + +}; + +// Constructor +template <typename Stream, typename SizeType> +Serializer<Stream, SizeType>::Serializer(Stream& stream) : _stream(stream) { +} + +// Operator () +template <typename Stream, typename SizeType> +template <typename... T> +SizeType Serializer<Stream, SizeType>::operator() (T&&... 
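// Illustrative usage (added for clarity; assumes the Stream type provides a
// binary write(const char*, std::streamsize), as std::ostringstream does):
//
//   std::ostringstream os;
//   tf::Serializer<std::ostringstream> writer(os);
//   std::vector<int> v {1, 2, 3};
//   std::string name {"taskflow"};
//   auto n = writer(v, name);   // n = total number of bytes written to os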
items) { + return (_save(std::forward<T>(items)) + ...); +} + +// arithmetic data type +template <typename Stream, typename SizeType> +template <typename T, + std::enable_if_t<std::is_arithmetic_v<std::decay_t<T>>, void>* +> +SizeType Serializer<Stream, SizeType>::_save(T&& t) { + _stream.write(reinterpret_cast<const char*>(std::addressof(t)), sizeof(t)); + return sizeof(t); +} + +// std::basic_string +template <typename Stream, typename SizeType> +template <typename T, + std::enable_if_t<is_std_basic_string_v<std::decay_t<T>>, void>* +> +SizeType Serializer<Stream, SizeType>::_save(T&& t) { + using U = std::decay_t<T>; + auto sz = _save(make_size_tag(t.size())); + _stream.write( + reinterpret_cast<const char*>(t.data()), + t.size()*sizeof(typename U::value_type) + ); + return sz + t.size()*sizeof(typename U::value_type); +} + +// std::vector +template <typename Stream, typename SizeType> +template <typename T, + std::enable_if_t<is_std_vector_v<std::decay_t<T>>, void>* +> +SizeType Serializer<Stream, SizeType>::_save(T&& t) { + + using U = std::decay_t<T>; + + auto sz = _save(make_size_tag(t.size())); + + if constexpr (std::is_arithmetic_v<typename U::value_type>) { + _stream.write( + reinterpret_cast<const char*>(t.data()), + t.size() * sizeof(typename U::value_type) + ); + sz += t.size() * sizeof(typename U::value_type); + } else { + for(auto&& item : t) { + sz += _save(item); + } + } + + return sz; +} + +// std::list and std::deque +template <typename Stream, typename SizeType> +template <typename T, + std::enable_if_t<is_std_deque_v<std::decay_t<T>> || + is_std_list_v<std::decay_t<T>>, void>* +> +SizeType Serializer<Stream, SizeType>::_save(T&& t) { + auto sz = _save(make_size_tag(t.size())); + for(auto&& item : t) { + sz += _save(item); + } + return sz; +} + +// std::forward_list +template <typename Stream, typename SizeType> +template <typename T, + std::enable_if_t<is_std_forward_list_v<std::decay_t<T>>, void>* +> +SizeType Serializer<Stream, SizeType>::_save(T&& t) { + auto sz = _save(make_size_tag(std::distance(t.begin(), t.end()))); + for(auto&& item : t) { + sz += _save(item); + } + return sz; +} + +// std::map and std::unordered_map +template <typename Stream, typename SizeType> +template <typename T, std::enable_if_t< + is_std_map_v<std::decay_t<T>> || + is_std_unordered_map_v<std::decay_t<T>>, + void +>*> +SizeType Serializer<Stream, SizeType>::_save(T&& t) { + auto sz = _save(make_size_tag(t.size())); + for(auto&& [k, v] : t) { + sz += _save(make_kv_pair(k, v)); + } + return sz; +} + +// std::set and std::unordered_set +template <typename Stream, typename SizeType> +template <typename T, std::enable_if_t< + is_std_set_v<std::decay_t<T>> || + is_std_unordered_set_v<std::decay_t<T>>, + void +>*> +SizeType Serializer<Stream, SizeType>::_save(T&& t) { + auto sz = _save(make_size_tag(t.size())); + for(auto&& item : t) { + sz += _save(item); + } + return sz; +} + +// enum data type +template <typename Stream, typename SizeType> +template <typename T, + std::enable_if_t<std::is_enum_v<std::decay_t<T>>, void>* +> +SizeType Serializer<Stream, SizeType>::_save(T&& t) { + using U = std::decay_t<T>; + return _save(static_cast<std::underlying_type_t<U>>(t)); +} + +// duration data type +template <typename Stream, typename SizeType> +template <typename T, + std::enable_if_t<is_std_duration_v<std::decay_t<T>>, void>* +> +SizeType Serializer<Stream, SizeType>::_save(T&& t) { + return _save(t.count()); +} + +// time point data type +template <typename Stream, typename SizeType> +template 
<typename T, + std::enable_if_t<is_std_time_point_v<std::decay_t<T>>, void>* +> +SizeType Serializer<Stream, SizeType>::_save(T&& t) { + return _save(t.time_since_epoch()); +} + +// optional data type +template <typename Stream, typename SizeType> +template <typename T, + std::enable_if_t<is_std_optional_v<std::decay_t<T>>, void>* +> +SizeType Serializer<Stream, SizeType>::_save(T&& t) { + if(bool flag = t.has_value(); flag) { + return _save(flag) + _save(*t); + } + else { + return _save(flag); + } +} + +// variant type +template <typename Stream, typename SizeType> +template <typename T, + std::enable_if_t<is_std_variant_v<std::decay_t<T>>, void>* +> +SizeType Serializer<Stream, SizeType>::_save(T&& t) { + return _save(t.index()) + + std::visit([&] (auto&& arg){ return _save(arg);}, t); +} + +// tuple type +template <typename Stream, typename SizeType> +template <typename T, + std::enable_if_t<is_std_tuple_v<std::decay_t<T>>, void>* +> +SizeType Serializer<Stream, SizeType>::_save(T&& t) { + return std::apply( + [&] (auto&&... args) { + return (_save(std::forward<decltype(args)>(args)) + ... + 0); + }, + std::forward<T>(t) + ); +} + +// array +template <typename Stream, typename SizeType> +template <typename T, + std::enable_if_t<is_std_array_v<std::decay_t<T>>, void>* +> +SizeType Serializer<Stream, SizeType>::_save(T&& t) { + + using U = std::decay_t<T>; + + static_assert(std::tuple_size<U>::value > 0, "Array size can't be zero"); + + SizeType sz; + + if constexpr(std::is_arithmetic_v<typename U::value_type>) { + _stream.write(reinterpret_cast<const char*>(t.data()), sizeof(t)); + sz = sizeof(t); + } + else { + sz = 0; + for(auto&& item : t) { + sz += _save(item); + } + } + + return sz; +} + +// custom save method +template <typename Stream, typename SizeType> +template <typename T, + std::enable_if_t<!is_default_serializable_v<std::decay_t<T>>, void>* +> +SizeType Serializer<Stream, SizeType>::_save(T&& t) { + return t.save(*this); +} + +// ---------------------------------------------------------------------------- +// DeSerializer Definition +// ---------------------------------------------------------------------------- + +template <typename T> +constexpr auto is_default_deserializable_v = + std::is_arithmetic_v<T> || + std::is_enum_v<T> || + is_std_basic_string_v<T> || + is_std_vector_v<T> || + is_std_deque_v<T> || + is_std_list_v<T> || + is_std_forward_list_v<T> || + is_std_map_v<T> || + is_std_unordered_map_v<T> || + is_std_set_v<T> || + is_std_unordered_set_v<T> || + is_std_duration_v<T> || + is_std_time_point_v<T> || + is_std_variant_v<T> || + is_std_optional_v<T> || + is_std_tuple_v<T> || + is_std_array_v<T>; + +// Class: Deserializer +template <typename Stream, typename SizeType = std::streamsize> +class Deserializer { + + public: + + Deserializer(Stream& stream); + + template <typename... T> + SizeType operator()(T&&... items); + + private: + + Stream& _stream; + + // Function: _variant_helper + template < + size_t I = 0, typename... ArgsT, + std::enable_if_t<I==sizeof...(ArgsT)>* = nullptr + > + SizeType _variant_helper(size_t, std::variant<ArgsT...>&); + + // Function: _variant_helper + template < + size_t I = 0, typename... 
ArgsT, + std::enable_if_t<I<sizeof...(ArgsT)>* = nullptr + > + SizeType _variant_helper(size_t, std::variant<ArgsT...>&); + + template <typename T, + std::enable_if_t<std::is_arithmetic_v<std::decay_t<T>>, void>* = nullptr + > + SizeType _load(T&&); + + template <typename T, + std::enable_if_t<is_std_basic_string_v<std::decay_t<T>>, void>* = nullptr + > + SizeType _load(T&&); + + template <typename T, + std::enable_if_t<is_std_vector_v<std::decay_t<T>>, void>* = nullptr + > + SizeType _load(T&&); + + template <typename T, + std::enable_if_t< + is_std_deque_v<std::decay_t<T>> || + is_std_list_v<std::decay_t<T>> || + is_std_forward_list_v<std::decay_t<T>>, + void + >* = nullptr + > + SizeType _load(T&&); + + template <typename T, + std::enable_if_t<is_std_map_v<std::decay_t<T>>, void>* = nullptr + > + SizeType _load(T&&); + + template <typename T, + std::enable_if_t<is_std_unordered_map_v<std::decay_t<T>>, void>* = nullptr + > + SizeType _load(T&&); + + template <typename T, + std::enable_if_t<is_std_set_v<std::decay_t<T>>, void>* = nullptr + > + SizeType _load(T&&); + + template <typename T, + std::enable_if_t<is_std_unordered_set_v<std::decay_t<T>>, void>* = nullptr + > + SizeType _load(T&&); + + template <typename T, + std::enable_if_t<std::is_enum_v<std::decay_t<T>>, void>* = nullptr + > + SizeType _load(T&&); + + template <typename T, + std::enable_if_t<is_std_duration_v<std::decay_t<T>>, void>* = nullptr + > + SizeType _load(T&&); + + template <typename T, + std::enable_if_t<is_std_time_point_v<std::decay_t<T>>, void>* = nullptr + > + SizeType _load(T&&); + + template <typename T, + std::enable_if_t<is_std_optional_v<std::decay_t<T>>, void>* = nullptr + > + SizeType _load(T&&); + + template <typename T, + std::enable_if_t<is_std_variant_v<std::decay_t<T>>, void>* = nullptr + > + SizeType _load(T&&); + + template <typename T, + std::enable_if_t<is_std_tuple_v<std::decay_t<T>>, void>* = nullptr + > + SizeType _load(T&&); + + template <typename T, + std::enable_if_t<is_std_array_v<std::decay_t<T>>, void>* = nullptr + > + SizeType _load(T&&); + + template <typename T, + std::enable_if_t<!is_default_deserializable_v<std::decay_t<T>>, void>* = nullptr + > + SizeType _load(T&&); +}; + +// Constructor +template <typename Stream, typename SizeType> +Deserializer<Stream, SizeType>::Deserializer(Stream& stream) : _stream(stream) { +} + +// Operator () +template <typename Stream, typename SizeType> +template <typename... T> +SizeType Deserializer<Stream, SizeType>::operator() (T&&... items) { + return (_load(std::forward<T>(items)) + ...); +} + +// Function: _variant_helper +template <typename Stream, typename SizeType> +template <size_t I, typename... ArgsT, std::enable_if_t<I==sizeof...(ArgsT)>*> +SizeType Deserializer<Stream, SizeType>::_variant_helper(size_t, std::variant<ArgsT...>&) { + return 0; +} + +// Function: _variant_helper +template <typename Stream, typename SizeType> +template <size_t I, typename... 
ArgsT, std::enable_if_t<I<sizeof...(ArgsT)>*> +SizeType Deserializer<Stream, SizeType>::_variant_helper(size_t i, std::variant<ArgsT...>& v) { + if(i == 0) { + using type = ExtractType_t<I, std::variant<ArgsT...>>; + if(v.index() != I) { + static_assert( + std::is_default_constructible<type>::value, + "Failed to archive variant (type should be default constructible T())" + ); + v = type(); + } + return _load(*std::get_if<type>(&v)); + } + return _variant_helper<I+1, ArgsT...>(i-1, v); +} + +// arithmetic data type +template <typename Stream, typename SizeType> +template <typename T, + std::enable_if_t<std::is_arithmetic_v<std::decay_t<T>>, void>* +> +SizeType Deserializer<Stream, SizeType>::_load(T&& t) { + _stream.read(reinterpret_cast<char*>(std::addressof(t)), sizeof(t)); + return sizeof(t); +} + +// std::basic_string +template <typename Stream, typename SizeType> +template <typename T, + std::enable_if_t<is_std_basic_string_v<std::decay_t<T>>, void>* +> +SizeType Deserializer<Stream, SizeType>::_load(T&& t) { + using U = std::decay_t<T>; + typename U::size_type num_chars; + auto sz = _load(make_size_tag(num_chars)); + t.resize(num_chars); + _stream.read(reinterpret_cast<char*>(t.data()), num_chars*sizeof(typename U::value_type)); + return sz + num_chars*sizeof(typename U::value_type); +} + +// std::vector +template <typename Stream, typename SizeType> +template <typename T, + std::enable_if_t<is_std_vector_v<std::decay_t<T>>, void>* +> +SizeType Deserializer<Stream, SizeType>::_load(T&& t) { + + using U = std::decay_t<T>; + + typename U::size_type num_data; + + auto sz = _load(make_size_tag(num_data)); + + if constexpr(std::is_arithmetic_v<typename U::value_type>) { + t.resize(num_data); + _stream.read(reinterpret_cast<char*>(t.data()), num_data * sizeof(typename U::value_type)); + sz += num_data * sizeof(typename U::value_type); + } + else { + t.resize(num_data); + for(auto && v : t) { + sz += _load(v); + } + } + return sz; +} + +// std::list and std::deque +template <typename Stream, typename SizeType> +template <typename T, + std::enable_if_t<is_std_deque_v<std::decay_t<T>> || + is_std_list_v<std::decay_t<T>> || + is_std_forward_list_v<std::decay_t<T>>, void>* +> +SizeType Deserializer<Stream, SizeType>::_load(T&& t) { + using U = std::decay_t<T>; + + typename U::size_type num_data; + auto sz = _load(make_size_tag(num_data)); + + t.resize(num_data); + for(auto && v : t) { + sz += _load(v); + } + return sz; +} + +// std::map +template <typename Stream, typename SizeType> +template <typename T, + std::enable_if_t<is_std_map_v<std::decay_t<T>>, void>* +> +SizeType Deserializer<Stream, SizeType>::_load(T&& t) { + + using U = std::decay_t<T>; + + typename U::size_type num_data; + auto sz = _load(make_size_tag(num_data)); + + t.clear(); + auto hint = t.begin(); + + typename U::key_type k; + typename U::mapped_type v; + + for(size_t i=0; i<num_data; ++i) { + sz += _load(make_kv_pair(k, v)); + hint = t.emplace_hint(hint, std::move(k), std::move(v)); + } + return sz; +} + +// std::unordered_map +template <typename Stream, typename SizeType> +template <typename T, + std::enable_if_t<is_std_unordered_map_v<std::decay_t<T>>, void>* +> +SizeType Deserializer<Stream, SizeType>::_load(T&& t) { + using U = std::decay_t<T>; + typename U::size_type num_data; + auto sz = _load(make_size_tag(num_data)); + + t.clear(); + t.reserve(num_data); + + typename U::key_type k; + typename U::mapped_type v; + + for(size_t i=0; i<num_data; ++i) { + sz += _load(make_kv_pair(k, v)); + t.emplace(std::move(k), 
std::move(v)); + } + + return sz; +} + +// std::set +template <typename Stream, typename SizeType> +template <typename T, + std::enable_if_t<is_std_set_v<std::decay_t<T>>, void>* +> +SizeType Deserializer<Stream, SizeType>::_load(T&& t) { + + using U = std::decay_t<T>; + + typename U::size_type num_data; + auto sz = _load(make_size_tag(num_data)); + + t.clear(); + auto hint = t.begin(); + + typename U::key_type k; + + for(size_t i=0; i<num_data; ++i) { + sz += _load(k); + hint = t.emplace_hint(hint, std::move(k)); + } + return sz; +} + +// std::unordered_set +template <typename Stream, typename SizeType> +template <typename T, + std::enable_if_t<is_std_unordered_set_v<std::decay_t<T>>, void>* +> +SizeType Deserializer<Stream, SizeType>::_load(T&& t) { + + using U = std::decay_t<T>; + + typename U::size_type num_data; + auto sz = _load(make_size_tag(num_data)); + + t.clear(); + t.reserve(num_data); + + typename U::key_type k; + + for(size_t i=0; i<num_data; ++i) { + sz += _load(k); + t.emplace(std::move(k)); + } + return sz; +} + +// enum data type +template <typename Stream, typename SizeType> +template <typename T, + std::enable_if_t<std::is_enum_v<std::decay_t<T>>, void>* +> +SizeType Deserializer<Stream, SizeType>::_load(T&& t) { + using U = std::decay_t<T>; + std::underlying_type_t<U> k; + auto sz = _load(k); + t = static_cast<U>(k); + return sz; +} + +// duration data type +template <typename Stream, typename SizeType> +template <typename T, + std::enable_if_t<is_std_duration_v<std::decay_t<T>>, void>* +> +SizeType Deserializer<Stream, SizeType>::_load(T&& t) { + using U = std::decay_t<T>; + typename U::rep count; + auto s = _load(count); + t = U{count}; + return s; +} + +// time point data type +template <typename Stream, typename SizeType> +template <typename T, + std::enable_if_t<is_std_time_point_v<std::decay_t<T>>, void>* +> +SizeType Deserializer<Stream, SizeType>::_load(T&& t) { + using U = std::decay_t<T>; + typename U::duration elapsed; + auto s = _load(elapsed); + t = U{elapsed}; + return s; +} + +// optional data type +template <typename Stream, typename SizeType> +template <typename T, + std::enable_if_t<is_std_optional_v<std::decay_t<T>>, void>* +> +SizeType Deserializer<Stream, SizeType>::_load(T&& t) { + + using U = std::decay_t<T>; + + bool has_value; + auto s = _load(has_value); + if(has_value) { + if(!t) { + t = typename U::value_type(); + } + s += _load(*t); + } + else { + t.reset(); + } + return s; +} + +// variant type +template <typename Stream, typename SizeType> +template <typename T, + std::enable_if_t<is_std_variant_v<std::decay_t<T>>, void>* +> +SizeType Deserializer<Stream, SizeType>::_load(T&& t) { + std::decay_t<decltype(t.index())> idx; + auto s = _load(idx); + return s + _variant_helper(idx, t); +} + +// tuple type +template <typename Stream, typename SizeType> +template <typename T, + std::enable_if_t<is_std_tuple_v<std::decay_t<T>>, void>* +> +SizeType Deserializer<Stream, SizeType>::_load(T&& t) { + return std::apply( + [&] (auto&&... args) { + return (_load(std::forward<decltype(args)>(args)) + ... 
+ 0); + }, + std::forward<T>(t) + ); +} + +// array +template <typename Stream, typename SizeType> +template <typename T, + std::enable_if_t<is_std_array_v<std::decay_t<T>>, void>* +> +SizeType Deserializer<Stream, SizeType>::_load(T&& t) { + + using U = std::decay_t<T>; + + static_assert(std::tuple_size<U>::value > 0, "Array size can't be zero"); + + SizeType sz; + + if constexpr(std::is_arithmetic_v<typename U::value_type>) { + _stream.read(reinterpret_cast<char*>(t.data()), sizeof(t)); + sz = sizeof(t); + } + else { + sz = 0; + for(auto && v : t) { + sz += _load(v); + } + } + + return sz; +} + +// custom save method +template <typename Stream, typename SizeType> +template <typename T, + std::enable_if_t<!is_default_deserializable_v<std::decay_t<T>>, void>* +> +SizeType Deserializer<Stream, SizeType>::_load(T&& t) { + return t.load(*this); +} + +} // ned of namespace tf ----------------------------------------------------- + + + + + + diff --git a/myxpcs/include/taskflow_/utility/singleton.hpp b/myxpcs/include/taskflow_/utility/singleton.hpp new file mode 100644 index 0000000..aab50bc --- /dev/null +++ b/myxpcs/include/taskflow_/utility/singleton.hpp @@ -0,0 +1,33 @@ +#pragma once + +namespace tf { + +/** @class Singleton + +@brief class template to create a thread-safe singleton object + +*/ +template <typename T> +class Singleton { + + public: + + /** + @brief get a reference to the singleton object + */ + inline static T& get() { + static T instance; + return instance; + } + + private: + + Singleton() = default; + ~Singleton() = default; + Singleton(const Singleton&)= delete; + Singleton& operator=(const Singleton&)= delete; +}; + + + +} // end of namespace tf ----------------------------------------------------- diff --git a/myxpcs/include/taskflow_/utility/small_vector.hpp b/myxpcs/include/taskflow_/utility/small_vector.hpp new file mode 100644 index 0000000..a42c264 --- /dev/null +++ b/myxpcs/include/taskflow_/utility/small_vector.hpp @@ -0,0 +1,1048 @@ +// small vector modified from llvm + +#pragma once + +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdlib> +#include <cstring> +#include <initializer_list> +#include <iterator> +#include <memory> + +#if defined(__GNUC__) + #define TF_LIKELY(x) (__builtin_expect((x), 1)) + #define TF_UNLIKELY(x) (__builtin_expect((x), 0)) +#else + #define TF_LIKELY(x) (x) + #define TF_UNLIKELY(x) (x) +#endif + +/** +@file small_vector.hpp +@brief small vector include file +*/ + +namespace tf { namespace detail { + +/** +@private +@brief NextCapacity - Returns the next power of two (in 64-bits) + that is strictly greater than A. Returns zero on overflow. + this function assumes A to be positive +*/ +inline uint64_t NextCapacity(uint64_t A) { + A |= (A >> 1); + A |= (A >> 2); + A |= (A >> 4); + A |= (A >> 8); + A |= (A >> 16); + A |= (A >> 32); + return A + 1; +} + +}} // end of namespace tf::detail -------------------------------------------- + + +namespace tf { + +/** +@private +*/ +template <typename T> +struct IsPod : std::integral_constant<bool, std::is_standard_layout<T>::value && + std::is_trivial<T>::value> {}; + +/** +@private +*/ +class SmallVectorBase { +protected: + void *BeginX, *EndX, *CapacityX; + +protected: + SmallVectorBase(void *FirstEl, size_t Size) + : BeginX(FirstEl), EndX(FirstEl), CapacityX((char*)FirstEl+Size) {} + + /// This is an implementation of the grow() method which only works + /// on POD-like data types and is out of line to reduce code duplication. 
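+  ///
+  /// The new capacity is max(2 * capacity_in_bytes() + TSize, MinSizeInBytes);
+  /// e.g. growing from 32 bytes of capacity with TSize == 4 yields 68 bytes
+  /// (unless MinSizeInBytes is larger). Data still held in the inline buffer
+  /// is copied into a freshly malloc'd block, while heap-allocated storage is
+  /// simply realloc'd.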
+ void grow_pod(void *FirstEl, size_t MinSizeInBytes, size_t TSize){ + size_t CurSizeBytes = size_in_bytes(); + size_t NewCapacityInBytes = 2 * capacity_in_bytes() + TSize; // Always grow. + if (NewCapacityInBytes < MinSizeInBytes) { + NewCapacityInBytes = MinSizeInBytes; + } + + void *NewElts; + if (BeginX == FirstEl) { + NewElts = std::malloc(NewCapacityInBytes); + + // Copy the elements over. No need to run dtors on PODs. + memcpy(NewElts, this->BeginX, CurSizeBytes); + } else { + // If this wasn't grown from the inline copy, grow the allocated space. + NewElts = realloc(this->BeginX, NewCapacityInBytes); + } + //assert(NewElts && "Out of memory"); + + this->EndX = (char*)NewElts+CurSizeBytes; + this->BeginX = NewElts; + this->CapacityX = (char*)this->BeginX + NewCapacityInBytes; + } + +public: + /// This returns size()*sizeof(T). + size_t size_in_bytes() const { + return size_t((char*)EndX - (char*)BeginX); + } + + /// capacity_in_bytes - This returns capacity()*sizeof(T). + size_t capacity_in_bytes() const { + return size_t((char*)CapacityX - (char*)BeginX); + } + + bool empty() const { return BeginX == EndX; } +}; + +/** +@private +*/ +template <typename T, unsigned N> struct SmallVectorStorage; + +/** +@private +*/ +template <typename T, typename = void> +class SmallVectorTemplateCommon : public SmallVectorBase { + + private: + template <typename, unsigned> friend struct SmallVectorStorage; + + template <typename X> + struct AlignedUnionType { + alignas(X) std::byte buff[std::max(sizeof(std::byte), sizeof(X))]; + }; + + // Allocate raw space for N elements of type T. If T has a ctor or dtor, we + // don't want it to be automatically run, so we need to represent the space as + // something else. Use an array of char of sufficient alignment. + + // deprecated in c++23 + //typedef typename std::aligned_union<1, T>::type U; + typedef AlignedUnionType<T> U; + + U FirstEl; + // Space after 'FirstEl' is clobbered, do not add any instance vars after it. + + protected: + SmallVectorTemplateCommon(size_t Size) : SmallVectorBase(&FirstEl, Size) {} + + void grow_pod(size_t MinSizeInBytes, size_t TSize) { + SmallVectorBase::grow_pod(&FirstEl, MinSizeInBytes, TSize); + } + + /// Return true if this is a smallvector which has not had dynamic + /// memory allocated for it. + bool isSmall() const { + return BeginX == static_cast<const void*>(&FirstEl); + } + + /// Put this vector in a state of being small. + void resetToSmall() { + BeginX = EndX = CapacityX = &FirstEl; + } + + void setEnd(T *P) { this->EndX = P; } + + public: + typedef size_t size_type; + typedef ptrdiff_t difference_type; + typedef T value_type; + typedef T *iterator; + typedef const T *const_iterator; + + typedef std::reverse_iterator<const_iterator> const_reverse_iterator; + typedef std::reverse_iterator<iterator> reverse_iterator; + + typedef T &reference; + typedef const T &const_reference; + typedef T *pointer; + typedef const T *const_pointer; + + // forward iterator creation methods. + inline iterator begin() { return (iterator)this->BeginX; } + inline const_iterator begin() const { return (const_iterator)this->BeginX; } + inline iterator end() { return (iterator)this->EndX; } + inline const_iterator end() const { return (const_iterator)this->EndX; } + + protected: + + iterator capacity_ptr() { return (iterator)this->CapacityX; } + const_iterator capacity_ptr() const { return (const_iterator)this->CapacityX;} + + public: + + // reverse iterator creation methods. 
+ reverse_iterator rbegin() { return reverse_iterator(end()); } + const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); } + reverse_iterator rend() { return reverse_iterator(begin()); } + const_reverse_iterator rend() const { return const_reverse_iterator(begin());} + + inline size_type size() const { return end()-begin(); } + inline size_type max_size() const { return size_type(-1) / sizeof(T); } + + /// Return the total number of elements in the currently allocated buffer. + size_t capacity() const { return capacity_ptr() - begin(); } + + /// Return a pointer to the vector's buffer, even if empty(). + pointer data() { return pointer(begin()); } + /// Return a pointer to the vector's buffer, even if empty(). + const_pointer data() const { return const_pointer(begin()); } + + inline reference operator[](size_type idx) { + //assert(idx < size()); + return begin()[idx]; + } + + inline const_reference operator[](size_type idx) const { + //assert(idx < size()); + return begin()[idx]; + } + + reference front() { + //assert(!empty()); + return begin()[0]; + } + + const_reference front() const { + //assert(!empty()); + return begin()[0]; + } + + reference back() { + //assert(!empty()); + return end()[-1]; + } + + const_reference back() const { + //assert(!empty()); + return end()[-1]; + } +}; + +/** +@private +*/ +template <typename T, bool isPodLike> +class SmallVectorTemplateBase : public SmallVectorTemplateCommon<T> { + +protected: + SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon<T>(Size) {} + + static void destroy_range(T *S, T *E) { + while (S != E) { + --E; + E->~T(); + } + } + + /// Move the range [I, E) into the uninitialized memory starting with "Dest", + /// constructing elements as needed. + template<typename It1, typename It2> + static void uninitialized_move(It1 I, It1 E, It2 Dest) { + std::uninitialized_copy(std::make_move_iterator(I), + std::make_move_iterator(E), Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory starting with "Dest", + /// constructing elements as needed. + template<typename It1, typename It2> + static void uninitialized_copy(It1 I, It1 E, It2 Dest) { + std::uninitialized_copy(I, E, Dest); + } + + /// Grow the allocated memory (without initializing new elements), doubling + /// the size of the allocated memory. Guarantees space for at least one more + /// element, or MinSize more elements if specified. + void grow(size_t MinSize = 0); + +public: + void push_back(const T &Elt) { + if (TF_UNLIKELY(this->EndX >= this->CapacityX)) + this->grow(); + ::new ((void*) this->end()) T(Elt); + this->setEnd(this->end()+1); + } + + void push_back(T &&Elt) { + if (TF_UNLIKELY(this->EndX >= this->CapacityX)) + this->grow(); + ::new ((void*) this->end()) T(::std::move(Elt)); + this->setEnd(this->end()+1); + } + + void pop_back() { + this->setEnd(this->end()-1); + this->end()->~T(); + } +}; + +/** +@private +*/ +template <typename T, bool isPodLike> +void SmallVectorTemplateBase<T, isPodLike>::grow(size_t MinSize) { + size_t CurCapacity = this->capacity(); + size_t CurSize = this->size(); + // Always grow, even from zero. + size_t NewCapacity = size_t(tf::detail::NextCapacity(CurCapacity+2)); + if (NewCapacity < MinSize) + NewCapacity = MinSize; + T *NewElts = static_cast<T*>(std::malloc(NewCapacity*sizeof(T))); + + // Move the elements over. + this->uninitialized_move(this->begin(), this->end(), NewElts); + + // Destroy the original elements. 
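+  // (they were only moved from by uninitialized_move above, so their
+  //  destructors still need to run before the old storage is released)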
+ destroy_range(this->begin(), this->end()); + + // If this wasn't grown from the inline copy, deallocate the old space. + if (!this->isSmall()) + std::free(this->begin()); + + this->setEnd(NewElts+CurSize); + this->BeginX = NewElts; + this->CapacityX = this->begin()+NewCapacity; +} + +/** +@private +*/ +template <typename T> +class SmallVectorTemplateBase<T, true> : public SmallVectorTemplateCommon<T> { +protected: + SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon<T>(Size) {} + + // No need to do a destroy loop for POD's. + static void destroy_range(T *, T *) {} + + /// Move the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template<typename It1, typename It2> + static void uninitialized_move(It1 I, It1 E, It2 Dest) { + // Just do a copy. + uninitialized_copy(I, E, Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template<typename It1, typename It2> + static void uninitialized_copy(It1 I, It1 E, It2 Dest) { + // Arbitrary iterator types; just use the basic implementation. + std::uninitialized_copy(I, E, Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template <typename T1, typename T2> + static void uninitialized_copy( + T1 *I, T1 *E, T2 *Dest, + typename std::enable_if<std::is_same<typename std::remove_const<T1>::type, + T2>::value>::type * = nullptr) { + // Use memcpy for PODs iterated by pointers (which includes SmallVector + // iterators): std::uninitialized_copy optimizes to memmove, but we can + // use memcpy here. Note that I and E are iterators and thus might be + // invalid for memcpy if they are equal. + if (I != E) + memcpy(Dest, I, (E - I) * sizeof(T)); + } + + /// Double the size of the allocated memory, guaranteeing space for at + /// least one more element or MinSize if specified. + void grow(size_t MinSize = 0) { + this->grow_pod(MinSize*sizeof(T), sizeof(T)); + } +public: + void push_back(const T &Elt) { + if (TF_UNLIKELY(this->EndX >= this->CapacityX)) + this->grow(); + memcpy(this->end(), &Elt, sizeof(T)); + this->setEnd(this->end()+1); + } + + void pop_back() { + this->setEnd(this->end()-1); + } +}; + +/** +@private +*/ +template <typename T> +class SmallVectorImpl : public SmallVectorTemplateBase<T, IsPod<T>::value> { + typedef SmallVectorTemplateBase<T, IsPod<T>::value> SuperClass; + + SmallVectorImpl(const SmallVectorImpl&) = delete; + +public: + typedef typename SuperClass::iterator iterator; + typedef typename SuperClass::const_iterator const_iterator; + typedef typename SuperClass::size_type size_type; + +protected: + // Default ctor - Initialize to empty. + explicit SmallVectorImpl(unsigned N) + : SmallVectorTemplateBase<T, IsPod<T>::value>(N*sizeof(T)) { + } + +public: + ~SmallVectorImpl() { + // Destroy the constructed elements in the vector. + this->destroy_range(this->begin(), this->end()); + + // If this wasn't grown from the inline copy, deallocate the old space. 
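+    // (the inline buffer is part of the object itself and must never be freed)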
+ if (!this->isSmall()) + std::free(this->begin()); + } + + + void clear() { + this->destroy_range(this->begin(), this->end()); + this->EndX = this->BeginX; + } + + void resize(size_type N) { + if (N < this->size()) { + this->destroy_range(this->begin()+N, this->end()); + this->setEnd(this->begin()+N); + } else if (N > this->size()) { + if (this->capacity() < N) + this->grow(N); + for (auto I = this->end(), E = this->begin() + N; I != E; ++I) + new (&*I) T(); + this->setEnd(this->begin()+N); + } + } + + void resize(size_type N, const T &NV) { + if (N < this->size()) { + this->destroy_range(this->begin()+N, this->end()); + this->setEnd(this->begin()+N); + } else if (N > this->size()) { + if (this->capacity() < N) + this->grow(N); + std::uninitialized_fill(this->end(), this->begin()+N, NV); + this->setEnd(this->begin()+N); + } + } + + void reserve(size_type N) { + if (this->capacity() < N) + this->grow(N); + } + + T pop_back_val() { + T Result = ::std::move(this->back()); + this->pop_back(); + return Result; + } + + void swap(SmallVectorImpl &RHS); + + /// Add the specified range to the end of the SmallVector. + template<typename in_iter> + void append(in_iter in_start, in_iter in_end) { + size_type NumInputs = std::distance(in_start, in_end); + // Grow allocated space if needed. + if (NumInputs > size_type(this->capacity_ptr()-this->end())) + this->grow(this->size()+NumInputs); + + // Copy the new elements over. + this->uninitialized_copy(in_start, in_end, this->end()); + this->setEnd(this->end() + NumInputs); + } + + /// Add the specified range to the end of the SmallVector. + void append(size_type NumInputs, const T &Elt) { + // Grow allocated space if needed. + if (NumInputs > size_type(this->capacity_ptr()-this->end())) + this->grow(this->size()+NumInputs); + + // Copy the new elements over. + std::uninitialized_fill_n(this->end(), NumInputs, Elt); + this->setEnd(this->end() + NumInputs); + } + + void append(std::initializer_list<T> IL) { + append(IL.begin(), IL.end()); + } + + void assign(size_type NumElts, const T &Elt) { + clear(); + if (this->capacity() < NumElts) + this->grow(NumElts); + this->setEnd(this->begin()+NumElts); + std::uninitialized_fill(this->begin(), this->end(), Elt); + } + + void assign(std::initializer_list<T> IL) { + clear(); + append(IL); + } + + iterator erase(const_iterator CI) { + // Just cast away constness because this is a non-const member function. + iterator I = const_cast<iterator>(CI); + + //assert(I >= this->begin() && "Iterator to erase is out of bounds."); + //assert(I < this->end() && "Erasing at past-the-end iterator."); + + iterator N = I; + // Shift all elts down one. + std::move(I+1, this->end(), I); + // Drop the last elt. + this->pop_back(); + return(N); + } + + iterator erase(const_iterator CS, const_iterator CE) { + // Just cast away constness because this is a non-const member function. + iterator S = const_cast<iterator>(CS); + iterator E = const_cast<iterator>(CE); + + //assert(S >= this->begin() && "Range to erase is out of bounds."); + //assert(S <= E && "Trying to erase invalid range."); + //assert(E <= this->end() && "Trying to erase past the end."); + + iterator N = S; + // Shift all elts down. + iterator I = std::move(E, this->end(), S); + // Drop the last elts. + this->destroy_range(I, this->end()); + this->setEnd(I); + return(N); + } + + iterator insert(iterator I, T &&Elt) { + if (I == this->end()) { // Important special case for empty vector. 
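+      // (this branch covers any insertion at end(), not only an empty vector)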
+ this->push_back(::std::move(Elt)); + return this->end()-1; + } + + //assert(I >= this->begin() && "Insertion iterator is out of bounds."); + //assert(I <= this->end() && "Inserting past the end of the vector."); + + if (this->EndX >= this->CapacityX) { + size_t EltNo = I-this->begin(); + this->grow(); + I = this->begin()+EltNo; + } + + ::new ((void*) this->end()) T(::std::move(this->back())); + // Push everything else over. + std::move_backward(I, this->end()-1, this->end()); + this->setEnd(this->end()+1); + + // If we just moved the element we're inserting, be sure to update + // the reference. + T *EltPtr = &Elt; + if (I <= EltPtr && EltPtr < this->EndX) + ++EltPtr; + + *I = ::std::move(*EltPtr); + return I; + } + + iterator insert(iterator I, const T &Elt) { + if (I == this->end()) { // Important special case for empty vector. + this->push_back(Elt); + return this->end()-1; + } + + //assert(I >= this->begin() && "Insertion iterator is out of bounds."); + //assert(I <= this->end() && "Inserting past the end of the vector."); + + if (this->EndX >= this->CapacityX) { + size_t EltNo = I-this->begin(); + this->grow(); + I = this->begin()+EltNo; + } + ::new ((void*) this->end()) T(std::move(this->back())); + // Push everything else over. + std::move_backward(I, this->end()-1, this->end()); + this->setEnd(this->end()+1); + + // If we just moved the element we're inserting, be sure to update + // the reference. + const T *EltPtr = &Elt; + if (I <= EltPtr && EltPtr < this->EndX) + ++EltPtr; + + *I = *EltPtr; + return I; + } + + iterator insert(iterator I, size_type NumToInsert, const T &Elt) { + // Convert iterator to elt# to avoid invalidating iterator when we reserve() + size_t InsertElt = I - this->begin(); + + if (I == this->end()) { // Important special case for empty vector. + append(NumToInsert, Elt); + return this->begin()+InsertElt; + } + + //assert(I >= this->begin() && "Insertion iterator is out of bounds."); + //assert(I <= this->end() && "Inserting past the end of the vector."); + + // Ensure there is enough space. + reserve(this->size() + NumToInsert); + + // Uninvalidate the iterator. + I = this->begin()+InsertElt; + + // If there are more elements between the insertion point and the end of the + // range than there are being inserted, we can use a simple approach to + // insertion. Since we already reserved space, we know that this won't + // reallocate the vector. + if (size_t(this->end()-I) >= NumToInsert) { + T *OldEnd = this->end(); + append(std::move_iterator<iterator>(this->end() - NumToInsert), + std::move_iterator<iterator>(this->end())); + + // Copy the existing elements that get replaced. + std::move_backward(I, OldEnd-NumToInsert, OldEnd); + + std::fill_n(I, NumToInsert, Elt); + return I; + } + + // Otherwise, we're inserting more elements than exist already, and we're + // not inserting at the end. + + // Move over the elements that we're about to overwrite. + T *OldEnd = this->end(); + this->setEnd(this->end() + NumToInsert); + size_t NumOverwritten = OldEnd-I; + this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten); + + // Replace the overwritten part. + std::fill_n(I, NumOverwritten, Elt); + + // Insert the non-overwritten middle part. 
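+    // (these elements land in raw memory beyond the old end, so they must be
+    //  constructed with uninitialized_fill_n rather than assigned)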
+ std::uninitialized_fill_n(OldEnd, NumToInsert-NumOverwritten, Elt); + return I; + } + + template<typename ItTy> + iterator insert(iterator I, ItTy From, ItTy To) { + // Convert iterator to elt# to avoid invalidating iterator when we reserve() + size_t InsertElt = I - this->begin(); + + if (I == this->end()) { // Important special case for empty vector. + append(From, To); + return this->begin()+InsertElt; + } + + //assert(I >= this->begin() && "Insertion iterator is out of bounds."); + //assert(I <= this->end() && "Inserting past the end of the vector."); + + size_t NumToInsert = std::distance(From, To); + + // Ensure there is enough space. + reserve(this->size() + NumToInsert); + + // Uninvalidate the iterator. + I = this->begin()+InsertElt; + + // If there are more elements between the insertion point and the end of the + // range than there are being inserted, we can use a simple approach to + // insertion. Since we already reserved space, we know that this won't + // reallocate the vector. + if (size_t(this->end()-I) >= NumToInsert) { + T *OldEnd = this->end(); + append(std::move_iterator<iterator>(this->end() - NumToInsert), + std::move_iterator<iterator>(this->end())); + + // Copy the existing elements that get replaced. + std::move_backward(I, OldEnd-NumToInsert, OldEnd); + + std::copy(From, To, I); + return I; + } + + // Otherwise, we're inserting more elements than exist already, and we're + // not inserting at the end. + + // Move over the elements that we're about to overwrite. + T *OldEnd = this->end(); + this->setEnd(this->end() + NumToInsert); + size_t NumOverwritten = OldEnd-I; + this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten); + + // Replace the overwritten part. + for (T *J = I; NumOverwritten > 0; --NumOverwritten) { + *J = *From; + ++J; ++From; + } + + // Insert the non-overwritten middle part. + this->uninitialized_copy(From, To, OldEnd); + return I; + } + + void insert(iterator I, std::initializer_list<T> IL) { + insert(I, IL.begin(), IL.end()); + } + + template <typename... ArgTypes> void emplace_back(ArgTypes &&... Args) { + if (TF_UNLIKELY(this->EndX >= this->CapacityX)) + this->grow(); + ::new ((void *)this->end()) T(std::forward<ArgTypes>(Args)...); + this->setEnd(this->end() + 1); + } + + SmallVectorImpl &operator=(const SmallVectorImpl &RHS); + + SmallVectorImpl &operator=(SmallVectorImpl &&RHS); + + bool operator==(const SmallVectorImpl &RHS) const { + if (this->size() != RHS.size()) return false; + return std::equal(this->begin(), this->end(), RHS.begin()); + } + bool operator!=(const SmallVectorImpl &RHS) const { + return !(*this == RHS); + } + + bool operator<(const SmallVectorImpl &RHS) const { + return std::lexicographical_compare(this->begin(), this->end(), + RHS.begin(), RHS.end()); + } + + /// Set the array size to \p N, which the current array must have enough + /// capacity for. + /// + /// This does not construct or destroy any elements in the vector. + /// + /// Clients can use this in conjunction with capacity() to write past the end + /// of the buffer when they know that more elements are available, and only + /// update the size later. This avoids the cost of value initializing elements + /// which will only be overwritten. + void set_size(size_type N) { + //assert(N <= this->capacity()); + this->setEnd(this->begin() + N); + } +}; + + +template <typename T> +void SmallVectorImpl<T>::swap(SmallVectorImpl<T> &RHS) { + if (this == &RHS) return; + + // We can only avoid copying elements if neither vector is small. 
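+  // (a small vector's elements live in its in-object inline buffer, so its
+  //  begin/end/capacity pointers cannot simply be handed to another object)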
+ if (!this->isSmall() && !RHS.isSmall()) { + std::swap(this->BeginX, RHS.BeginX); + std::swap(this->EndX, RHS.EndX); + std::swap(this->CapacityX, RHS.CapacityX); + return; + } + if (RHS.size() > this->capacity()) + this->grow(RHS.size()); + if (this->size() > RHS.capacity()) + RHS.grow(this->size()); + + // Swap the shared elements. + size_t NumShared = this->size(); + if (NumShared > RHS.size()) NumShared = RHS.size(); + for (size_type i = 0; i != NumShared; ++i) + std::swap((*this)[i], RHS[i]); + + // Copy over the extra elts. + if (this->size() > RHS.size()) { + size_t EltDiff = this->size() - RHS.size(); + this->uninitialized_copy(this->begin()+NumShared, this->end(), RHS.end()); + RHS.setEnd(RHS.end()+EltDiff); + this->destroy_range(this->begin()+NumShared, this->end()); + this->setEnd(this->begin()+NumShared); + } else if (RHS.size() > this->size()) { + size_t EltDiff = RHS.size() - this->size(); + this->uninitialized_copy(RHS.begin()+NumShared, RHS.end(), this->end()); + this->setEnd(this->end() + EltDiff); + this->destroy_range(RHS.begin()+NumShared, RHS.end()); + RHS.setEnd(RHS.begin()+NumShared); + } +} + +template <typename T> +SmallVectorImpl<T> &SmallVectorImpl<T>:: + operator=(const SmallVectorImpl<T> &RHS) { + // Avoid self-assignment. + if (this == &RHS) return *this; + + // If we already have sufficient space, assign the common elements, then + // destroy any excess. + size_t RHSSize = RHS.size(); + size_t CurSize = this->size(); + if (CurSize >= RHSSize) { + // Assign common elements. + iterator NewEnd; + if (RHSSize) + NewEnd = std::copy(RHS.begin(), RHS.begin()+RHSSize, this->begin()); + else + NewEnd = this->begin(); + + // Destroy excess elements. + this->destroy_range(NewEnd, this->end()); + + // Trim. + this->setEnd(NewEnd); + return *this; + } + + // If we have to grow to have enough elements, destroy the current elements. + // This allows us to avoid copying them during the grow. + // FIXME: don't do this if they're efficiently moveable. + if (this->capacity() < RHSSize) { + // Destroy current elements. + this->destroy_range(this->begin(), this->end()); + this->setEnd(this->begin()); + CurSize = 0; + this->grow(RHSSize); + } else if (CurSize) { + // Otherwise, use assignment for the already-constructed elements. + std::copy(RHS.begin(), RHS.begin()+CurSize, this->begin()); + } + + // Copy construct the new elements in place. + this->uninitialized_copy(RHS.begin()+CurSize, RHS.end(), + this->begin()+CurSize); + + // Set end. + this->setEnd(this->begin()+RHSSize); + return *this; +} + +template <typename T> +SmallVectorImpl<T> &SmallVectorImpl<T>::operator=(SmallVectorImpl<T> &&RHS) { + // Avoid self-assignment. + if (this == &RHS) return *this; + + // If the RHS isn't small, clear this vector and then steal its buffer. + if (!RHS.isSmall()) { + this->destroy_range(this->begin(), this->end()); + if (!this->isSmall()) std::free(this->begin()); + this->BeginX = RHS.BeginX; + this->EndX = RHS.EndX; + this->CapacityX = RHS.CapacityX; + RHS.resetToSmall(); + return *this; + } + + // If we already have sufficient space, assign the common elements, then + // destroy any excess. + size_t RHSSize = RHS.size(); + size_t CurSize = this->size(); + if (CurSize >= RHSSize) { + // Assign common elements. + iterator NewEnd = this->begin(); + if (RHSSize) + NewEnd = std::move(RHS.begin(), RHS.end(), NewEnd); + + // Destroy excess elements and trim the bounds. + this->destroy_range(NewEnd, this->end()); + this->setEnd(NewEnd); + + // Clear the RHS. 
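+    // (the moved-from vector is left empty but remains valid for reuse)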
+ RHS.clear(); + + return *this; + } + + // If we have to grow to have enough elements, destroy the current elements. + // This allows us to avoid copying them during the grow. + // FIXME: this may not actually make any sense if we can efficiently move + // elements. + if (this->capacity() < RHSSize) { + // Destroy current elements. + this->destroy_range(this->begin(), this->end()); + this->setEnd(this->begin()); + CurSize = 0; + this->grow(RHSSize); + } else if (CurSize) { + // Otherwise, use assignment for the already-constructed elements. + std::move(RHS.begin(), RHS.begin()+CurSize, this->begin()); + } + + // Move-construct the new elements in place. + this->uninitialized_move(RHS.begin()+CurSize, RHS.end(), + this->begin()+CurSize); + + // Set end. + this->setEnd(this->begin()+RHSSize); + + RHS.clear(); + return *this; +} + +/** +@private +*/ +template <typename T, unsigned N> +struct SmallVectorStorage { + /** + @private + */ + typename SmallVectorTemplateCommon<T>::U InlineElts[N - 1]; +}; + +/** +@private +*/ +template <typename T> struct SmallVectorStorage<T, 1> {}; + +/** +@private +*/ +template <typename T> struct SmallVectorStorage<T, 0> {}; + +/** +@brief class to define a vector optimized for small array + +@tparam T data type +@tparam N threshold of the number of elements in the initial storage + +The class defines a C++ STL-styled vector (a variable-sized array) +optimized for the case when the array is small. +It contains some number of elements in-place, +which allows it to avoid heap allocation when the actual number of +elements is below that threshold. This allows normal @em small cases to be +fast without losing generality for large inputs. +All the methods in [std::vector](https://en.cppreference.com/w/cpp/container/vector) +can apply to this class. + +The class is stripped from the LLVM codebase. +*/ +template <typename T, unsigned N = 2> +class SmallVector : public SmallVectorImpl<T> { + /// Inline space for elements which aren't stored in the base class. 
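+  ///
+  /// Together with the first-element slot kept in SmallVectorTemplateCommon,
+  /// this provides room for N elements in-place. Illustrative usage (sketch):
+  ///
+  ///   tf::SmallVector<int, 4> v {1, 2, 3};
+  ///   v.push_back(4);   // still within the inline buffer (N == 4)
+  ///   v.push_back(5);   // exceeds N and spills to a heap allocation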
+ SmallVectorStorage<T, N> Storage; + +public: + + /** + @brief constructs an empty vector + */ + SmallVector() : SmallVectorImpl<T>(N) { + } + + /** + @brief constructs a vector with @c Size copies of elements with value @c value + */ + explicit SmallVector(size_t Size, const T &Value = T()) + : SmallVectorImpl<T>(N) { + this->assign(Size, Value); + } + + /** + @brief constructs a vector with the contents of the range + <tt>[S, E)</tt> + */ + template<typename ItTy> + SmallVector(ItTy S, ItTy E) : SmallVectorImpl<T>(N) { + this->append(S, E); + } + + //template <typename RangeTy> + //explicit SmallVector(const tf::iterator_range<RangeTy> &R) + // : SmallVectorImpl<T>(N) { + // this->append(R.begin(), R.end()); + //} + + /** + @brief constructs a vector with the contents of the initializer list @c IL + */ + SmallVector(std::initializer_list<T> IL) : SmallVectorImpl<T>(N) { + this->assign(IL); + } + + /** + @brief constructs the vector with the copy of the contents of @c RHS + */ + SmallVector(const SmallVector &RHS) : SmallVectorImpl<T>(N) { + if (!RHS.empty()) + SmallVectorImpl<T>::operator=(RHS); + } + + /** + @brief constructs the vector with the contents of @c RHS using move semantics + */ + SmallVector(SmallVector &&RHS) : SmallVectorImpl<T>(N) { + if (!RHS.empty()) + SmallVectorImpl<T>::operator=(::std::move(RHS)); + } + + /** + @brief replaces the contents with a copy of the contents of @c RHS + */ + const SmallVector &operator=(const SmallVector &RHS) { + SmallVectorImpl<T>::operator=(RHS); + return *this; + } + + /** + @brief replaces the contents with the contents of @c RHS using move semantics + */ + const SmallVector &operator=(SmallVector &&RHS) { + SmallVectorImpl<T>::operator=(::std::move(RHS)); + return *this; + } + + /** + @brief constructs a vector with the contents of @c RHS using move semantics + */ + SmallVector(SmallVectorImpl<T> &&RHS) : SmallVectorImpl<T>(N) { + if (!RHS.empty()) + SmallVectorImpl<T>::operator=(::std::move(RHS)); + } + + /** + @brief replaces the contents with the contents of @c RHS using move semantics + */ + const SmallVector &operator=(SmallVectorImpl<T> &&RHS) { + SmallVectorImpl<T>::operator=(::std::move(RHS)); + return *this; + } + + /** + @brief replaces the contents with the copy of the contents of an initializer list @c IL + */ + const SmallVector &operator=(std::initializer_list<T> IL) { + this->assign(IL); + return *this; + } +}; + +template<typename T, unsigned N> +static inline size_t capacity_in_bytes(const SmallVector<T, N> &X) { + return X.capacity_in_bytes(); +} + +} // end tf namespace --------------------------------------------------------- + +namespace std { + /// Implement std::swap in terms of SmallVector swap. + template<typename T> + inline void + swap(tf::SmallVectorImpl<T> &LHS, tf::SmallVectorImpl<T> &RHS) { + LHS.swap(RHS); + } + + /// Implement std::swap in terms of SmallVector swap. 
+ template<typename T, unsigned N> + inline void + swap(tf::SmallVector<T, N> &LHS, tf::SmallVector<T, N> &RHS) { + LHS.swap(RHS); + } +} // end of namespace std ---------------------------------------------------- + + diff --git a/myxpcs/include/taskflow_/utility/stream.hpp b/myxpcs/include/taskflow_/utility/stream.hpp new file mode 100644 index 0000000..34a86ff --- /dev/null +++ b/myxpcs/include/taskflow_/utility/stream.hpp @@ -0,0 +1,32 @@ +#pragma once + +#include <iostream> +#include <sstream> +#include <string> + +namespace tf { + +// Procedure: ostreamize +template <typename T> +void ostreamize(std::ostream& os, T&& token) { + os << std::forward<T>(token); +} + +// Procedure: ostreamize +template <typename T, typename... Rest> +void ostreamize(std::ostream& os, T&& token, Rest&&... rest) { + os << std::forward<T>(token); + ostreamize(os, std::forward<Rest>(rest)...); +} + +// Function: stringify +template <typename... ArgsT> +std::string stringify(ArgsT&&... args) { + std::ostringstream oss; + ostreamize(oss, std::forward<ArgsT>(args)...); + return oss.str(); +} + + +} // end of namespace tf ----------------------------------------------------- + diff --git a/myxpcs/include/taskflow_/utility/traits.hpp b/myxpcs/include/taskflow_/utility/traits.hpp new file mode 100644 index 0000000..dd3953b --- /dev/null +++ b/myxpcs/include/taskflow_/utility/traits.hpp @@ -0,0 +1,303 @@ +#pragma once + +#if __has_include(<version>) +# include <version> +#endif + +#include <type_traits> +#include <iterator> +#include <iostream> +#include <fstream> +#include <mutex> +#include <stack> +#include <queue> +#include <vector> +#include <algorithm> +#include <memory> +#include <atomic> +#include <thread> +#include <future> +#include <functional> +#include <unordered_map> +#include <unordered_set> +#include <sstream> +#include <list> +#include <numeric> +#include <random> +#include <iomanip> +#include <cassert> +#include <cmath> +#include <array> +#include <string> +#include <variant> +#include <optional> +#include "os.hpp" + +namespace tf { + +//----------------------------------------------------------------------------- +// Traits +//----------------------------------------------------------------------------- + +//// Struct: dependent_false +//template <typename... T> +//struct dependent_false { +// static constexpr bool value = false; +//}; +// +//template <typename... 
T> +//constexpr auto dependent_false_v = dependent_false<T...>::value; + +template<typename> inline constexpr bool dependent_false_v = false; + +// ---------------------------------------------------------------------------- +// is_pod +//----------------------------------------------------------------------------- +template <typename T> +struct is_pod { + static const bool value = std::is_trivial_v<T> && + std::is_standard_layout_v<T>; +}; + +template <typename T> +constexpr bool is_pod_v = is_pod<T>::value; + +//----------------------------------------------------------------------------- +// NoInit +//----------------------------------------------------------------------------- + +template <typename T> +struct NoInit { + + //static_assert(is_pod_v<T>, "NoInit only supports POD type"); + + // constructor without initialization + NoInit () noexcept {} + + // implicit conversion T -> NoInit<T> + constexpr NoInit (T value) noexcept : v{value} {} + + // implicit conversion NoInit<T> -> T + constexpr operator T () const noexcept { return v; } + + T v; +}; + +//----------------------------------------------------------------------------- +// Move-On-Copy +//----------------------------------------------------------------------------- + +// Struct: MoveOnCopyWrapper +template <typename T> +struct MoC { + + MoC(T&& rhs) : object(std::move(rhs)) {} + MoC(const MoC& other) : object(std::move(other.object)) {} + + T& get() { return object; } + + mutable T object; +}; + +template <typename T> +auto make_moc(T&& m) { + return MoC<T>(std::forward<T>(m)); +} + +//----------------------------------------------------------------------------- +// Visitors. +//----------------------------------------------------------------------------- + +//// Overloadded. +//template <typename... Ts> +//struct Visitors : Ts... { +// using Ts::operator()... ; +//}; +// +//template <typename... Ts> +//Visitors(Ts...) -> Visitors<Ts...>; + +// ---------------------------------------------------------------------------- +// std::variant +// ---------------------------------------------------------------------------- +template <typename T, typename> +struct get_index; + +template <size_t I, typename... Ts> +struct get_index_impl {}; + +template <size_t I, typename T, typename... Ts> +struct get_index_impl<I, T, T, Ts...> : std::integral_constant<size_t, I>{}; + +template <size_t I, typename T, typename U, typename... Ts> +struct get_index_impl<I, T, U, Ts...> : get_index_impl<I+1, T, Ts...>{}; + +template <typename T, typename... Ts> +struct get_index<T, std::variant<Ts...>> : get_index_impl<0, T, Ts...>{}; + +template <typename T, typename... 
Ts> +constexpr auto get_index_v = get_index<T, Ts...>::value; + +// ---------------------------------------------------------------------------- +// unwrap_reference +// ---------------------------------------------------------------------------- + +template <class T> +struct unwrap_reference { using type = T; }; + +template <class U> +struct unwrap_reference<std::reference_wrapper<U>> { using type = U&; }; + +template<class T> +using unwrap_reference_t = typename unwrap_reference<T>::type; + +template< class T > +struct unwrap_ref_decay : unwrap_reference<std::decay_t<T>> {}; + +template<class T> +using unwrap_ref_decay_t = typename unwrap_ref_decay<T>::type; + +// ---------------------------------------------------------------------------- +// stateful iterators +// ---------------------------------------------------------------------------- + +// STL-styled iterator +template <typename B, typename E> +struct stateful_iterator { + + using TB = std::decay_t<unwrap_ref_decay_t<B>>; + using TE = std::decay_t<unwrap_ref_decay_t<E>>; + + static_assert(std::is_same_v<TB, TE>, "decayed iterator types must match"); + + using type = TB; +}; + +template <typename B, typename E> +using stateful_iterator_t = typename stateful_iterator<B, E>::type; + +// raw integral index +template <typename B, typename E, typename S> +struct stateful_index { + + using TB = std::decay_t<unwrap_ref_decay_t<B>>; + using TE = std::decay_t<unwrap_ref_decay_t<E>>; + using TS = std::decay_t<unwrap_ref_decay_t<S>>; + + static_assert( + std::is_integral_v<TB>, "decayed beg index must be an integral type" + ); + + static_assert( + std::is_integral_v<TE>, "decayed end index must be an integral type" + ); + + static_assert( + std::is_integral_v<TS>, "decayed step must be an integral type" + ); + + static_assert( + std::is_same_v<TB, TE> && std::is_same_v<TE, TS>, + "decayed index and step types must match" + ); + + using type = TB; +}; + +template <typename B, typename E, typename S> +using stateful_index_t = typename stateful_index<B, E, S>::type; + +// ---------------------------------------------------------------------------- +// visit a tuple with a functor at runtime +// ---------------------------------------------------------------------------- + +template <typename Func, typename Tuple, size_t N = 0> +void visit_tuple(Func func, Tuple& tup, size_t idx) { + if (N == idx) { + std::invoke(func, std::get<N>(tup)); + return; + } + if constexpr (N + 1 < std::tuple_size_v<Tuple>) { + return visit_tuple<Func, Tuple, N + 1>(func, tup, idx); + } +} + +// ---------------------------------------------------------------------------- +// unroll loop +// ---------------------------------------------------------------------------- + +// Template unrolled looping construct. +template<auto beg, auto end, auto step, bool valid = (beg < end)> +struct Unroll { + template<typename F> + static void eval(F f) { + f(beg); + Unroll<beg + step, end, step>::eval(f); + } +}; + +template<auto beg, auto end, auto step> +struct Unroll<beg, end, step, false> { + template<typename F> + static void eval(F) { } +}; + +template<auto beg, auto end, auto step, typename F> +void unroll(F f) { + Unroll<beg, end, step>::eval(f); +} + +// ---------------------------------------------------------------------------- +// make types of variant unique +// ---------------------------------------------------------------------------- + +template <typename T, typename... 
Ts> +struct filter_duplicates { using type = T; }; + +template <template <typename...> class C, typename... Ts, typename U, typename... Us> +struct filter_duplicates<C<Ts...>, U, Us...> + : std::conditional_t<(std::is_same_v<U, Ts> || ...) + , filter_duplicates<C<Ts...>, Us...> + , filter_duplicates<C<Ts..., U>, Us...>> {}; + +template <typename T> +struct unique_variant; + +template <typename... Ts> +struct unique_variant<std::variant<Ts...>> : filter_duplicates<std::variant<>, Ts...> {}; + +template <typename T> +using unique_variant_t = typename unique_variant<T>::type; + + +// ---------------------------------------------------------------------------- +// check if it is default compare +// ---------------------------------------------------------------------------- +template <typename T> struct is_std_compare : std::false_type { }; +template <typename T> struct is_std_compare<std::less<T>> : std::true_type { }; +template <typename T> struct is_std_compare<std::greater<T>> : std::true_type { }; + +template <typename T> +constexpr static bool is_std_compare_v = is_std_compare<T>::value; + +// ---------------------------------------------------------------------------- +// check if all types are the same +// ---------------------------------------------------------------------------- + +template<bool...> +struct bool_pack; + +template<bool... bs> +using all_true = std::is_same<bool_pack<bs..., true>, bool_pack<true, bs...>>; + +template <typename T, typename... Ts> +using all_same = all_true<std::is_same_v<T, Ts>...>; + +template <typename T, typename... Ts> +constexpr bool all_same_v = all_same<T, Ts...>::value; + + +} // end of namespace tf. ---------------------------------------------------- + + + diff --git a/myxpcs/include/taskflow_/utility/uuid.hpp b/myxpcs/include/taskflow_/utility/uuid.hpp new file mode 100644 index 0000000..11d7f3b --- /dev/null +++ b/myxpcs/include/taskflow_/utility/uuid.hpp @@ -0,0 +1,235 @@ +#pragma once + +#include <iostream> +#include <string> +#include <cstring> +#include <limits> +#include <random> +#include <chrono> + +namespace tf { + +// Class: UUID +// +// A universally unique identifier (UUID) is an identifier standard used in software +// construction. A UUID is simply a 128-bit value. The meaning of each bit is defined +// by any of several variants. +// For human-readable display, many systems use a canonical format using hexadecimal +// text with inserted hyphen characters. +// +// For example: 123e4567-e89b-12d3-a456-426655440000 +// +// The intent of UUIDs is to enable distributed systems to uniquely identify information +// without significant central coordination. +// +// Copyright 2006 Andy Tompkins. +// Distributed under the Boost Software License, Version 1.0. 
(See +// accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +// +struct UUID { + + using value_type = uint8_t; + using reference = uint8_t&; + using const_reference = const uint8_t&; + using iterator = uint8_t*; + using const_iterator = const uint8_t*; + using size_type = size_t; + using difference_type = ptrdiff_t; + + inline UUID(); + + UUID(const UUID&) = default; + UUID(UUID&&) = default; + + UUID& operator = (const UUID&) = default; + UUID& operator = (UUID&&) = default; + + inline static size_type size(); + inline iterator begin(); + inline const_iterator begin() const; + inline iterator end(); + inline const_iterator end() const; + + inline bool is_nil() const; + inline void swap(UUID& rhs); + inline size_t hash_value() const; + + inline bool operator == (const UUID&) const; + inline bool operator < (const UUID&) const; + inline bool operator > (const UUID&) const; + inline bool operator != (const UUID&) const; + inline bool operator >= (const UUID&) const; + inline bool operator <= (const UUID&) const; + + uint8_t data[16] {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + inline std::string to_string() const; +}; + +// Constructor +inline UUID::UUID() { + + static thread_local std::default_random_engine engine { + std::random_device{}() + }; + + std::uniform_int_distribution<unsigned long> distribution( + (std::numeric_limits<unsigned long>::min)(), + (std::numeric_limits<unsigned long>::max)() + ); + + int i = 0; + auto random_value = distribution(engine); + for (auto it=begin(); it!=end(); ++it, ++i) { + if (i == sizeof(unsigned long)) { + random_value = distribution(engine); + i = 0; + } + *it = static_cast<UUID::value_type>((random_value >> (i*8)) & 0xFF); + } + + // set variant: must be 0b10xxxxxx + *(begin()+8) &= 0xBF; + *(begin()+8) |= 0x80; + + // set version: must be 0b0100xxxx + *(begin()+6) &= 0x4F; //0b01001111 + *(begin()+6) |= 0x40; //0b01000000 +} + +// Function: size +inline typename UUID::size_type UUID::size() { + return 16; +} + +// Function: begin +inline typename UUID::iterator UUID::begin() { + return data; +} + +// Function: begin +inline typename UUID::const_iterator UUID::begin() const { + return data; +} + +// Function: end +inline typename UUID::iterator UUID::end() { + return data+size(); +} + +// Function: end +inline typename UUID::const_iterator UUID::end() const { + return data+size(); +} + +// Function: is_nil +inline bool UUID::is_nil() const { + for (std::size_t i = 0; i < sizeof(this->data); ++i) { + if (this->data[i] != 0U) { + return false; + } + } + return true; +} + +// Procedure: swap +inline void UUID::swap(UUID& rhs) { + UUID tmp = *this; + *this = rhs; + rhs = tmp; +} + +// Function: hash_value +inline size_t UUID::hash_value() const { + size_t seed = 0; + for(auto i=begin(); i != end(); ++i) { + seed ^= static_cast<size_t>(*i) + 0x9e3779b9 + (seed << 6) + (seed >> 2); + } + return seed; +} + +// Operator: == +inline bool UUID::operator == (const UUID& rhs) const { + return std::memcmp(data, rhs.data, sizeof(data)) == 0; +} + +// Operator: != +inline bool UUID::operator != (const UUID& rhs) const { + return std::memcmp(data, rhs.data, sizeof(data)) != 0; +} + +// Operator: < +inline bool UUID::operator < (const UUID& rhs) const { + return std::memcmp(data, rhs.data, sizeof(data)) < 0; +} + +// Operator: > +inline bool UUID::operator > (const UUID& rhs) const { + return std::memcmp(data, rhs.data, sizeof(data)) > 0; +} + +// Operator: <= +inline bool UUID::operator <= (const UUID& rhs) const { + 
return std::memcmp(data, rhs.data, sizeof(data)) <= 0; +} + +// Operator: >= +inline bool UUID::operator >= (const UUID& rhs) const { + return std::memcmp(data, rhs.data, sizeof(data)) >= 0; +} + +// Function: to_string +inline std::string UUID::to_string() const { + + auto to_char = [](size_t i) { + if (i <= 9) return static_cast<char>('0' + i); + return static_cast<char>('a' + (i-10)); + }; + + std::string result; + result.reserve(36); + + std::size_t i=0; + for (auto it = begin(); it!=end(); ++it, ++i) { + + const size_t hi = ((*it) >> 4) & 0x0F; + result += to_char(hi); + + const size_t lo = (*it) & 0x0F; + result += to_char(lo); + + if (i == 3 || i == 5 || i == 7 || i == 9) { + result += '-'; + } + } + return result; +} + +// Procedure: swap +inline void swap(UUID& lhs, UUID& rhs) { + lhs.swap(rhs); +} + +// ostream +inline std::ostream& operator << (std::ostream& os, const UUID& rhs) { + os << rhs.to_string(); + return os; +} + +} // End of namespace tf. ---------------------------------------------------- + +//----------------------------------------------------------------------------- + +namespace std { + +// Partial specialization: hash<tf::UUID> +template <> +struct hash<tf::UUID> { + size_t operator()(const tf::UUID& rhs) const { return rhs.hash_value(); } +}; + + +} // End of namespace std. --------------------------------------------------- + + diff --git a/myxpcs/source/function_call.pyx b/myxpcs/source/function_call.pyx new file mode 100644 index 0000000..93900e7 --- /dev/null +++ b/myxpcs/source/function_call.pyx @@ -0,0 +1,69 @@ +import numpy as np +cimport numpy as np # for np.ndarray + +# Numpy must be initialized. When using numpy from C or Cython you must +# _always_ do that, or you will have segfaults +np.import_array() + + +cdef extern from "numpy/arrayobject.h": + void PyArray_ENABLEFLAGS(np.ndarray arr, int flags) + +cdef extern from "set_integer.h": + void computeXPCS(float*&, float*&) + + +# cdef int x +# set_integer_ref(x) +# return x +# +# +# cdef int[1] x +# set_integer_ptr(x) +# return x[0] +# +# +# cdef np.ndarray[int, ndim=1, mode='c'] x +# +# x = np.zeros((1,), dtype=np.int32) +# set_integer_ptr(&x[0]) +# return x[0] +# +# +# cdef int* x +# set_integer_ref_ptr(x) +# return x[0] +# +# +# cdef int* x +# set_integer_ptr_ptr(&x) +# return x[0] +# +# +# cdef np.ndarray[int, ndim=1, mode='c'] a +# +# a = np.zeros((4,), dtype=np.int32) +# set_integer_arr_ptr(&a[0]) +# return a + + +cpdef doXPCS(np.ndarray[np.float32_t, ndim=3] in_ptr, np.ndarray[np.float32_t, ndim=2] out_ptr): + #cdef: + # float* in_ptr + # float* out_ptr + # np.npy_intp shape[2] + + computeXPCS(&in_ptr[0,0,0], &out_ptr[0,0]) + + # 1. Make sure that you have called np.import_array() + # http://gael-varoquaux.info/programming/ + # cython-example-of-exposing-c-computed-arrays-in-python-without-data-copies.html + # 2. OWNDATA flag is important. It tells the NumPy to free data when the python object is deleted. + # https://stackoverflow.com/questions/23872946/force-numpy-ndarray-to-take-ownership-of-its-memory-in-cython/ + # You can verify that the memory gets freed when Python object is deleted by using tools such as pmap. 
+    #shape[0] = <np.npy_intp>(2)
+    #shape[1] = <np.npy_intp>(2)
+
+    #cdef np.ndarray[float, ndim=2] a = np.PyArray_SimpleNewFromData(2, shape, np.NPY_FLOAT, out_ptr)
+    #PyArray_ENABLEFLAGS(a, np.NPY_OWNDATA)
+    return 1
\ No newline at end of file
diff --git a/myxpcs/source/set_integer.cpp b/myxpcs/source/set_integer.cpp
new file mode 100644
index 0000000..948bd28
--- /dev/null
+++ b/myxpcs/source/set_integer.cpp
@@ -0,0 +1,30 @@
+#include <iostream>
+#include <memory>
+
+#include <data.h>
+#include <set_integer.h>
+
+// taskflow
+#include <taskflow_/taskflow.hpp>
+
+
+void computeXPCS(float* in, float* out)
+{
+  tf::Executor executor;
+
+  // Placeholder dimensions; the intent is to derive them from the input shape.
+  //const auto dims = (*in).shape;
+  const std::size_t fs = 10;           // dims[0]
+  const std::size_t ss = 10;           // dims[1]
+  const std::size_t memoryCells = 55;  // dims[2]
+
+  auto mem = std::make_shared<Storage<float>>(std::vector<std::size_t>{fs, ss, memoryCells});
+  std::cout << "computeXPCS: allocated input storage\n";
+  std::shared_ptr<Storage<float>> data = TranposeFromImageToTime_v3_block_tf_no_struct_one_taskflow<float>(mem, 3, 3, executor);
+  std::cout << "computeXPCS: transpose finished\n";
+  std::cout << in[0] << "\n";
+  out[0] = 1;  // placeholder outputs until the real XPCS result is written back
+  out[1] = 2;
+  out[2] = 3;
+  out[3] = 4;
+}
\ No newline at end of file
--
GitLab
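
For reference, a minimal sketch of why utility/traits.hpp ships the move-on-copy wrapper MoC: std::function only stores copy-constructible callables, so a lambda that owns a move-only object can still be stored by routing that object through MoC, whose copy constructor actually moves. The std::promise and the surrounding main() below are purely illustrative and not part of the patch; the include path assumes myxpcs/include is on the compiler's include path.

#include <functional>
#include <future>
#include <iostream>

#include <taskflow_/taskflow.hpp>   // brings in tf::MoC / tf::make_moc from utility/traits.hpp

int main() {
  std::promise<int> p;                 // move-only type
  std::future<int> f = p.get_future();

  // MoC's copy constructor moves, so the lambda stays copy-constructible
  // and can be stored in a std::function despite owning the promise.
  auto moc = tf::make_moc(std::move(p));
  std::function<void()> task = [moc]() mutable { moc.get().set_value(42); };

  task();
  std::cout << f.get() << '\n';        // prints 42
}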
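
The UUID class in utility/uuid.hpp, together with the std::hash<tf::UUID> specialization at the bottom of that file, is enough to generate identifiers and use them as associative-container keys. A small sketch, again assuming myxpcs/include is on the include path:

#include <iostream>
#include <unordered_map>

#include <taskflow_/utility/uuid.hpp>

int main() {
  tf::UUID id;                         // random 128-bit value; the constructor sets the variant/version bits
  std::cout << id.to_string() << '\n'; // canonical 8-4-4-4-12 hexadecimal form

  // operator== and std::hash<tf::UUID> make UUIDs usable as unordered_map keys.
  std::unordered_map<tf::UUID, int> table;
  table[id] = 1;
  std::cout << table.at(id) << '\n';   // prints 1
}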
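
As written, computeXPCS ignores the input shape (the dimensions are hard-coded placeholders) and writes four placeholder values to out, so a caller only needs a non-empty input buffer and at least four floats of output. Below is a minimal C++ driver sketch; the prototype is spelled out to match the definition in set_integer.cpp (in the real build it would come from set_integer.h and be reached from Python via doXPCS in function_call.pyx), and the buffer sizes mirror the hard-coded fs/ss/memoryCells placeholders rather than any real detector geometry.

#include <iostream>
#include <vector>

// Matches the definition in set_integer.cpp.
void computeXPCS(float* in, float* out);

int main() {
  std::vector<float> in(10 * 10 * 55, 0.0f);  // fs * ss * memoryCells placeholders
  std::vector<float> out(4, 0.0f);            // computeXPCS writes out[0]..out[3]

  computeXPCS(in.data(), out.data());

  for (float v : out) std::cout << v << ' ';  // placeholder result: 1 2 3 4
  std::cout << '\n';
}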