sort_merge_join.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #pragma once
7 
8 #include <cudf/column/column.hpp>
10 #include <cudf/join/join.hpp>
12 #include <cudf/types.hpp>
13 #include <cudf/utilities/export.hpp>
15 
16 #include <rmm/cuda_stream_view.hpp>
17 
18 #include <thrust/iterator/counting_iterator.h>
19 
20 #include <optional>
21 #include <variant>
22 
23 namespace CUDF_EXPORT cudf {
24 
35  public:
36  sort_merge_join() = delete;  // no default construction; the constructor listed for this class takes the right table
37  sort_merge_join(sort_merge_join const&) = delete;  // not copy-constructible
38  sort_merge_join(sort_merge_join&&) = delete;  // not move-constructible
39  sort_merge_join& operator=(sort_merge_join const&) = delete;  // not copy-assignable
40  sort_merge_join& operator=(sort_merge_join&&) = delete;  // not move-assignable
41 
56  sorted is_right_sorted,
57  null_equality compare_nulls = null_equality::EQUAL,
59 
75  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
76  std::unique_ptr<rmm::device_uvector<size_type>>>
77  inner_join(table_view const& left,
78  sorted is_left_sorted,
81 
96  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
97  std::unique_ptr<rmm::device_uvector<size_type>>>
98  left_join(table_view const& left,
99  sorted is_left_sorted,
102 
126  table_view const& left,
127  sorted is_left_sorted,
130 
177  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
178  std::unique_ptr<rmm::device_uvector<size_type>>>
180  cudf::join_partition_context const& context,
183 
184  private:
188  struct preprocessed_table {  // per-table state: two views, four optional derived values, and the helpers declared below
189  table_view _table_view;  // NOTE(review): presumably the caller's original (unprocessed) view — confirm
190 
191  table_view
192  _null_processed_table_view;  // NOTE(review): presumably the view used after null handling — confirm
195 
196  std::optional<rmm::device_buffer> _validity_mask =
197  std::nullopt;  // defaults to empty; the name suggests a row-validity mask — TODO confirm
198  std::optional<size_type> _num_nulls =
199  std::nullopt;  // defaults to empty
200  std::optional<std::unique_ptr<table>> _null_processed_table =
201  std::nullopt;  // defaults to empty; when engaged it holds an owning pointer to a table
202 
203  std::optional<std::unique_ptr<column>> _null_processed_table_sorted_order =
204  std::nullopt;  // defaults to empty; when engaged it holds an owning pointer to a column (presumably a sort order — confirm)
205 
212  void populate_nonnull_filter(rmm::cuda_stream_view stream);  // NOTE(review): presumably fills _validity_mask and _num_nulls — confirm
213 
219  void apply_nonnull_filter(rmm::cuda_stream_view stream);  // NOTE(review): presumably produces _null_processed_table — confirm
220 
226  void preprocess_unprocessed_table(rmm::cuda_stream_view stream);  // NOTE(review): relationship to the other helpers is not visible here — confirm in the implementation
227 
233  void compute_sorted_order(rmm::cuda_stream_view stream);  // NOTE(review): presumably fills _null_processed_table_sorted_order — confirm
234 
242  rmm::device_uvector<size_type> map_table_to_unprocessed(rmm::cuda_stream_view stream);  // returns a device vector of size_type by value; presumably maps processed rows back to rows of _table_view — confirm
243  };
244  preprocessed_table preprocessed_left;  // state for the left table
245  preprocessed_table preprocessed_right;  // state for the right table; presumably filled by the constructor, which is described as pre-processing the right table — confirm
246  null_equality compare_nulls;  // null-comparison policy; presumably the constructor's compare_nulls argument — confirm
247 
256  void postprocess_indices(device_span<size_type> smaller_indices,  // spans are over non-const size_type and nothing is returned, so results presumably are written in place — confirm
257  device_span<size_type> larger_indices,  // NOTE(review): "smaller"/"larger" presumably refer to the relative sizes of the two joined tables — confirm
258  rmm::cuda_stream_view stream);  // CUDA stream handle passed by value
259 
284  template <typename MergeOperation>  // MergeOperation is deduced from the `op` argument
285  auto invoke_merge(table_view right_view,  // return type is deduced from the definition, which is not in this listing
286  table_view left_view,  // note the order: the right view comes before the left view
287  MergeOperation&& op,  // forwarding reference; NOTE(review): the signature `op` must satisfy is not visible here — confirm
288  rmm::cuda_stream_view stream);  // CUDA stream handle passed by value
289 };
290 
327 [[deprecated]] std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
328  std::unique_ptr<rmm::device_uvector<size_type>>>
330  cudf::table_view const& right_keys,
331  null_equality compare_nulls = null_equality::EQUAL,
334 
372 [[deprecated]] std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
373  std::unique_ptr<rmm::device_uvector<size_type>>>
375  cudf::table_view const& right_keys,
376  null_equality compare_nulls = null_equality::EQUAL,
379  // end of group
381 } // namespace CUDF_EXPORT cudf
Class that implements the sort-merge algorithm for table joins.
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > inner_join(table_view const &left, sorted is_left_sorted, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns the row indices that can be used to construct the result of performing an inner join between ...
cudf::join_match_context inner_join_match_context(table_view const &left, sorted is_left_sorted, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns context information about matches between the left and right tables.
sort_merge_join(table_view const &right, sorted is_right_sorted, null_equality compare_nulls=null_equality::EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream())
Construct a sort-merge join object that pre-processes the right table on creation,...
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > partitioned_inner_join(cudf::join_partition_context const &context, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Performs an inner join between a partition of the left table and the right table.
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > left_join(table_view const &left, sorted is_left_sorted, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns the row indices that can be used to construct the result of performing a left join between th...
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:189
Class definition for cudf::column.
Column view class definitions
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > merge_inner_join(cudf::table_view const &left_keys, cudf::table_view const &right_keys, null_equality compare_nulls=null_equality::EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns a pair of row index vectors corresponding to an inner join between the specified tables.
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > sort_merge_inner_join(cudf::table_view const &left_keys, cudf::table_view const &right_keys, null_equality compare_nulls=null_equality::EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns a pair of row index vectors corresponding to an inner join between the specified tables.
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
rmm::device_async_resource_ref get_current_device_resource_ref()
Get the current device memory resource reference.
detail::cccl_async_resource_ref< cuda::mr::resource_ref< cuda::mr::device_accessible > > device_async_resource_ref
null_equality
Enum to consider two nulls as equal or unequal.
Definition: types.hpp:140
sorted
Indicates whether a collection of values is known to be sorted.
Definition: types.hpp:156
cuDF interfaces
Definition: host_udf.hpp:26
Device version of C++20 std::span with reduced feature set.
Definition: span.hpp:323
Holds context information about matches between tables during a join operation.
Definition: join.hpp:64
Stores context information for partitioned join operations.
Definition: join.hpp:81
Class definitions for (mutable)_table_view
Type declarations for libcudf.