24 #include <cudf/utilities/export.hpp>
26 #include <thrust/host_vector.h>
32 namespace CUDF_EXPORT
cudf {
33 namespace io::parquet::experimental::detail {
38 class hybrid_scan_reader_impl;
45 namespace CUDF_EXPORT
cudf {
46 namespace io::parquet::experimental {
364 [[nodiscard]] std::pair<std::vector<byte_range_info>, std::vector<byte_range_info>>
445 std::vector<rmm::device_buffer> column_chunk_buffers,
475 std::vector<rmm::device_buffer> column_chunk_buffers,
482 std::unique_ptr<detail::hybrid_scan_reader_impl> _impl;
A non-owning, immutable view of device data as a column of elements, some of which may be null as indicated by a bitmask.
The experimental parquet reader class to optimally read parquet files subject to highly selective fil...
std::unique_ptr< cudf::column > build_row_mask_with_page_index_stats(cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
Builds a boolean column indicating which rows survive the page statistics in the page index.
std::vector< size_type > filter_row_groups_with_dictionary_pages(cudf::host_span< rmm::device_buffer > dictionary_page_data, cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options, rmm::cuda_stream_view stream) const
Filter the row groups using column chunk dictionary pages.
void setup_page_index(cudf::host_span< uint8_t const > page_index_bytes) const
Set up the page index within the Parquet file metadata (FileMetaData)
std::vector< byte_range_info > payload_column_chunks_byte_ranges(cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options) const
Get byte ranges of column chunks of payload columns.
byte_range_info page_index_byte_range() const
Get the byte range of the page index in the Parquet file.
hybrid_scan_reader(cudf::host_span< uint8_t const > footer_bytes, parquet_reader_options const &options)
Constructor for the experimental parquet reader class to optimally read Parquet files subject to high...
std::pair< std::vector< byte_range_info >, std::vector< byte_range_info > > secondary_filters_byte_ranges(cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options) const
Get byte ranges of bloom filters and dictionary pages (secondary filters) for row group pruning.
table_with_metadata materialize_payload_columns(cudf::host_span< size_type const > row_group_indices, std::vector< rmm::device_buffer > column_chunk_buffers, cudf::column_view row_mask, use_data_page_mask mask_data_pages, parquet_reader_options const &options, rmm::cuda_stream_view stream) const
Materializes payload columns and applies the row mask to the output table.
FileMetaData parquet_metadata() const
Get the Parquet file footer metadata.
std::vector< size_type > filter_row_groups_with_bloom_filters(cudf::host_span< rmm::device_buffer > bloom_filter_data, cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options, rmm::cuda_stream_view stream) const
Filter the row groups using column chunk bloom filters.
std::vector< size_type > filter_row_groups_with_stats(cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options, rmm::cuda_stream_view stream) const
Filter the input row groups using column chunk statistics.
std::vector< size_type > all_row_groups(parquet_reader_options const &options) const
Get all available row groups from the parquet file.
table_with_metadata materialize_filter_columns(cudf::host_span< size_type const > row_group_indices, std::vector< rmm::device_buffer > column_chunk_buffers, cudf::mutable_column_view row_mask, use_data_page_mask mask_data_pages, parquet_reader_options const &options, rmm::cuda_stream_view stream) const
Materializes filter columns and updates the input row mask to only the rows that exist in the output ...
size_type total_rows_in_row_groups(cudf::host_span< size_type const > row_group_indices) const
Get the total number of top-level rows in the row groups.
std::vector< byte_range_info > filter_column_chunks_byte_ranges(cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options) const
Get byte ranges of column chunks of filter columns.
~hybrid_scan_reader()
Destructor for the experimental parquet reader class.
Settings for read_parquet().
Stores offset and size used to indicate a byte range
A non-owning, mutable view of device data as a column of elements, some of which may be null as indicated by a bitmask.
use_data_page_mask
Whether to compute and use a page mask using the row mask to skip decompression and decoding of the m...
@ YES
Compute and use a data page mask.
@ NO
Do not compute or use a data page mask.
cuda::mr::async_resource_ref< cuda::mr::device_accessible > device_async_resource_ref
int32_t size_type
Row index type for columns and tables.
cuDF-IO API type definitions
Parquet footer schema structs.
C++20 std::span with reduced feature set.
Type declarations for libcudf.