Multi-file variant of the experimental Hybrid Scan Parquet reader. More...

#include <hybrid_scan_multifile.hpp>

Public Member Functions
	hybrid_scan_multifile (cudf::host_span< cudf::host_span< uint8_t const > const > footer_bytes, parquet_reader_options const &options)
	Constructor for the multi-file experimental Parquet reader. More...

	hybrid_scan_multifile (cudf::host_span< FileMetaData const > parquet_metadata, parquet_reader_options const &options)
	Constructor for the multi-file experimental Parquet reader. More...

	~hybrid_scan_multifile ()
	Destructor for the multi-file experimental Parquet reader.

std::vector< FileMetaData >	parquet_metadatas () const
	Get parquet metadatas for all sources. More...

std::vector< byte_range_info >	page_index_byte_ranges () const
	Get byte ranges of the page index for all sources. More...

void	setup_page_indexes (cudf::host_span< cudf::host_span< uint8_t const > const > page_index_bytes) const
	Set up the per-source page index within each Parquet file metadata. More...

std::vector< std::vector< size_type > >	all_row_groups (parquet_reader_options const &options) const
	Get all available per-source row group indices from the parquet files. More...

size_type	total_rows_in_row_groups (cudf::host_span< std::vector< size_type > const > row_group_indices) const
	Get the total number of top-level rows in the per-source row groups. More...

void	reset_column_selection () const
	Resets the current column selection. More...

std::vector< std::vector< size_type > >	filter_row_groups_with_byte_range (cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options) const
	Filter the row groups using the byte range specified by [`bytes_to_skip`, `bytes_to_skip + bytes_to_read`) More...

std::vector< std::vector< size_type > >	filter_row_groups_with_stats (cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options, rmm::cuda_stream_view stream) const
	Filter the input row groups using column chunk statistics. More...

std::pair< std::vector< byte_range_info >, std::vector< byte_range_info > >	secondary_filters_byte_ranges (cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options) const
	Get byte ranges of bloom filters and dictionary pages (secondary filters) for row group pruning. More...

std::unique_ptr< cudf::column >	build_all_true_row_mask (cudf::host_span< std::vector< size_type > const > row_group_indices, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
	Builds an all-`true` boolean survival column whose size equals the total number of rows in the row groups. More...

std::unique_ptr< cudf::column >	build_row_mask_with_page_index_stats (cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
	Builds a boolean column indicating surviving rows using page-level statistics in the page index. More...

std::pair< std::vector< byte_range_info >, std::vector< size_type > >	all_column_chunks_byte_ranges (cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options) const
	Get byte ranges of column chunks of all (or selected) columns. More...

table_with_metadata	materialize_all_columns (cudf::host_span< std::vector< size_type > const > row_group_indices, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
	Materializes all (or selected) columns and returns the final output table. More...

std::vector< std::vector< std::vector< size_type > > >	construct_row_group_passes (cudf::host_span< std::vector< size_type > const > row_group_indices, std::size_t pass_read_limit) const
	Partition row groups into passes such that the amount of GPU memory required to read, decompress and decode a pass is bounded by the specified limit. More...

Detailed Description

Multi-file variant of the experimental Hybrid Scan Parquet reader.

Vectorizes hybrid_scan_reader APIs to support multiple Parquet sources. Inputs and outputs are indexed by source order, except for the row mask, which is a single BOOL8 column spanning all rows from all sources, concatenated in source order and then in row-group order within each source.

Note: Detailed usage documentation will be added once all APIs are in place. This reader will eventually move to hybrid_scan.hpp, and the existing single-file reader (hybrid_scan_reader) will become its subclass. It is kept separate here for now only to reduce noise.

Definition at line 52 of file hybrid_scan_multifile.hpp.

Constructor & Destructor Documentation

◆ hybrid_scan_multifile() [1/2]

cudf::io::parquet::experimental::hybrid_scan_multifile::hybrid_scan_multifile	(	cudf::host_span< cudf::host_span< uint8_t const > const >	footer_bytes,
		parquet_reader_options const &	options
	)

explicit

Constructor for the multi-file experimental Parquet reader.

Parameters

footer_bytes	Host span of Parquet file footer byte spans, one per source
options	Parquet reader options

◆ hybrid_scan_multifile() [2/2]

cudf::io::parquet::experimental::hybrid_scan_multifile::hybrid_scan_multifile	(	cudf::host_span< FileMetaData const >	parquet_metadata,
		parquet_reader_options const &	options
	)

explicit

Constructor for the multi-file experimental Parquet reader.

Parameters

parquet_metadata	Host span of pre-populated Parquet file metadata, one per source
options	Parquet reader options

Member Function Documentation

◆ all_column_chunks_byte_ranges()

std::pair<std::vector<byte_range_info>, std::vector<size_type> > cudf::io::parquet::experimental::hybrid_scan_multifile::all_column_chunks_byte_ranges	(	cudf::host_span< std::vector< size_type > const >	row_group_indices,
		parquet_reader_options const &	options
	)		const

Get byte ranges of column chunks of all (or selected) columns.

Parameters

row_group_indices	Input row group indices, one inner vector per source
options	Parquet reader options

Returns: Pair of flattened byte ranges to column chunks of all (or selected) columns and their corresponding source indices

◆ all_row_groups()

std::vector<std::vector<size_type> > cudf::io::parquet::experimental::hybrid_scan_multifile::all_row_groups ( parquet_reader_options const & options ) const

Get all available per-source row group indices from the parquet files.

Parameters

options Parquet reader options

Returns: Vector of row group indices, one inner vector per source

◆ build_all_true_row_mask()

std::unique_ptr<cudf::column> cudf::io::parquet::experimental::hybrid_scan_multifile::build_all_true_row_mask	(	cudf::host_span< std::vector< size_type > const >	row_group_indices,
		rmm::cuda_stream_view	stream,
		rmm::device_async_resource_ref	mr
	)		const

Builds an all-true boolean survival column whose size equals the total number of rows in the row groups.

Parameters

row_group_indices	Input per-source row group indices (one inner vector per source)
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned column's device memory

Returns: An all-true boolean (survival) column spanning all selected rows across all sources

◆ build_row_mask_with_page_index_stats()

std::unique_ptr<cudf::column> cudf::io::parquet::experimental::hybrid_scan_multifile::build_row_mask_with_page_index_stats	(	cudf::host_span< std::vector< size_type > const >	row_group_indices,
		parquet_reader_options const &	options,
		rmm::cuda_stream_view	stream,
		rmm::device_async_resource_ref	mr
	)		const

Builds a boolean column indicating surviving rows using page-level statistics in the page index.

Parameters

row_group_indices	Input per-source row group indices (one inner vector per source)
options	Parquet reader options
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned column's device memory

Returns: A boolean column spanning all selected rows across all sources and indicating which filter column rows survive the statistics in the page index

◆ construct_row_group_passes()

std::vector<std::vector<std::vector<size_type> > > cudf::io::parquet::experimental::hybrid_scan_multifile::construct_row_group_passes	(	cudf::host_span< std::vector< size_type > const >	row_group_indices,
		std::size_t	pass_read_limit
	)		const

Partition row groups into passes such that the amount of GPU memory required to read, decompress and decode a pass is bounded by the specified limit.

Note that the pass_read_limit is a hint, not an absolute limit - if a single row group cannot fit within the limit given, it will still constitute a pass. The compressed row group size is estimated over all columns in each row group (not just the columns selected for reading), for conservative estimates.

Exceptions

std::invalid_argument if the input contains no row group indices

Parameters

row_group_indices	Input row group indices, one inner vector per source
pass_read_limit	Memory limit to read and decompress row group data, `0` if there is no limit (single pass)

Returns: Vector of per-source row group indices, one per constructed pass

◆ filter_row_groups_with_byte_range()

std::vector<std::vector<size_type> > cudf::io::parquet::experimental::hybrid_scan_multifile::filter_row_groups_with_byte_range	(	cudf::host_span< std::vector< size_type > const >	row_group_indices,
		parquet_reader_options const &	options
	)		const

Filter the row groups using the byte range specified by [bytes_to_skip, bytes_to_skip + bytes_to_read)

Filters the row groups such that only the row groups that start within the byte range are selected. Note that the last selected row group may end beyond the byte range.

Parameters

row_group_indices	Input row group indices, one inner vector per source
options	Parquet reader options

Returns: Filtered per-source row group indices (one inner vector per source)

◆ filter_row_groups_with_stats()

std::vector<std::vector<size_type> > cudf::io::parquet::experimental::hybrid_scan_multifile::filter_row_groups_with_stats	(	cudf::host_span< std::vector< size_type > const >	row_group_indices,
		parquet_reader_options const &	options,
		rmm::cuda_stream_view	stream
	)		const

Filter the input row groups using column chunk statistics.

Parameters

row_group_indices	Input row group indices, one inner vector per source
options	Parquet reader options
stream	CUDA stream used for device memory operations and kernel launches

Returns: Filtered row group indices, one inner vector per source

◆ materialize_all_columns()

table_with_metadata cudf::io::parquet::experimental::hybrid_scan_multifile::materialize_all_columns	(	cudf::host_span< std::vector< size_type > const >	row_group_indices,
		cudf::host_span< cudf::device_span< uint8_t const > const >	column_chunk_data,
		parquet_reader_options const &	options,
		rmm::cuda_stream_view	stream,
		rmm::device_async_resource_ref	mr
	)		const

Materializes all (or selected) columns and returns the final output table.

Parameters

row_group_indices	Input row group indices, one inner vector per source
column_chunk_data	Flattened device spans of column chunk data, in the same order as the byte ranges returned by `all_column_chunks_byte_ranges`
options	Parquet reader options
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the device memory for the output table

Returns: Table of all materialized columns and metadata

◆ page_index_byte_ranges()

std::vector<byte_range_info> cudf::io::parquet::experimental::hybrid_scan_multifile::page_index_byte_ranges ( ) const

Get byte ranges of the page index for all sources.

Returns: Vector of page index byte ranges, one per source

◆ parquet_metadatas()

std::vector<FileMetaData> cudf::io::parquet::experimental::hybrid_scan_multifile::parquet_metadatas ( ) const

Get parquet metadatas for all sources.

Returns: Vector of parquet metadata, one per source

◆ reset_column_selection()

void cudf::io::parquet::experimental::hybrid_scan_multifile::reset_column_selection ( ) const

Resets the current column selection.

Resets the current column selection state, forcing column re-selection in subsequent filter, byte range, setup chunking, and materialization APIs. This is useful if the filter expression has been cascaded (AND-ed) to include new columns.

◆ secondary_filters_byte_ranges()

std::pair<std::vector<byte_range_info>, std::vector<byte_range_info> > cudf::io::parquet::experimental::hybrid_scan_multifile::secondary_filters_byte_ranges	(	cudf::host_span< std::vector< size_type > const >	row_group_indices,
		parquet_reader_options const &	options
	)		const

Get byte ranges of bloom filters and dictionary pages (secondary filters) for row group pruning.

Note: Device buffers for bloom filter byte ranges must be allocated using a 32-byte-aligned memory resource.

Parameters

row_group_indices	Input row group indices, one inner vector per source
options	Parquet reader options

Returns: Pair of vectors of byte ranges of the bloom filters and dictionary pages of column chunks subject to the filter predicate

◆ setup_page_indexes()

void cudf::io::parquet::experimental::hybrid_scan_multifile::setup_page_indexes ( cudf::host_span< cudf::host_span< uint8_t const > const > page_index_bytes ) const

Set up the per-source page index within each Parquet file metadata.

Parameters

page_index_bytes Host span of Parquet page index buffer bytes, one per source

◆ total_rows_in_row_groups()

size_type cudf::io::parquet::experimental::hybrid_scan_multifile::total_rows_in_row_groups ( cudf::host_span< std::vector< size_type > const > row_group_indices ) const

Get the total number of top-level rows in the per-source row groups.

Parameters

row_group_indices Input per-source row group indices (one inner vector per source)

Returns: Total number of top-level rows across all sources

The documentation for this class was generated from the following file:

hybrid_scan_multifile.hpp

Public Member Functions

Detailed Description

Constructor & Destructor Documentation

◆ hybrid_scan_multifile() [1/2]

◆ hybrid_scan_multifile() [2/2]

Member Function Documentation

◆ all_column_chunks_byte_ranges()

◆ all_row_groups()

◆ build_all_true_row_mask()

◆ build_row_mask_with_page_index_stats()

◆ construct_row_group_passes()

◆ filter_row_groups_with_byte_range()

◆ filter_row_groups_with_stats()

◆ materialize_all_columns()

◆ page_index_byte_ranges()

◆ parquet_metadatas()

◆ reset_column_selection()

◆ secondary_filters_byte_ranges()

◆ setup_page_indexes()

◆ total_rows_in_row_groups()