parquet.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #pragma once
7 
9 #include <cudf/io/detail/parquet.hpp>
10 #include <cudf/io/types.hpp>
12 #include <cudf/types.hpp>
13 #include <cudf/utilities/export.hpp>
15 
16 #include <memory>
17 #include <optional>
18 #include <string>
19 #include <utility>
20 #include <vector>
21 
22 namespace CUDF_EXPORT cudf {
23 namespace io {
/// Default cap on row group size in bytes: the largest `size_t` value, i.e. effectively no
/// byte-based cap.
30 constexpr size_t default_row_group_size_bytes =
31  std::numeric_limits<size_t>::max();
/// Default cap on the number of rows in a row group (one million).
32 constexpr size_type default_row_group_size_rows = 1'000'000;
/// Default cap on page size in bytes (512 * 1024).
33 constexpr size_t default_max_page_size_bytes = 512 * 1024;
/// Default truncation length used to initialize `_column_index_truncate_length`.
35 constexpr int32_t default_column_index_truncate_length = 64;
/// Default cap on dictionary size in bytes (1024 * 1024).
36 constexpr size_t default_max_dictionary_size = 1024 * 1024;
38 
/// @brief Reports whether `compression` is supported when reading Parquet.
/// NOTE(review): declaration only; the exact criteria live in the definition, which is not
/// visible here.
48 [[nodiscard]] bool is_supported_read_parquet(compression_type compression);
49 
/// @brief Reports whether `compression` is supported when writing Parquet.
/// NOTE(review): declaration only; the exact criteria live in the definition, which is not
/// visible here.
59 [[nodiscard]] bool is_supported_write_parquet(compression_type compression);
60 
62 
67  source_info _source;
68 
69  // Path in schema of column names to read; `nullopt` is all
70  std::optional<std::vector<std::string>> _column_names;
71  // Indices of top-level columns to read; `nullopt` is all (cannot be used alongside
72  // `_column_names`)
73  std::optional<std::vector<cudf::size_type>> _column_indices;
74 
75  // List of individual row groups to read (ignored if empty)
76  std::vector<std::vector<size_type>> _row_groups;
77  // Number of rows to skip from the start; Parquet stores the number of rows as int64_t
78  int64_t _skip_rows = 0;
79  // Number of rows to read; `nullopt` is all
80  std::optional<int64_t> _num_rows;
81 
82  // Read row groups that start at or after this byte offset into the source
83  size_t _skip_bytes = 0;
84  // Read row groups that start before _num_bytes bytes after _skip_bytes into the source
85  std::optional<size_t> _num_bytes;
86 
87  // Predicate filter as AST to filter output rows.
88  std::optional<std::reference_wrapper<ast::expression const>> _filter;
89 
90  // Whether to store string data as categorical type
91  bool _convert_strings_to_categories = false;
92  // Whether to use PANDAS metadata to load columns
93  bool _use_pandas_metadata = true;
94  // Whether to read and use ARROW schema
95  bool _use_arrow_schema = true;
96  // Whether to allow reading matching select columns from mismatched Parquet files.
97  bool _allow_mismatched_pq_schemas = false;
98  // Whether to ignore non-existent projected columns
99  bool _ignore_missing_columns = true;
100  // Cast timestamp columns to a specific type
101  data_type _timestamp_type{type_id::EMPTY};
102  // Cast decimal columns to a specific width
103  type_id _decimal_width{type_id::EMPTY};
104  // Whether to use JIT compilation for filtering
105  bool _use_jit_filter = false;
106  // Whether column name matching is case sensitive. In case of multiple
107  // case-insensitive matches, the first matched column is selected
108  bool _case_sensitive_names = true;
109 
110  std::optional<std::vector<reader_column_schema>> _reader_column_schema;
111 
/// @brief Constructor from source info; moves `src` into `_source`.
/// Declared ahead of the `public:` label, so it is not part of the public interface.
117  explicit parquet_reader_options(source_info src) : _source{std::move(src)} {}
118 
120 
121  public:
/// @brief Default constructor; every option keeps its in-class default initializer.
128  explicit parquet_reader_options() = default;
129 
138 
/// @brief Returns source info.
/// @return Const reference to the stored `_source`
144  [[nodiscard]] source_info const& get_source() const { return _source; }
145 
/// @brief Returns boolean depending on whether strings should be converted to categories.
/// @return Current value of `_convert_strings_to_categories` (defaults to `false`)
151  [[nodiscard]] bool is_enabled_convert_strings_to_categories() const
152  {
153  return _convert_strings_to_categories;
154  }
155 
/// @brief Returns boolean depending on whether to use pandas metadata while reading.
/// @return Current value of `_use_pandas_metadata` (defaults to `true`)
161  [[nodiscard]] bool is_enabled_use_pandas_metadata() const { return _use_pandas_metadata; }
162 
/// @brief Returns boolean depending on whether to use arrow schema while reading.
/// @return Current value of `_use_arrow_schema` (defaults to `true`)
168  [[nodiscard]] bool is_enabled_use_arrow_schema() const { return _use_arrow_schema; }
169 
/// @brief Returns whether reading matching select columns from mismatched Parquet files is
/// allowed.
/// @return Current value of `_allow_mismatched_pq_schemas` (defaults to `false`)
177  [[nodiscard]] bool is_enabled_allow_mismatched_pq_schemas() const
178  {
179  return _allow_mismatched_pq_schemas;
180  }
181 
/// @brief Returns boolean depending on whether to ignore non-existent projected columns while
/// reading.
/// @return Current value of `_ignore_missing_columns` (defaults to `true`)
189  [[nodiscard]] bool is_enabled_ignore_missing_columns() const { return _ignore_missing_columns; }
190 
/// @brief Returns the optional reader column schema.
/// @return A copy of `_reader_column_schema` (returned by value, not by reference); empty when
/// no schema was set
196  [[nodiscard]] std::optional<std::vector<reader_column_schema>> get_column_schema() const
197  {
198  return _reader_column_schema;
199  }
200 
/// @brief Returns number of rows to skip from the start.
/// @return Current value of `_skip_rows` (defaults to 0)
206  [[nodiscard]] int64_t get_skip_rows() const { return _skip_rows; }
207 
/// @brief Returns number of rows to read.
/// @return Const reference to `_num_rows`; `nullopt` means all rows
214  [[nodiscard]] std::optional<int64_t> const& get_num_rows() const { return _num_rows; }
215 
/// @brief Returns bytes to skip before starting reading row groups.
/// @return Current value of `_skip_bytes` (defaults to 0)
222  [[nodiscard]] size_t get_skip_bytes() const { return _skip_bytes; }
223 
/// @brief Returns number of bytes after skipping to end reading row groups at.
/// @return Const reference to `_num_bytes`; empty when no byte limit was set
230  [[nodiscard]] std::optional<size_t> const& get_num_bytes() const { return _num_bytes; }
231 
/// @brief Returns names of columns to be read, if set.
/// Deprecated: returns the same `_column_names` member as `get_column_names`, which should be
/// used instead.
/// @return Const reference to the optional list of column names
237  [[nodiscard]] [[deprecated("Use `get_column_names` instead.")]] auto const& get_columns() const
238  {
239  return _column_names;
240  }
241 
/// @brief Returns names of columns to be read, if set.
/// @return Const reference to `_column_names`; `nullopt` means all columns
247  [[nodiscard]] auto const& get_column_names() const { return _column_names; }
248 
/// @brief Returns indices of top-level columns to be read, if set.
/// @return Const reference to `_column_indices`; `nullopt` means all columns
254  [[nodiscard]] auto const& get_column_indices() const { return _column_indices; }
255 
/// @brief Returns list of individual row groups to be read.
/// @return Const reference to `_row_groups`; per the member's comment it is ignored if empty
261  [[nodiscard]] auto const& get_row_groups() const { return _row_groups; }
262 
/// @brief Returns AST based filter for predicate pushdown.
/// @return Const reference to `_filter`, an optional `std::reference_wrapper` around the
/// expression; empty when no filter was set
268  [[nodiscard]] auto const& get_filter() const { return _filter; }
269 
/// @brief Returns timestamp type used to cast timestamp columns.
/// @return Current value of `_timestamp_type` (initialized to `type_id::EMPTY`)
275  [[nodiscard]] data_type get_timestamp_type() const { return _timestamp_type; }
276 
/// @brief Returns decimal width used to cast decimal columns.
/// @return Current value of `_decimal_width` (initialized to `type_id::EMPTY`)
282  [[nodiscard]] type_id get_decimal_width() const { return _decimal_width; }
283 
/// @brief Returns whether to use JIT compilation for filtering.
/// @return Current value of `_use_jit_filter` (defaults to `false`)
289  [[nodiscard]] bool is_enabled_use_jit_filter() const { return _use_jit_filter; }
290 
/// @brief Returns whether column name matching is case sensitive.
/// @return Current value of `_case_sensitive_names` (defaults to `true`)
299  [[nodiscard]] bool is_enabled_case_sensitive_names() const { return _case_sensitive_names; }
300 
/// @brief Set a new source location.
/// @param src New source info; taken by value and moved into `_source`
306  void set_source(source_info src) { _source = std::move(src); }
307 
/// @brief Sets the names of columns to be read from all input sources.
/// Deprecated: forwards its argument unchanged to `set_column_names`, so the same check against
/// previously set column indices applies.
/// @param column_names Names of the columns to read
330  [[deprecated("Use `set_column_names` instead.")]] void set_columns(
331  std::vector<std::string> column_names)
332  {
333  set_column_names(std::move(column_names));
334  }
335 
/// @brief Sets the names of columns to be read from all input sources.
/// Selection by name and by index are mutually exclusive: the `CUDF_EXPECTS` check rejects the
/// call when `_column_indices` already holds a value.
/// @param column_names Names of the columns to read; moved into `_column_names`
356  void set_column_names(std::vector<std::string> column_names)
357  {
358  CUDF_EXPECTS(not _column_indices.has_value(),
359  "Cannot select columns by indices and names simultaneously");
360  _column_names = std::move(column_names);
361  }
362 
/// @brief Sets the indices of top-level columns to be read from all input sources.
/// Selection by index and by name are mutually exclusive: the `CUDF_EXPECTS` check rejects the
/// call when `_column_names` already holds a value.
/// @param col_indices Indices of the top-level columns to read; moved into `_column_indices`
374  void set_column_indices(std::vector<cudf::size_type> col_indices)
375  {
376  CUDF_EXPECTS(not _column_names.has_value(),
377  "Cannot select columns by indices and names simultaneously");
378  _column_indices = std::move(col_indices);
379  }
380 
/// @brief Specifies which row groups to read from each input source.
/// Declaration only; the definition (and any validation it performs) is not visible here.
/// @param row_groups One list of row group indices per input source
407  void set_row_groups(std::vector<std::vector<size_type>> row_groups);
408 
/// @brief Sets AST based filter for predicate pushdown.
/// `_filter` holds a `std::reference_wrapper`, so only a reference to `filter` is stored, not a
/// copy; the expression must stay alive for as long as these options are used.
/// @param filter AST expression to use as the filter
439  void set_filter(ast::expression const& filter) { _filter = filter; }
440 
/// @brief Sets to enable/disable conversion of strings to categories.
/// @param val New value stored in `_convert_strings_to_categories`
446  void enable_convert_strings_to_categories(bool val) { _convert_strings_to_categories = val; }
447 
/// @brief Sets to enable/disable use of pandas metadata to read.
/// @param val New value stored in `_use_pandas_metadata`
453  void enable_use_pandas_metadata(bool val) { _use_pandas_metadata = val; }
454 
/// @brief Sets to enable/disable use of arrow schema to read.
/// @param val New value stored in `_use_arrow_schema`
460  void enable_use_arrow_schema(bool val) { _use_arrow_schema = val; }
461 
/// @brief Sets to enable/disable reading of matching select columns from mismatched Parquet
/// files.
/// @param val New value stored in `_allow_mismatched_pq_schemas`
469  void enable_allow_mismatched_pq_schemas(bool val) { _allow_mismatched_pq_schemas = val; }
470 
/// @brief Sets to enable/disable ignoring of non-existent projected columns while reading.
/// @param val New value stored in `_ignore_missing_columns`
477  void enable_ignore_missing_columns(bool val) { _ignore_missing_columns = val; }
478 
/// @brief Sets reader column schema.
/// @param val Column schema entries; taken by value and moved into `_reader_column_schema`
485  void set_column_schema(std::vector<reader_column_schema> val)
486  {
487  _reader_column_schema = std::move(val);
488  }
489 
/// @brief Sets number of rows to skip.
/// Declaration only; any validation happens in the definition, which is not visible here.
/// @param val Number of rows to skip from the start
495  void set_skip_rows(int64_t val);
496 
/// @brief Sets number of rows to read.
/// Declaration only; any validation happens in the definition, which is not visible here.
/// @param val Number of rows to read
505  void set_num_rows(int64_t val);
506 
/// @brief Sets bytes to skip before starting reading row groups.
/// Declaration only; any validation happens in the definition, which is not visible here.
/// @param val Number of bytes to skip
512  void set_skip_bytes(size_t val);
513 
/// @brief Sets number of bytes after skipping to end reading row groups at.
/// Declaration only; any validation happens in the definition, which is not visible here.
/// @param val Number of bytes after the skipped bytes
519  void set_num_bytes(size_t val);
520 
/// @brief Sets timestamp_type used to cast timestamp columns.
/// @param type New value stored in `_timestamp_type`
526  void set_timestamp_type(data_type type) { _timestamp_type = type; }
527 
/// @brief Sets decimal width used to cast decimal columns.
/// @param width New value stored in `_decimal_width`
534  void set_decimal_width(type_id width) { _decimal_width = width; }
535 
/// @brief Sets whether column name matching is case sensitive.
/// Per the member's comment, with multiple case-insensitive matches the first matched column is
/// selected.
/// @param val New value stored in `_case_sensitive_names`
544  void enable_case_sensitive_names(bool val) { _case_sensitive_names = val; }
545 };
546 
551  parquet_reader_options options;
552 
553  public:
561 
/// @brief Constructor from source info.
/// @param src Source info; moved into the `parquet_reader_options` being built
567  explicit parquet_reader_options_builder(source_info src) : options{std::move(src)} {}
568 
/// @brief Sets names of the columns to be read.
/// Deprecated: forwards its argument unchanged to `column_names`, which should be used instead.
/// @param column_names Names of the columns to read
/// @return this for chaining
577  [[deprecated("Use `column_names` instead.")]] parquet_reader_options_builder& columns(
578  std::vector<std::string> column_names)
579  {
580  return this->column_names(std::move(column_names));
581  }
582 
/// @brief Sets names of the columns to be read.
/// Goes through `options.set_column_names`, so that setter's check against previously set
/// column indices applies here too.
/// @param column_names Names of the columns to read
/// @return this for chaining
589  parquet_reader_options_builder& column_names(std::vector<std::string> column_names)
590  {
591  options.set_column_names(std::move(column_names));
592  return *this;
593  }
594 
/// @brief Sets the indices of top-level columns to be read from all input sources.
/// Goes through `options.set_column_indices`, so that setter's check against previously set
/// column names applies here too.
/// @param col_indices Indices of the top-level columns to read
/// @return this for chaining
601  parquet_reader_options_builder& column_indices(std::vector<cudf::size_type> col_indices)
602  {
603  options.set_column_indices(std::move(col_indices));
604  return *this;
605  }
606 
/// @brief Specifies which row groups to read from each input source.
/// Delegates to `options.set_row_groups`.
/// @param row_groups One list of row group indices per input source
/// @return this for chaining
611  parquet_reader_options_builder& row_groups(std::vector<std::vector<size_type>> row_groups)
612  {
613  options.set_row_groups(std::move(row_groups));
614  return *this;
615  }
616 
622  {
623  options.set_filter(filter);
624  return *this;
625  }
626 
634  {
635  options._convert_strings_to_categories = val;
636  return *this;
637  }
638 
646  {
647  options._use_pandas_metadata = val;
648  return *this;
649  }
650 
658  {
659  options._use_arrow_schema = val;
660  return *this;
661  }
662 
673  {
674  options._allow_mismatched_pq_schemas = val;
675  return *this;
676  }
677 
686  {
687  options._ignore_missing_columns = val;
688  return *this;
689  }
690 
/// @brief Sets reader column schema.
/// Assigns `options._reader_column_schema` directly rather than calling the options' setter.
/// @param val Column schema entries; moved into the options
/// @return this for chaining
697  parquet_reader_options_builder& set_column_schema(std::vector<reader_column_schema> val)
698  {
699  options._reader_column_schema = std::move(val);
700  return *this;
701  }
702 
710  {
711  options.set_skip_rows(val);
712  return *this;
713  }
714 
725  {
726  options.set_num_rows(val);
727  return *this;
728  }
729 
737  {
738  options.set_skip_bytes(val);
739  return *this;
740  }
741 
749  {
750  options.set_num_bytes(val);
751  return *this;
752  }
753 
761  {
762  options._timestamp_type = type;
763  return *this;
764  }
765 
774  {
775  options._decimal_width = width;
776  return *this;
777  }
778 
786  {
787  options._use_jit_filter = use_jit_filter;
788  return *this;
789  }
790 
801  {
802  options._case_sensitive_names = val;
803  return *this;
804  }
805 
/// @brief Implicit conversion to an rvalue reference to the internal `options`, so the built
/// options can be moved out of the builder. Same result as `build()`.
809  operator parquet_reader_options&&() { return std::move(options); }
810 
/// @brief Hands out the built `parquet_reader_options` so the caller can move from it.
/// @return Rvalue reference to the builder's internal `options` member
818  parquet_reader_options&& build() { return std::move(options); }
819 };
820 
842  parquet_reader_options const& options,
845 
871  std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
872  std::vector<parquet::FileMetaData>&& parquet_metadatas,
873  parquet_reader_options const& options,
876 
887  public:
895 
910  std::size_t chunk_read_limit,
911  parquet_reader_options const& options,
914 
932  std::size_t chunk_read_limit,
933  std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
934  std::vector<parquet::FileMetaData>&& parquet_metadatas,
935  parquet_reader_options const& options,
938 
959  std::size_t chunk_read_limit,
960  std::size_t pass_read_limit,
961  parquet_reader_options const& options,
964 
988  std::size_t chunk_read_limit,
989  std::size_t pass_read_limit,
990  std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
991  std::vector<parquet::FileMetaData>&& parquet_metadatas,
992  parquet_reader_options const& options,
995 
1004 
/// @brief Checks if there is any data in the given file that has not yet been read.
/// Declaration only; the definition is not visible here.
/// @return Boolean value indicating whether unread data remains
1010  [[nodiscard]] bool has_next() const;
1011 
/// @brief Reads a chunk of rows in the given Parquet file.
/// Declaration only; the definition is not visible here.
/// @return The chunk as a `table_with_metadata`
1023  [[nodiscard]] table_with_metadata read_chunk() const;
1024 
1025  private:
1026  std::unique_ptr<cudf::io::parquet::detail::chunked_reader> reader;
1027 };
1028  // end of group
1040  int column_idx{};
1041  bool is_descending{false};
1042  bool is_nulls_first{true};
1043 };
1044 
1049  // Specify the sink to use for writer output
1050  sink_info _sink;
1051  // Specify the compression format to use
1052  compression_type _compression = compression_type::SNAPPY;
1053  // Specify the level of statistics in the output file
1055  // Optional associated metadata
1056  std::optional<table_input_metadata> _metadata;
1057  // Optional footer key_value_metadata
1058  std::vector<std::map<std::string, std::string>> _user_data;
1059  // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS.
1060  // If true then overrides any per-column setting in _metadata.
1061  bool _write_timestamps_as_int96 = false;
1062  // Parquet writer can write timestamps as UTC
1063  // Defaults to true because libcudf timestamps are implicitly UTC
1064  bool _write_timestamps_as_UTC = true;
1065  // Whether to write ARROW schema
1066  bool _write_arrow_schema = false;
1067  // Maximum size of each row group (unless smaller than a single page)
1068  size_t _row_group_size_bytes = default_row_group_size_bytes;
1069  // Maximum number of rows in row group (unless smaller than a single page)
1070  size_type _row_group_size_rows = default_row_group_size_rows;
1071  // Maximum size of each page (uncompressed)
1072  size_t _max_page_size_bytes = default_max_page_size_bytes;
1073  // Maximum number of rows in a page
1074  size_type _max_page_size_rows = default_max_page_size_rows;
1075  // Maximum size of min or max values in column index
1076  int32_t _column_index_truncate_length = default_column_index_truncate_length;
1077  // When to use dictionary encoding for data
1078  dictionary_policy _dictionary_policy = dictionary_policy::ADAPTIVE;
1079  // Maximum size of column chunk dictionary (in bytes)
1080  size_t _max_dictionary_size = default_max_dictionary_size;
1081  // Maximum number of rows in a page fragment
1082  std::optional<size_type> _max_page_fragment_size;
1083  // Optional compression statistics
1084  std::shared_ptr<writer_compression_statistics> _compression_stats;
1085  // write V2 page headers?
1086  bool _v2_page_headers = false;
1087  // enable per-page compression decision for V2?
1088  bool _page_level_compression = false;
1089  // Which columns in _table are used for sorting
1090  std::optional<std::vector<sorting_column>> _sorting_columns;
1091 
1092  protected:
/// @brief Constructor from sink (protected).
/// @param sink Sink to use for writer output; moved into `_sink`
1098  explicit parquet_writer_options_base(sink_info sink) : _sink(std::move(sink)) {}
1099 
1100  public:
1107 
/// @brief Returns the sink used for writer output.
/// @return Const reference to `_sink`
1113  [[nodiscard]] sink_info const& get_sink() const { return _sink; }
1114 
/// @brief Returns compression format used.
/// @return Current value of `_compression` (defaults to `compression_type::SNAPPY`)
1120  [[nodiscard]] compression_type get_compression() const { return _compression; }
1121 
/// @brief Returns the level of statistics requested in the output file.
/// @return Current value of `_stats_level`
1127  [[nodiscard]] statistics_freq get_stats_level() const { return _stats_level; }
1128 
/// @brief Returns the optional associated metadata.
/// @return Const reference to `_metadata`; empty when no metadata was set
1134  [[nodiscard]] auto const& get_metadata() const { return _metadata; }
1135 
/// @brief Returns the optional footer key/value metadata.
/// @return Const reference to `_user_data`, a vector of string-to-string maps
1141  [[nodiscard]] std::vector<std::map<std::string, std::string>> const& get_key_value_metadata()
1142  const
1143  {
1144  return _user_data;
1145  }
1146 
/// @brief Returns true if timestamps will be written as INT96.
/// @return Current value of `_write_timestamps_as_int96` (defaults to `false`)
1152  [[nodiscard]] bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; }
1153 
/// @brief Returns true if timestamps will be written as UTC.
/// @return Current value of `_write_timestamps_as_UTC` (defaults to `true`)
1159  [[nodiscard]] auto is_enabled_utc_timestamps() const { return _write_timestamps_as_UTC; }
1160 
/// @brief Returns true if arrow schema will be written.
/// @return Current value of `_write_arrow_schema` (defaults to `false`)
1166  [[nodiscard]] auto is_enabled_write_arrow_schema() const { return _write_arrow_schema; }
1167 
/// @brief Returns maximum row group size, in bytes.
/// @return Current value of `_row_group_size_bytes`
1173  [[nodiscard]] auto get_row_group_size_bytes() const { return _row_group_size_bytes; }
1174 
/// @brief Returns maximum row group size, in rows.
/// @return Current value of `_row_group_size_rows`
1180  [[nodiscard]] auto get_row_group_size_rows() const { return _row_group_size_rows; }
1181 
/// @brief Returns the maximum uncompressed page size, in bytes.
/// The stored page size is capped at the row group size in bytes, so the result never exceeds
/// `get_row_group_size_bytes()`.
/// @return The smaller of `_max_page_size_bytes` and the row group size in bytes
1189  [[nodiscard]] auto get_max_page_size_bytes() const
1190  {
1191  return std::min(_max_page_size_bytes, get_row_group_size_bytes());
1192  }
1193 
/// @brief Returns the maximum number of rows in a page.
/// The stored value is capped at the row group size in rows, so the result never exceeds
/// `get_row_group_size_rows()`.
/// @return The smaller of `_max_page_size_rows` and the row group size in rows
1201  [[nodiscard]] auto get_max_page_size_rows() const
1202  {
1203  return std::min(_max_page_size_rows, get_row_group_size_rows());
1204  }
1205 
/// @brief Returns the maximum size of min or max values in the column index.
/// @return Current value of `_column_index_truncate_length`
1211  [[nodiscard]] auto get_column_index_truncate_length() const
1212  {
1213  return _column_index_truncate_length;
1214  }
1215 
/// @brief Returns policy for dictionary use.
/// @return Current value of `_dictionary_policy` (defaults to `dictionary_policy::ADAPTIVE`)
1221  [[nodiscard]] dictionary_policy get_dictionary_policy() const { return _dictionary_policy; }
1222 
/// @brief Returns maximum dictionary size, in bytes.
/// @return Current value of `_max_dictionary_size`
1228  [[nodiscard]] auto get_max_dictionary_size() const { return _max_dictionary_size; }
1229 
/// @brief Returns the maximum number of rows in a page fragment, if set.
/// @return Copy of the optional `_max_page_fragment_size`; empty when no value was set
1235  [[nodiscard]] auto get_max_page_fragment_size() const { return _max_page_fragment_size; }
1236 
/// @brief Returns a shared pointer to the user-provided compression statistics.
/// @return Copy of the `_compression_stats` shared pointer
1242  [[nodiscard]] std::shared_ptr<writer_compression_statistics> get_compression_statistics() const
1243  {
1244  return _compression_stats;
1245  }
1246 
/// @brief Returns true if V2 page headers should be written.
/// @return Current value of `_v2_page_headers` (defaults to `false`)
1252  [[nodiscard]] auto is_enabled_write_v2_headers() const { return _v2_page_headers; }
1253 
/// @brief Returns whether the per-page compression decision for V2 pages is enabled.
/// @return Current value of `_page_level_compression` (defaults to `false`)
1263  [[nodiscard]] auto is_enabled_page_level_compression() const { return _page_level_compression; }
1264 
/// @brief Returns the sorting_columns.
/// @return Const reference to the optional `_sorting_columns`; empty when none were set
1270  [[nodiscard]] auto const& get_sorting_columns() const { return _sorting_columns; }
1271 
1278 
/// @brief Sets the footer key/value metadata.
/// Declaration only; any validation happens in the definition, which is not visible here.
/// @param metadata Key/value maps (presumably one per output sink - TODO confirm)
1284  void set_key_value_metadata(std::vector<std::map<std::string, std::string>> metadata);
1285 
1298 
/// @brief Sets whether timestamps are written as INT96.
/// Per the `_write_timestamps_as_int96` comment, a true value overrides any per-column setting
/// in `_metadata`. Declaration only; the definition is not visible here.
/// @param req Boolean value to enable/disable writing of INT96 timestamps
1305  void enable_int96_timestamps(bool req);
1306 
/// @brief Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to true.
/// Declaration only; the definition is not visible here.
/// @param val Boolean value to enable/disable writing of timestamps as UTC
1312  void enable_utc_timestamps(bool val);
1313 
1320 
/// @brief Sets the maximum row group size, in bytes.
/// Declaration only; any validation happens in the definition, which is not visible here.
/// @param size_bytes Maximum row group size, in bytes
1326  void set_row_group_size_bytes(size_t size_bytes);
1327 
1334 
/// @brief Sets the maximum uncompressed page size, in bytes.
/// Declaration only; any validation happens in the definition, which is not visible here.
/// @param size_bytes Maximum uncompressed page size, in bytes
1340  void set_max_page_size_bytes(size_t size_bytes);
1341 
1348 
/// @brief Sets the maximum size of min or max values in the column index.
/// Declaration only; any validation happens in the definition, which is not visible here.
/// @param size_bytes Maximum length to keep (presumably in bytes, going by the parameter name)
1354  void set_column_index_truncate_length(int32_t size_bytes);
1355 
1362 
/// @brief Sets the maximum dictionary size, in bytes.
/// Declaration only; any validation happens in the definition, which is not visible here.
/// @param size_bytes Maximum dictionary size, in bytes
1368  void set_max_dictionary_size(size_t size_bytes);
1369 
1376 
/// @brief Sets the pointer to the output compression statistics.
/// Declaration only; the definition is not visible here.
/// @param comp_stats Shared pointer to the compression statistics object
1382  void set_compression_statistics(std::shared_ptr<writer_compression_statistics> comp_stats);
1383 
/// @brief Sets preference for V2 page headers. Write V2 page headers if set to true.
/// Declaration only; the definition is not visible here.
/// @param val Boolean value to enable/disable writing of V2 page headers
1389  void enable_write_v2_headers(bool val);
1390 
1401 
/// @brief Sets sorting columns.
/// Declaration only; the definition is not visible here.
/// @param sorting_columns Column sort order descriptions
1407  void set_sorting_columns(std::vector<sorting_column> sorting_columns);
1408 };
1409 
1413 template <class BuilderT, class OptionsT>
1415  OptionsT _options;
1416 
1417  protected:
/// @brief Returns a mutable reference to the options object held by this builder (protected).
/// @return Reference to `_options`
1423  inline OptionsT& get_options() { return _options; }
1424 
1430  explicit parquet_writer_options_builder_base(OptionsT options);
1431 
1432  public:
1439 
1446  BuilderT& metadata(table_input_metadata metadata);
1447 
1454  BuilderT& key_value_metadata(std::vector<std::map<std::string, std::string>> metadata);
1455 
1463 
1470  BuilderT& compression(compression_type compression);
1471 
1478  BuilderT& row_group_size_bytes(size_t val);
1479 
1487 
1498  BuilderT& max_page_size_bytes(size_t val);
1499 
1508 
1522  BuilderT& column_index_truncate_length(int32_t val);
1523 
1542 
1554  BuilderT& max_dictionary_size(size_t val);
1555 
1567 
1575  std::shared_ptr<writer_compression_statistics> const& comp_stats);
1576 
1583  BuilderT& int96_timestamps(bool enabled);
1584 
1591  BuilderT& utc_timestamps(bool enabled);
1592 
1599  BuilderT& write_arrow_schema(bool enabled);
1600 
1607  BuilderT& write_v2_headers(bool enabled);
1608 
1619  BuilderT& page_level_compression(bool enabled);
1620 
1627  BuilderT& sorting_columns(std::vector<sorting_column> sorting_columns);
1628 
1632  operator OptionsT&&();
1633 
1641  OptionsT&& build();
1642 };
1643 
1645 
1650  // Sets of columns to output
1651  table_view _table;
1652  // Partitions described as {start_row, num_rows} pairs
1653  std::vector<partition_info> _partitions;
1654  // Column chunks file paths to be set in the raw output metadata. One per output file
1655  std::vector<std::string> _column_chunks_file_paths;
1656 
1658 
1665  explicit parquet_writer_options(sink_info const& sink, table_view table);
1666 
1667  public:
1674 
1684 
1691 
/// @brief Returns the table view holding the columns to output.
/// @return Copy of `_table`
1697  [[nodiscard]] table_view get_table() const { return _table; }
1698 
/// @brief Returns the partitions, described as {start_row, num_rows} pairs.
/// @return Const reference to `_partitions`
1704  [[nodiscard]] std::vector<partition_info> const& get_partitions() const { return _partitions; }
1705 
/// @brief Returns the column chunks file paths to be set in the raw output metadata.
/// @return Const reference to `_column_chunks_file_paths` (one path per output file)
1711  [[nodiscard]] std::vector<std::string> const& get_column_chunks_file_paths() const
1712  {
1713  return _column_chunks_file_paths;
1714  }
1715 
1722  void set_partitions(std::vector<partition_info> partitions);
1723 
1730  void set_column_chunks_file_paths(std::vector<std::string> file_paths);
1731 };
1732 
1737  : public parquet_writer_options_builder_base<parquet_writer_options_builder,
1738  parquet_writer_options> {
1739  public:
1745  explicit parquet_writer_options_builder() = default;
1746 
1754 
1762  parquet_writer_options_builder& partitions(std::vector<partition_info> partitions);
1763 
1771  parquet_writer_options_builder& column_chunks_file_paths(std::vector<std::string> file_paths);
1772 };
1773 
1792 std::unique_ptr<std::vector<uint8_t>> write_parquet(
1794 
1804 std::unique_ptr<std::vector<uint8_t>> merge_row_group_metadata(
1805  std::vector<std::unique_ptr<std::vector<uint8_t>>> const& metadata_list);
1806 
1808 
1819 
1821 
1822  public:
1829 
1838 };
1839 
1844  : public parquet_writer_options_builder_base<chunked_parquet_writer_options_builder,
1845  chunked_parquet_writer_options> {
1846  public:
1853 
1860 };
1861 
1882  public:
1889 
1903 
1919  std::vector<partition_info> const& partitions = {});
1920 
/// @brief Finishes the chunked/streamed write process.
/// Declaration only; the definition is not visible here.
/// @param column_chunks_file_path Optional file paths (defaults to an empty list); presumably
/// the column chunk paths recorded in the output metadata - TODO confirm
/// @return Owning pointer to a byte buffer; NOTE(review): its contents and when it is null are
/// not shown in this header view
1930  std::unique_ptr<std::vector<uint8_t>> close(
1931  std::vector<std::string> const& column_chunks_file_path = {});
1932 
1934  std::unique_ptr<parquet::detail::writer> writer;
1935 };
1936  // end of group
1938 
1939 } // namespace io
1940 } // namespace CUDF_EXPORT cudf
Indicator for the logical data type of an element in a column.
Definition: types.hpp:278
The chunked parquet reader class to read a Parquet file iteratively into a series of tables,...
Definition: parquet.hpp:886
table_with_metadata read_chunk() const
Read a chunk of rows in the given Parquet file.
bool has_next() const
Check if there is any data in the given file that has not yet been read.
chunked_parquet_reader(std::size_t chunk_read_limit, std::vector< std::unique_ptr< cudf::io::datasource >> &&sources, std::vector< parquet::FileMetaData > &&parquet_metadatas, parquet_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Constructor for chunked reader using pre-existing Parquet datasources and file metadatas.
chunked_parquet_reader(std::size_t chunk_read_limit, std::size_t pass_read_limit, parquet_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Constructor for chunked reader.
chunked_parquet_reader(std::size_t chunk_read_limit, parquet_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Constructor for chunked reader.
chunked_parquet_reader(std::size_t chunk_read_limit, std::size_t pass_read_limit, std::vector< std::unique_ptr< cudf::io::datasource >> &&sources, std::vector< parquet::FileMetaData > &&parquet_metadatas, parquet_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Constructor for chunked reader using pre-existing Parquet datasources and file metadatas.
~chunked_parquet_reader()
Destructor, destroying the internal reader instance.
chunked_parquet_reader()
Default constructor, this should never be used.
Class to build chunked_parquet_writer_options.
Definition: parquet.hpp:1845
chunked_parquet_writer_options_builder()=default
Default constructor.
chunked_parquet_writer_options_builder(sink_info const &sink)
Constructor from sink.
Settings for chunked_parquet_writer.
Definition: parquet.hpp:1812
static chunked_parquet_writer_options_builder builder(sink_info const &sink)
Creates a builder to build chunked_parquet_writer_options.
chunked_parquet_writer_options()=default
Default constructor.
Chunked parquet writer class to handle options and write tables in chunks.
Definition: parquet.hpp:1881
~chunked_parquet_writer()
Default destructor. This is added to not leak detail API.
chunked_parquet_writer(chunked_parquet_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Constructor with chunked writer options.
std::unique_ptr< std::vector< uint8_t > > close(std::vector< std::string > const &column_chunks_file_path={})
Finishes the chunked/streamed write process.
std::unique_ptr< parquet::detail::writer > writer
Unique pointer to impl writer class.
Definition: parquet.hpp:1934
chunked_parquet_writer & write(table_view const &table, std::vector< partition_info > const &partitions={})
Writes table to output.
chunked_parquet_writer()
Default constructor, this should never be used. This is added just to satisfy cython....
Builds parquet_reader_options to use for read_parquet().
Definition: parquet.hpp:550
parquet_reader_options_builder & num_bytes(size_t val)
Sets number of bytes after skipping to end reading row groups at.
Definition: parquet.hpp:748
parquet_reader_options_builder & use_arrow_schema(bool val)
Sets to enable/disable use of arrow schema to read.
Definition: parquet.hpp:657
parquet_reader_options_builder(source_info src)
Constructor from source info.
Definition: parquet.hpp:567
parquet_reader_options_builder & decimal_width(type_id width)
Sets the decimal width used to cast decimal columns.
Definition: parquet.hpp:773
parquet_reader_options_builder & skip_rows(int64_t val)
Sets number of rows to skip.
Definition: parquet.hpp:709
parquet_reader_options_builder & allow_mismatched_pq_schemas(bool val)
Sets to enable/disable reading of matching projected and filter columns from mismatched Parquet sourc...
Definition: parquet.hpp:672
parquet_reader_options_builder & column_names(std::vector< std::string > column_names)
Sets names of the columns to be read.
Definition: parquet.hpp:589
parquet_reader_options_builder & ignore_missing_columns(bool val)
Sets to enable/disable ignoring of non-existent projected columns while reading.
Definition: parquet.hpp:685
parquet_reader_options_builder & skip_bytes(size_t val)
Sets bytes to skip before starting reading row groups.
Definition: parquet.hpp:736
parquet_reader_options_builder & timestamp_type(data_type type)
Sets timestamp_type used to cast timestamp columns.
Definition: parquet.hpp:760
parquet_reader_options_builder & use_pandas_metadata(bool val)
Sets to enable/disable use of pandas metadata to read.
Definition: parquet.hpp:645
parquet_reader_options_builder()=default
Default constructor.
parquet_reader_options_builder & num_rows(int64_t val)
Sets number of rows to read.
Definition: parquet.hpp:724
parquet_reader_options_builder & row_groups(std::vector< std::vector< size_type >> row_groups)
Specifies which row groups to read from each input source.
Definition: parquet.hpp:611
parquet_reader_options_builder & set_column_schema(std::vector< reader_column_schema > val)
Sets reader column schema.
Definition: parquet.hpp:697
parquet_reader_options_builder & columns(std::vector< std::string > column_names)
Sets names of the columns to be read.
Definition: parquet.hpp:577
parquet_reader_options_builder & column_indices(std::vector< cudf::size_type > col_indices)
Sets the indices of top-level columns to be read from all input sources.
Definition: parquet.hpp:601
parquet_reader_options && build()
Moves the parquet_reader_options member once it's built.
Definition: parquet.hpp:818
parquet_reader_options_builder & filter(ast::expression const &filter)
Sets AST based filter for predicate pushdown.
Definition: parquet.hpp:621
parquet_reader_options_builder & case_sensitive_names(bool val)
Sets whether column name matching is case sensitive.
Definition: parquet.hpp:800
parquet_reader_options_builder & use_jit_filter(bool use_jit_filter)
Enable/disable use of JIT for filter step.
Definition: parquet.hpp:785
parquet_reader_options_builder & convert_strings_to_categories(bool val)
Sets enable/disable conversion of strings to categories.
Definition: parquet.hpp:633
Settings for read_parquet().
Definition: parquet.hpp:66
data_type get_timestamp_type() const
Returns timestamp type used to cast timestamp columns.
Definition: parquet.hpp:275
parquet_reader_options()=default
Default constructor.
void enable_allow_mismatched_pq_schemas(bool val)
Sets to enable/disable reading of matching projected and filter columns from mismatched Parquet sourc...
Definition: parquet.hpp:469
void set_skip_rows(int64_t val)
Sets number of rows to skip.
bool is_enabled_use_jit_filter() const
Returns whether to use JIT compilation for filtering.
Definition: parquet.hpp:289
size_t get_skip_bytes() const
Returns bytes to skip before starting reading row groups.
Definition: parquet.hpp:222
bool is_enabled_ignore_missing_columns() const
Returns boolean depending on whether to ignore non-existent projected columns while reading.
Definition: parquet.hpp:189
static parquet_reader_options_builder builder(source_info src=source_info{})
Creates a parquet_reader_options_builder to build parquet_reader_options. By default,...
void enable_convert_strings_to_categories(bool val)
Sets to enable/disable conversion of strings to categories.
Definition: parquet.hpp:446
std::optional< std::vector< reader_column_schema > > get_column_schema() const
Returns optional tree of metadata.
Definition: parquet.hpp:196
void set_skip_bytes(size_t val)
Sets bytes to skip before starting reading row groups.
type_id get_decimal_width() const
Returns decimal width used to cast decimal columns.
Definition: parquet.hpp:282
void set_column_indices(std::vector< cudf::size_type > col_indices)
Sets the indices of top-level columns to be read from all input sources.
Definition: parquet.hpp:374
source_info const & get_source() const
Returns source info.
Definition: parquet.hpp:144
auto const & get_column_indices() const
Returns indices of top-level columns to be read, if set.
Definition: parquet.hpp:254
auto const & get_row_groups() const
Returns list of individual row groups to be read.
Definition: parquet.hpp:261
void set_decimal_width(type_id width)
Sets decimal width used to cast decimal columns.
Definition: parquet.hpp:534
void set_row_groups(std::vector< std::vector< size_type >> row_groups)
Specifies which row groups to read from each input source.
void enable_ignore_missing_columns(bool val)
Sets to enable/disable ignoring of non-existent projected columns while reading.
Definition: parquet.hpp:477
void set_source(source_info src)
Set a new source location.
Definition: parquet.hpp:306
auto const & get_columns() const
Returns names of columns to be read, if set.
Definition: parquet.hpp:237
void set_timestamp_type(data_type type)
Sets timestamp_type used to cast timestamp columns.
Definition: parquet.hpp:526
void set_column_names(std::vector< std::string > column_names)
Sets the names of columns to be read from all input sources.
Definition: parquet.hpp:356
std::optional< int64_t > const & get_num_rows() const
Returns number of rows to read.
Definition: parquet.hpp:214
bool is_enabled_convert_strings_to_categories() const
Returns boolean depending on whether strings should be converted to categories.
Definition: parquet.hpp:151
void set_columns(std::vector< std::string > column_names)
Sets the names of columns to be read from all input sources.
Definition: parquet.hpp:330
void set_num_rows(int64_t val)
Sets number of rows to read.
void enable_case_sensitive_names(bool val)
Sets whether column name matching is case sensitive.
Definition: parquet.hpp:544
void set_num_bytes(size_t val)
Sets number of bytes after skipping to end reading row groups at.
void enable_use_pandas_metadata(bool val)
Sets to enable/disable use of pandas metadata to read.
Definition: parquet.hpp:453
void enable_use_arrow_schema(bool val)
Sets to enable/disable use of arrow schema to read.
Definition: parquet.hpp:460
bool is_enabled_use_pandas_metadata() const
Returns boolean depending on whether to use pandas metadata while reading.
Definition: parquet.hpp:161
bool is_enabled_allow_mismatched_pq_schemas() const
Returns boolean depending on whether to read matching projected and filter columns from mismatched Pa...
Definition: parquet.hpp:177
void set_column_schema(std::vector< reader_column_schema > val)
Sets reader column schema.
Definition: parquet.hpp:485
bool is_enabled_use_arrow_schema() const
Returns boolean depending on whether to use arrow schema while reading.
Definition: parquet.hpp:168
void set_filter(ast::expression const &filter)
Sets AST based filter for predicate pushdown.
Definition: parquet.hpp:439
auto const & get_filter() const
Returns AST based filter for predicate pushdown.
Definition: parquet.hpp:268
std::optional< size_t > const & get_num_bytes() const
Returns number of bytes after skipping to end reading row groups at.
Definition: parquet.hpp:230
auto const & get_column_names() const
Returns names of columns to be read, if set.
Definition: parquet.hpp:247
int64_t get_skip_rows() const
Returns number of rows to skip from the start.
Definition: parquet.hpp:206
bool is_enabled_case_sensitive_names() const
Returns whether column name matching is case sensitive.
Definition: parquet.hpp:299
Base settings for write_parquet() and chunked_parquet_writer.
Definition: parquet.hpp:1048
void enable_utc_timestamps(bool val)
Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to true.
void enable_write_v2_headers(bool val)
Sets preference for V2 page headers. Write V2 page headers if set to true.
auto const & get_sorting_columns() const
Returns the sorting_columns.
Definition: parquet.hpp:1270
auto get_row_group_size_bytes() const
Returns maximum row group size, in bytes.
Definition: parquet.hpp:1173
bool is_enabled_int96_timestamps() const
Returns true if timestamps will be written as INT96.
Definition: parquet.hpp:1152
void set_metadata(table_input_metadata metadata)
Sets metadata.
void set_row_group_size_rows(size_type size_rows)
Sets the maximum row group size, in rows.
parquet_writer_options_base(sink_info sink)
Constructor from sink.
Definition: parquet.hpp:1098
void set_stats_level(statistics_freq sf)
Sets the level of statistics.
auto get_row_group_size_rows() const
Returns maximum row group size, in rows.
Definition: parquet.hpp:1180
parquet_writer_options_base()=default
Default constructor.
void set_max_page_size_bytes(size_t size_bytes)
Sets the maximum uncompressed page size, in bytes.
void set_sorting_columns(std::vector< sorting_column > sorting_columns)
Sets sorting columns.
auto is_enabled_write_arrow_schema() const
Returns true if arrow schema will be written.
Definition: parquet.hpp:1166
auto is_enabled_write_v2_headers() const
Returns true if V2 page headers should be written.
Definition: parquet.hpp:1252
void set_dictionary_policy(dictionary_policy policy)
Sets the policy for dictionary use.
auto get_max_page_size_bytes() const
Returns the maximum uncompressed page size, in bytes.
Definition: parquet.hpp:1189
void set_max_dictionary_size(size_t size_bytes)
Sets the maximum dictionary size, in bytes.
compression_type get_compression() const
Returns compression format used.
Definition: parquet.hpp:1120
auto get_max_dictionary_size() const
Returns maximum dictionary size, in bytes.
Definition: parquet.hpp:1228
void set_compression(compression_type compression)
Sets compression type.
dictionary_policy get_dictionary_policy() const
Returns policy for dictionary use.
Definition: parquet.hpp:1221
void set_compression_statistics(std::shared_ptr< writer_compression_statistics > comp_stats)
Sets the pointer to the output compression statistics.
std::shared_ptr< writer_compression_statistics > get_compression_statistics() const
Returns a shared pointer to the user-provided compression statistics.
Definition: parquet.hpp:1242
void set_max_page_size_rows(size_type size_rows)
Sets the maximum page size, in rows.
void enable_page_level_compression(bool val)
Sets preference for per-page compression decision in V2 pages.
auto get_max_page_fragment_size() const
Returns maximum page fragment size, in rows.
Definition: parquet.hpp:1235
void set_key_value_metadata(std::vector< std::map< std::string, std::string >> metadata)
Sets Key-Value footer metadata.
void set_max_page_fragment_size(size_type size_rows)
Sets the maximum page fragment size, in rows.
void enable_write_arrow_schema(bool val)
Sets preference for writing arrow schema. Write arrow schema if set to true.
auto is_enabled_utc_timestamps() const
Returns true if timestamps will be written as UTC.
Definition: parquet.hpp:1159
void set_row_group_size_bytes(size_t size_bytes)
Sets the maximum row group size, in bytes.
auto is_enabled_page_level_compression() const
Returns true if per-page compression is enabled for V2 pages.
Definition: parquet.hpp:1263
void enable_int96_timestamps(bool req)
Sets timestamp writing preferences. INT96 timestamps will be written if true and TIMESTAMP_MICROS will be written if false.
statistics_freq get_stats_level() const
Returns level of statistics requested in output file.
Definition: parquet.hpp:1127
std::vector< std::map< std::string, std::string > > const & get_key_value_metadata() const
Returns Key-Value footer metadata information.
Definition: parquet.hpp:1141
auto const & get_metadata() const
Returns associated metadata.
Definition: parquet.hpp:1134
auto get_max_page_size_rows() const
Returns maximum page size, in rows.
Definition: parquet.hpp:1201
auto get_column_index_truncate_length() const
Returns maximum length of min or max values in column index, in bytes.
Definition: parquet.hpp:1211
void set_column_index_truncate_length(int32_t size_bytes)
Sets the maximum length of min or max values in column index, in bytes.
sink_info const & get_sink() const
Returns sink info.
Definition: parquet.hpp:1113
Base class for Parquet options builders.
Definition: parquet.hpp:1414
BuilderT & compression(compression_type compression)
Sets compression type.
BuilderT & key_value_metadata(std::vector< std::map< std::string, std::string >> metadata)
Sets Key-Value footer metadata.
OptionsT & get_options()
Return reference to the options object being built.
Definition: parquet.hpp:1423
BuilderT & utc_timestamps(bool enabled)
Set to true if timestamps are to be written as UTC.
BuilderT & max_dictionary_size(size_t val)
Sets the maximum dictionary size, in bytes.
BuilderT & max_page_size_bytes(size_t val)
Sets the maximum uncompressed page size, in bytes.
OptionsT && build()
Moves the options member once it's built.
BuilderT & stats_level(statistics_freq sf)
Sets the level of statistics.
BuilderT & column_index_truncate_length(int32_t val)
Sets the desired maximum size in bytes for min and max values in the column index.
BuilderT & compression_statistics(std::shared_ptr< writer_compression_statistics > const &comp_stats)
Sets the pointer to the output compression statistics.
BuilderT & metadata(table_input_metadata metadata)
Sets metadata.
BuilderT & dictionary_policy(enum dictionary_policy val)
Sets the policy for dictionary use.
parquet_writer_options_builder_base(OptionsT options)
Constructor from options.
BuilderT & page_level_compression(bool enabled)
Set to true to enable per-page compression decisions for V2 pages.
BuilderT & int96_timestamps(bool enabled)
Sets whether int96 timestamps are written or not.
BuilderT & row_group_size_bytes(size_t val)
Sets the maximum row group size, in bytes.
BuilderT & sorting_columns(std::vector< sorting_column > sorting_columns)
Sets column sorting metadata.
BuilderT & write_arrow_schema(bool enabled)
Set to true if arrow schema is to be written.
parquet_writer_options_builder_base()=default
Default constructor.
BuilderT & write_v2_headers(bool enabled)
Set to true if V2 page headers are to be written.
BuilderT & max_page_fragment_size(size_type val)
Sets the maximum page fragment size, in rows.
BuilderT & row_group_size_rows(size_type val)
Sets the maximum number of rows in output row groups.
BuilderT & max_page_size_rows(size_type val)
Sets the maximum page size, in rows. Counts only top-level rows, ignoring any nesting....
Class to build parquet_writer_options.
Definition: parquet.hpp:1738
parquet_writer_options_builder(sink_info const &sink, table_view const &table)
Constructor from sink and table.
parquet_writer_options_builder()=default
Default constructor.
parquet_writer_options_builder & partitions(std::vector< partition_info > partitions)
Sets partitions in parquet_writer_options.
parquet_writer_options_builder & column_chunks_file_paths(std::vector< std::string > file_paths)
Sets column chunks file path to be set in the raw output metadata.
Settings for write_parquet().
Definition: parquet.hpp:1649
void set_partitions(std::vector< partition_info > partitions)
Sets partitions.
static parquet_writer_options_builder builder(sink_info const &sink, table_view const &table)
Create builder to create parquet_writer_options.
parquet_writer_options()=default
Default constructor.
std::vector< std::string > const & get_column_chunks_file_paths() const
Returns Column chunks file paths to be set in the raw output metadata.
Definition: parquet.hpp:1711
table_view get_table() const
Returns table_view.
Definition: parquet.hpp:1697
void set_column_chunks_file_paths(std::vector< std::string > file_paths)
Sets column chunks file path to be set in the raw output metadata.
static parquet_writer_options_builder builder()
Create builder to create parquet_writer_options.
std::vector< partition_info > const & get_partitions() const
Returns partitions.
Definition: parquet.hpp:1704
Metadata for a table.
Definition: io/types.hpp:893
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:189
A set of cudf::column's of the same size.
Definition: table.hpp:29
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
table_with_metadata read_parquet(std::vector< std::unique_ptr< cudf::io::datasource >> &&sources, std::vector< parquet::FileMetaData > &&parquet_metadatas, parquet_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Reads a Parquet dataset into a set of columns using pre-existing Parquet datasources and file metadata.
constexpr size_type default_row_group_size_rows
1 million rows per row group
Definition: parquet.hpp:32
constexpr int32_t default_column_index_truncate_length
Truncate to 64 bytes.
Definition: parquet.hpp:35
constexpr size_t default_row_group_size_bytes
Infinite bytes per row group.
Definition: parquet.hpp:30
bool is_supported_write_parquet(compression_type compression)
Check if the compression type is supported for writing Parquet files.
constexpr size_type default_max_page_fragment_size
5000 rows per page fragment
Definition: parquet.hpp:37
constexpr size_t default_max_dictionary_size
1MB dictionary size
Definition: parquet.hpp:36
bool is_supported_read_parquet(compression_type compression)
Check if the compression type is supported for reading Parquet files.
constexpr size_t default_max_page_size_bytes
512KB per page
Definition: parquet.hpp:33
constexpr size_type default_max_page_size_rows
20k rows per page
Definition: parquet.hpp:34
statistics_freq
Column statistics granularity type for parquet/orc writers.
Definition: io/types.hpp:85
dictionary_policy
Control use of dictionary encoding for parquet writer.
Definition: io/types.hpp:214
compression_type
Compression algorithms.
Definition: io/types.hpp:46
@ STATISTICS_ROWGROUP
Per-Rowgroup column statistics.
Definition: io/types.hpp:87
@ ADAPTIVE
Use dictionary when it will not impact compression.
Definition: io/types.hpp:216
std::unique_ptr< std::vector< uint8_t > > merge_row_group_metadata(std::vector< std::unique_ptr< std::vector< uint8_t >>> const &metadata_list)
Merges multiple raw metadata blobs that were previously created by write_parquet into a single metadata blob.
std::unique_ptr< std::vector< uint8_t > > write_parquet(parquet_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Writes a set of columns to parquet format.
rmm::device_async_resource_ref get_current_device_resource_ref()
Get the current device memory resource reference.
cuda::mr::resource_ref< cuda::mr::device_accessible > device_async_resource_ref
std::vector< std::unique_ptr< column > > filter(std::vector< column_view > const &predicate_columns, std::string const &predicate_udf, std::vector< column_view > const &filter_columns, bool is_ptx, std::optional< void * > user_data=std::nullopt, null_aware is_null_aware=null_aware::NO, output_nullability predicate_nullability=output_nullability::PRESERVE, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Creates a new column by applying a filter function against every element of the input columns.
#define CUDF_EXPECTS(...)
Macro for checking (pre-)conditions that throws an exception when a condition is violated.
Definition: error.hpp:145
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:85
type_id
Identifies a column's logical element type.
Definition: types.hpp:193
cuDF-IO API type definitions
cuDF interfaces
Definition: host_udf.hpp:26
A generic expression that can be evaluated to return a value.
Definition: expressions.hpp:62
Destination information for write interfaces.
Definition: io/types.hpp:471
Struct used to describe column sorting metadata.
Definition: parquet.hpp:1039
Source information for read interfaces.
Definition: io/types.hpp:316
Table with table metadata used by io readers to return the metadata by value.
Definition: io/types.hpp:292
Class definitions for (mutable)_table_view
Type declarations for libcudf.