import random import timeit import pandas as pd import polars as pl # Create a DataFrame with 50,000 columns and 1 row num_cols = 50_000 data = {f"col_{i}": [random.random()] for i in range(num_cols)} pd_df = pd.DataFrame(data) pl_df = pl.DataFrame(data) # Method 1: Us...
collect() # Load a DataFrame with specific columns from a parquet file df = kagglehub.dataset_load( KaggleDatasetAdapter.POLARS, "robikscube/textocr-text-extraction-from-images-dataset", "annot.parquet", polars_frame_type=PolarsFrameType.DATA_FRAME, polars_kwargs={"columns": ["image_id", ...
Modern columnar data format for ML. Convert from Parquet in 2 lines of code for 100x faster random access, zero-cost schema evolution, rich secondary indices, versioning, and more. Compatible with Pandas, DuckDB, Polars, PyArrow, and Ray, with more integrations on the way. Documentation • Bl...
Modern columnar data format for ML. Convert from Parquet in 2 lines of code for 100x faster random access, a vector index, data versioning, and more. Compatible with pandas, DuckDB, Polars, and pyarrow, with more integrations on the way. Documentation • Blog • Discord • Twitter Lance...