Skip to main content
Schemas define the structure of Arrow data, including column names, types, and metadata. Understanding schemas is essential for working with tables and record batches.

Creating Schemas

A schema is a collection of named fields, each with a data type and optional metadata.
import pyarrow as pa

# Schemas can be built straight from (name, type) pairs;
# fields created this way default to nullable=True.
columns = [
    ('id', pa.int32()),
    ('name', pa.string()),
    ('score', pa.float64()),
]
schema = pa.schema(columns)

# Explicit Field objects give per-column control over nullability.
fields = [
    pa.field('id', pa.int32(), nullable=False),
    pa.field('name', pa.string(), nullable=True),
    pa.field('timestamp', pa.timestamp('ms'), nullable=False),
]
schema = pa.schema(fields)

# Whole-schema introspection.
print(f"Number of fields: {len(schema)}")
print(f"Field names: {schema.names}")
print(f"Field types: {schema.types}")

# Look up a single field by name ...
field = schema.field('name')
print(f"Field: {field.name}, Type: {field.type}, Nullable: {field.nullable}")

# ... or by positional index.
field = schema.field(0)

# Guard against missing columns before accessing them.
if 'email' in schema.names:
    email_field = schema.field('email')

Schema Metadata

Schemas and fields can have custom key-value metadata attached.
import pyarrow as pa

# Attach application-level key/value metadata to the schema itself.
schema = pa.schema(
    [pa.field('id', pa.int32()), pa.field('value', pa.float64())],
    metadata={
        'version': '1.0',
        'source': 'sensor_data',
        'created_at': '2024-01-01',
    },
)

# Metadata is stored as bytes, so keys are looked up with b'...'.
print(f"Metadata: {schema.metadata}")
if schema.metadata:
    print(f"Version: {schema.metadata[b'version']}")

# Metadata can also live on an individual field.
field_with_meta = pa.field(
    'temperature',
    pa.float64(),
    metadata={'unit': 'celsius', 'precision': '0.1'},
)

schema = pa.schema([field_with_meta])
field = schema.field('temperature')
print(f"Field metadata: {field.metadata}")

# with_metadata() returns a new schema; it replaces the existing
# schema-level metadata rather than merging into it.
new_metadata = {
    'version': '2.0',
    'modified': 'true',
}
schema_with_new_meta = schema.with_metadata(new_metadata)

# Strip all schema-level metadata (again, a new object is returned).
schema_no_meta = schema.remove_metadata()
Metadata keys and values in Arrow are stored as byte strings. In Python, you may need to encode strings to bytes when setting metadata and decode when reading.

Working with Tables

Tables combine schemas with columnar data.
import pyarrow as pa

# The schema pins down column names and types for the table.
schema = pa.schema([
    ('id', pa.int32()),
    ('name', pa.string()),
    ('score', pa.float64()),
])

# Build a table from plain Python lists, validated against the schema.
data = {
    'id': [1, 2, 3, 4],
    'name': ['Alice', 'Bob', 'Charlie', 'David'],
    'score': [95.5, 87.2, 92.8, 78.9],
}
table = pa.table(data, schema=schema)

print(f"Schema: {table.schema}")
print(f"Shape: {table.shape}")
print(f"Columns: {table.column_names}")

# Columns can be fetched by name or by position.
id_column = table['id']
name_column = table.column(1)

# Materialize a column back into a Python list.
scores = table['score'].to_pylist()

# Tables are immutable: append_column returns a NEW table.
active_flags = pa.array([True, False, True, False])
new_table = table.append_column(
    pa.field('active', pa.bool_()),
    active_flags,
)

# Drop the 'score' column (index 2) -- also returns a new table.
table_no_score = table.remove_column(2)

# Project a subset of columns by name.
subset = table.select(['id', 'name'])

# Rename every column; the list must cover all columns, in order.
renamed = table.rename_columns(['user_id', 'username', 'final_score'])

Record Batches

Record batches are similar to tables, but each column is a single contiguous array rather than a (possibly multi-chunk) chunked array.
import pyarrow as pa

# Schema shared by the batch's columns.
schema = pa.schema([
    ('x', pa.int32()),
    ('y', pa.float64()),
])

# A record batch is a single, contiguous chunk of columnar data.
arrays = [
    pa.array([1, 2, 3, 4]),
    pa.array([1.1, 2.2, 3.3, 4.4]),
]
batch = pa.record_batch(arrays, schema=schema)

print(f"Schema: {batch.schema}")
print(f"Num rows: {batch.num_rows}")
print(f"Num columns: {batch.num_columns}")

# Column access works by name or by index.
x_col = batch.column('x')
y_col = batch[1]

# One or more batches can be stitched into a Table.
table = pa.Table.from_batches([batch])

# Slicing is zero-copy: the slice shares the underlying buffers.
sliced = batch.slice(1, 2)  # offset=1, length=2

# Interop with pandas and plain Python containers.
df = batch.to_pandas()
data_dict = batch.to_pydict()
Record batches require all arrays to have the same length. If arrays have different lengths, use a Table with chunked arrays instead.

Schema Evolution and Compatibility

Arrow provides utilities for working with evolving schemas.
import pyarrow as pa

# Version 1 of the schema.
schema_v1 = pa.schema([
    ('id', pa.int32()),
    ('name', pa.string()),
])

# Version 2 adds an 'email' column.
schema_v2 = pa.schema([
    ('id', pa.int32()),
    ('name', pa.string()),
    ('email', pa.string()),
])

# Strict equality: differing field lists compare unequal.
print(f"Schemas equal: {schema_v1.equals(schema_v2)}")

# Equality can also be tested while ignoring any attached metadata.
print(f"Equal ignore metadata: {schema_v1.equals(schema_v2, check_metadata=False)}")

# unify_schemas merges compatible schemas into one; it raises
# ArrowInvalid when a shared field has conflicting types.
schemas = [schema_v1, schema_v2]
try:
    unified = pa.unify_schemas(schemas)
    print(f"Unified schema: {unified}")
except pa.ArrowInvalid as e:
    print(f"Cannot unify: {e}")

# A coarse compatibility probe: both fields hold integers, so a
# widening cast from int32 to int64 is possible.
field1 = pa.field('value', pa.int32())
field2 = pa.field('value', pa.int64())

can_cast = (pa.types.is_integer(field1.type)
            and pa.types.is_integer(field2.type))

Custom Metadata for Interoperability

Arrow uses metadata for cross-language and cross-system compatibility.
import pyarrow as pa
import json

# Tables converted from pandas carry a b'pandas' metadata entry
# describing the original DataFrame (added by from_pandas).
import pandas as pd
df = pd.DataFrame({'a': [1, 2, 3]})
table = pa.Table.from_pandas(df)

meta = table.schema.metadata
if meta:
    pandas_meta = meta.get(b'pandas')
    if pandas_meta:
        print(json.loads(pandas_meta))

# Custom application metadata; structured values are JSON-encoded
# because metadata values are stored as byte strings.
quality = json.dumps({
    'validated': True,
    'completeness': 0.95
})
schema = pa.schema(
    [pa.field('data', pa.float64())],
    metadata={
        'application': 'my_app',
        'version': '1.0',
        'data_quality': quality,
    },
)

# Parquet round-trips schema-level metadata through write/read.
import pyarrow.parquet as pq
table = pa.table({'data': [1.0, 2.0, 3.0]}, schema=schema)
pq.write_table(table, 'data.parquet')

read_table = pq.read_table('data.parquet')
print(f"Preserved metadata: {read_table.schema.metadata}")

Best Practices

  1. Use nullable=False when appropriate: Non-nullable fields can be more efficient and document your data constraints.
  2. Add descriptive metadata: Use schema and field metadata to document units, precision, data sources, and validation rules.
  3. Version your schemas: Include version information in metadata when schemas evolve over time.
  4. Test schema compatibility: When reading data written with an older schema version, ensure compatibility before processing.
  5. Preserve metadata: When transforming tables, explicitly preserve metadata if needed:
    # NOTE(review): transform() is an application-defined placeholder here;
    # the second line re-attaches the original schema metadata, which
    # table transformations may drop.
    new_table = transform(table)
    new_table = new_table.replace_schema_metadata(table.schema.metadata)
    
Arrow schemas are immutable. Operations like adding metadata or fields create new schema objects rather than modifying existing ones.

Build docs developers (and LLMs) love