fix: ensure SchemaField.field_dtype returns a string by chelsea-lin · Pull Request #2188 · googleapis/python-bigquery · GitHub

fix: ensure SchemaField.field_dtype returns a string #2188

Merged

merged 3 commits on May 19, 2025
154 changes: 66 additions & 88 deletions google/cloud/bigquery/_pandas_helpers.py
@@ -508,31 +508,37 @@ def dataframe_to_bq_schema(dataframe, bq_schema):
     bq_schema_unused = set()
 
     bq_schema_out = []
-    unknown_type_fields = []
-
+    unknown_type_columns = []
+    dataframe_reset_index = dataframe.reset_index()
     for column, dtype in list_columns_and_indexes(dataframe):
-        # Use provided type from schema, if present.
+        # Step 1: use provided type from schema, if present.
         bq_field = bq_schema_index.get(column)
         if bq_field:
             bq_schema_out.append(bq_field)
             bq_schema_unused.discard(bq_field.name)
             continue
 
-        # Otherwise, try to automatically determine the type based on the
+        # Step 2: try to automatically determine the type based on the
         # pandas dtype.
         bq_type = _PANDAS_DTYPE_TO_BQ.get(dtype.name)
         if bq_type is None:
-            sample_data = _first_valid(dataframe.reset_index()[column])
+            sample_data = _first_valid(dataframe_reset_index[column])
             if (
                 isinstance(sample_data, _BaseGeometry)
                 and sample_data is not None  # Paranoia
             ):
                 bq_type = "GEOGRAPHY"
-        bq_field = schema.SchemaField(column, bq_type)
-        bq_schema_out.append(bq_field)
+        if bq_type is not None:
+            bq_schema_out.append(schema.SchemaField(column, bq_type))
+            continue
 
-        if bq_field.field_type is None:
-            unknown_type_fields.append(bq_field)
+        # Step 3: try with pyarrow if available
+        bq_field = _get_schema_by_pyarrow(column, dataframe_reset_index[column])
+        if bq_field is not None:
+            bq_schema_out.append(bq_field)
+            continue
+
+        unknown_type_columns.append(column)
 
     # Catch any schema mismatch. The developer explicitly asked to serialize a
     # column, but it was not found.
@@ -543,98 +549,70 @@ def dataframe_to_bq_schema(dataframe, bq_schema):
         )
     )
 
-    # If schema detection was not successful for all columns, also try with
-    # pyarrow, if available.
-    if unknown_type_fields:
-        if not pyarrow:
-            msg = "Could not determine the type of columns: {}".format(
-                ", ".join(field.name for field in unknown_type_fields)
-            )
-            warnings.warn(msg)
-            return None  # We cannot detect the schema in full.
-
-        # The augment_schema() helper itself will also issue unknown type
-        # warnings if detection still fails for any of the fields.
-        bq_schema_out = augment_schema(dataframe, bq_schema_out)
+    if unknown_type_columns != []:
+        msg = "Could not determine the type of columns: {}".format(
+            ", ".join(unknown_type_columns)
+        )
+        warnings.warn(msg)
+        return None  # We cannot detect the schema in full.
 
-    return tuple(bq_schema_out) if bq_schema_out else None
+    return tuple(bq_schema_out)
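To make the new control flow concrete, here is a minimal, hypothetical sketch of how the three detection steps play out for a small DataFrame. It assumes `shapely` and `pyarrow` are installed, and it imports the private `_pandas_helpers` module purely for illustration; the column names and dtypes are made up.

```python
import pandas
import shapely.geometry

from google.cloud.bigquery import schema
from google.cloud.bigquery import _pandas_helpers  # private module, imported for illustration only

df = pandas.DataFrame(
    {
        "id": pandas.Series([1, 2], dtype="int64"),   # Step 2: known pandas dtype
        "geo": [shapely.geometry.Point(0, 0), None],  # Step 2: sampled value is a geometry
        "tags": [["a"], ["b", "c"]],                  # Step 3: pyarrow detects list<string>
    }
)

# Step 1 wins for any column explicitly listed in the caller-provided schema.
bq_schema = _pandas_helpers.dataframe_to_bq_schema(
    df, bq_schema=[schema.SchemaField("id", "INT64")]
)
# Expected: "id" from step 1, "geo" as GEOGRAPHY from step 2, "tags" as
# REPEATED STRING from step 3 -- or None plus a warning if any column's
# type could not be determined.
print(bq_schema)
```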


-def augment_schema(dataframe, current_bq_schema):
-    """Try to deduce the unknown field types and return an improved schema.
+def _get_schema_by_pyarrow(name, series):
+    """Attempt to detect the type of the given series by leveraging PyArrow's
+    type detection capabilities.
 
-    This function requires ``pyarrow`` to run. If all the missing types still
-    cannot be detected, ``None`` is returned. If all types are already known,
-    a shallow copy of the given schema is returned.
+    This function requires the ``pyarrow`` library to be installed and
+    available. If the series type cannot be determined or ``pyarrow`` is not
+    available, ``None`` is returned.
 
     Args:
-        dataframe (pandas.DataFrame):
-            DataFrame for which some of the field types are still unknown.
-        current_bq_schema (Sequence[google.cloud.bigquery.schema.SchemaField]):
-            A BigQuery schema for ``dataframe``. The types of some or all of
-            the fields may be ``None``.
+        name (str):
+            The column name of the SchemaField.
+        series (pandas.Series):
+            The Series data for which to detect the data type.
     Returns:
-        Optional[Sequence[google.cloud.bigquery.schema.SchemaField]]
+        Optional[google.cloud.bigquery.schema.SchemaField]:
+            A SchemaField with the detected BigQuery-compatible type string
+            (e.g., "STRING", "INTEGER", "TIMESTAMP", "DATETIME", "NUMERIC",
+            "BIGNUMERIC") and mode ("NULLABLE" or "REPEATED").
+            Returns ``None`` if the type cannot be determined or ``pyarrow``
+            is not imported.
     """
-    # pytype: disable=attribute-error
-    augmented_schema = []
-    unknown_type_fields = []
-    for field in current_bq_schema:
-        if field.field_type is not None:
-            augmented_schema.append(field)
-            continue
-
-        arrow_table = pyarrow.array(dataframe.reset_index()[field.name])
-
-        if pyarrow.types.is_list(arrow_table.type):
-            # `pyarrow.ListType`
-            detected_mode = "REPEATED"
-            detected_type = _pyarrow_helpers.arrow_scalar_ids_to_bq(
-                arrow_table.values.type.id
-            )
-
-            # For timezone-naive datetimes, pyarrow assumes the UTC timezone and adds
-            # it to such datetimes, causing them to be recognized as TIMESTAMP type.
-            # We thus additionally check the actual data to see if we need to overrule
-            # that and choose DATETIME instead.
-            # Note that this should only be needed for datetime values inside a list,
-            # since scalar datetime values have a proper Pandas dtype that allows
-            # distinguishing between timezone-naive and timezone-aware values before
-            # even requiring the additional schema augment logic in this method.
-            if detected_type == "TIMESTAMP":
-                valid_item = _first_array_valid(dataframe[field.name])
-                if isinstance(valid_item, datetime) and valid_item.tzinfo is None:
-                    detected_type = "DATETIME"
-        else:
-            detected_mode = field.mode
-            detected_type = _pyarrow_helpers.arrow_scalar_ids_to_bq(arrow_table.type.id)
-            if detected_type == "NUMERIC" and arrow_table.type.scale > 9:
-                detected_type = "BIGNUMERIC"
-
-        if detected_type is None:
-            unknown_type_fields.append(field)
-            continue
-
-        new_field = schema.SchemaField(
-            name=field.name,
-            field_type=detected_type,
-            mode=detected_mode,
-            description=field.description,
-            fields=field.fields,
-        )
-        augmented_schema.append(new_field)
-
-    if unknown_type_fields:
-        warnings.warn(
-            "Pyarrow could not determine the type of columns: {}.".format(
-                ", ".join(field.name for field in unknown_type_fields)
-            )
-        )
-
-    return augmented_schema
-    # pytype: enable=attribute-error
+    if not pyarrow:
+        return None
+
+    arrow_table = pyarrow.array(series)
+    if pyarrow.types.is_list(arrow_table.type):
+        # `pyarrow.ListType`
+        mode = "REPEATED"
+        type = _pyarrow_helpers.arrow_scalar_ids_to_bq(arrow_table.values.type.id)
+
+        # For timezone-naive datetimes, pyarrow assumes the UTC timezone and adds
+        # it to such datetimes, causing them to be recognized as TIMESTAMP type.
+        # We thus additionally check the actual data to see if we need to overrule
+        # that and choose DATETIME instead.
+        # Note that this should only be needed for datetime values inside a list,
+        # since scalar datetime values have a proper Pandas dtype that allows
+        # distinguishing between timezone-naive and timezone-aware values before
+        # even requiring the additional schema augment logic in this method.
+        if type == "TIMESTAMP":
+            valid_item = _first_array_valid(series)
+            if isinstance(valid_item, datetime) and valid_item.tzinfo is None:
+                type = "DATETIME"
+    else:
+        mode = "NULLABLE"  # default mode
+        type = _pyarrow_helpers.arrow_scalar_ids_to_bq(arrow_table.type.id)
+        if type == "NUMERIC" and arrow_table.type.scale > 9:
+            type = "BIGNUMERIC"
+
+    if type is not None:
+        return schema.SchemaField(name, type, mode)
+    else:
+        return None
 
 
 def dataframe_to_arrow(dataframe, bq_schema):
     """Convert pandas dataframe to Arrow table, using BigQuery schema.
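As a usage note for the timezone handling above, here is a small sketch (private helper, hypothetical column name, requires `pyarrow`): a list column of timezone-naive datetimes maps to an Arrow timestamp type whose type id alone would read as TIMESTAMP, so the helper inspects the actual values and overrules to DATETIME.

```python
import datetime

import pandas

from google.cloud.bigquery._pandas_helpers import _get_schema_by_pyarrow  # private helper

# A REPEATED column: each cell holds a list of timezone-naive datetimes.
series = pandas.Series([[datetime.datetime(2025, 5, 19, 12, 0)]])

field = _get_schema_by_pyarrow("created", series)
# The Arrow timestamp type id maps to TIMESTAMP, but the values carry no
# tzinfo, so the helper downgrades the detected type to DATETIME.
print(field)  # SchemaField('created', 'DATETIME', 'REPEATED', ...)
```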
28 changes: 11 additions & 17 deletions google/cloud/bigquery/schema.py
@@ -284,15 +284,13 @@ def name(self):
         return self._properties.get("name", "")
 
     @property
-    def field_type(self):
+    def field_type(self) -> str:
         """str: The type of the field.
 
         See:
         https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#TableFieldSchema.FIELDS.type
         """
         type_ = self._properties.get("type")
-        if type_ is None:  # Shouldn't happen, but some unit tests do this.
-            return None
         return cast(str, type_).upper()
 
     @property
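A quick illustration of the tightened contract: the property upper-cases the stored type, and with the `-> str` annotation callers can rely on getting a string back.

```python
from google.cloud.bigquery.schema import SchemaField

field = SchemaField("full_name", "string")
assert field.field_type == "STRING"  # normalized to upper case, always a str
```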
@@ -397,20 +395,16 @@ def _key(self):
         Returns:
             Tuple: The contents of this :class:`~google.cloud.bigquery.schema.SchemaField`.
         """
-        field_type = self.field_type.upper() if self.field_type is not None else None
-
-        # Type can temporarily be set to None if the code needs a SchemaField instance,
-        # but has not determined the exact type of the field yet.
-        if field_type is not None:
-            if field_type == "STRING" or field_type == "BYTES":
-                if self.max_length is not None:
-                    field_type = f"{field_type}({self.max_length})"
-            elif field_type.endswith("NUMERIC"):
-                if self.precision is not None:
-                    if self.scale is not None:
-                        field_type = f"{field_type}({self.precision}, {self.scale})"
-                    else:
-                        field_type = f"{field_type}({self.precision})"
+        field_type = self.field_type
+        if field_type == "STRING" or field_type == "BYTES":
+            if self.max_length is not None:
+                field_type = f"{field_type}({self.max_length})"
+        elif field_type.endswith("NUMERIC"):
+            if self.precision is not None:
+                if self.scale is not None:
+                    field_type = f"{field_type}({self.precision}, {self.scale})"
+                else:
+                    field_type = f"{field_type}({self.precision})"
 
         policy_tags = (
             None if self.policy_tags is None else tuple(sorted(self.policy_tags.names))
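For context on the parameterized-type formatting in `_key`, a sketch using the private method with hypothetical fields; the second tuple element carries the formatted type string.

```python
from google.cloud.bigquery.schema import SchemaField

# STRING/BYTES append max_length; NUMERIC/BIGNUMERIC append precision and scale.
s = SchemaField("code", "STRING", max_length=10)
n = SchemaField("amount", "NUMERIC", precision=38, scale=9)

print(s._key()[1])  # STRING(10)
print(n._key()[1])  # NUMERIC(38, 9)
```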