fix: ensure SchemaField.field_dtype returns a string · googleapis/python-bigquery@adff18d · GitHub
Skip to content

Commit adff18d

Browse files
committed
fix: ensure SchemaField.field_dtype returns a string
1 parent 5805066 commit adff18d

File tree

4 files changed

+103
-181
lines changed

4 files changed

+103
-181
lines changed

google/cloud/bigquery/_pandas_helpers.py

Lines changed: 66 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -508,133 +508,103 @@ def dataframe_to_bq_schema(dataframe, bq_schema):
508508
bq_schema_unused = set()
509509

510510
bq_schema_out = []
511-
unknown_type_fields = []
512-
511+
unknown_type_columns = []
512+
dataframe_reset_index = dataframe.reset_index()
513513
for column, dtype in list_columns_and_indexes(dataframe):
514-
# Use provided type from schema, if present.
514+
# Step 1: use provided type from schema, if present.
515515
bq_field = bq_schema_index.get(column)
516516
if bq_field:
517517
bq_schema_out.append(bq_field)
518518
bq_schema_unused.discard(bq_field.name)
519519
continue
520520

521-
# Otherwise, try to automatically determine the type based on the
521+
# Step 2: try to automatically determine the type based on the
522522
# pandas dtype.
523+
bq_mode = "NULLABLE"
523524
bq_type = _PANDAS_DTYPE_TO_BQ.get(dtype.name)
524525
if bq_type is None:
525-
sample_data = _first_valid(dataframe.reset_index()[column])
526+
sample_data = _first_valid(dataframe_reset_index[column])
526527
if (
527528
isinstance(sample_data, _BaseGeometry)
528529
and sample_data is not None # Paranoia
529530
):
530531
bq_type = "GEOGRAPHY"
531-
bq_field = schema.SchemaField(column, bq_type)
532-
bq_schema_out.append(bq_field)
533-
534-
if bq_field.field_type is None:
535-
unknown_type_fields.append(bq_field)
532+
if bq_type is not None:
533+
bq_schema_out.append(schema.SchemaField(column, bq_type))
534+
continue
536535

537-
# Catch any schema mismatch. The developer explicitly asked to serialize a
538-
# column, but it was not found.
539-
if bq_schema_unused:
540-
raise ValueError(
541-
"bq_schema contains fields not present in dataframe: {}".format(
542-
bq_schema_unused
543-
)
544-
)
536+
# Step 3: try with pyarrow if available
537+
bq_field = _get_schema_by_pyarrow(column, dataframe_reset_index[column])
538+
if bq_field is not None:
539+
bq_schema_out.append(bq_field)
540+
continue
545541

546-
# If schema detection was not successful for all columns, also try with
547-
# pyarrow, if available.
548-
if unknown_type_fields:
549-
if not pyarrow:
550-
msg = "Could not determine the type of columns: {}".format(
551-
", ".join(field.name for field in unknown_type_fields)
552-
)
553-
warnings.warn(msg)
554-
return None # We cannot detect the schema in full.
542+
unknown_type_columns.append(column)
555543

556-
# The augment_schema() helper itself will also issue unknown type
557-
# warnings if detection still fails for any of the fields.
558-
bq_schema_out = augment_schema(dataframe, bq_schema_out)
544+
if unknown_type_columns != []:
545+
msg = "Could not determine the type of columns: {}".format(
546+
", ".join(unknown_type_columns)
547+
)
548+
warnings.warn(msg)
549+
return None # We cannot detect the schema in full.
559550

560-
return tuple(bq_schema_out) if bq_schema_out else None
551+
return tuple(bq_schema_out)
561552

562553

563-
def augment_schema(dataframe, current_bq_schema):
564-
"""Try to deduce the unknown field types and return an improved schema.
554+
def _get_schema_by_pyarrow(name, series):
555+
"""Attempt to detect the type of the given series by leveraging PyArrow's
556+
type detection capabilities.
565557
566-
This function requires ``pyarrow`` to run. If all the missing types still
567-
cannot be detected, ``None`` is returned. If all types are already known,
568-
a shallow copy of the given schema is returned.
558+
This function requires the ``pyarrow`` library to be installed and
559+
available. If the series type cannot be determined or ``pyarrow`` is not
560+
available, ``None`` is returned.
569561
570562
Args:
571-
dataframe (pandas.DataFrame):
572-
DataFrame for which some of the field types are still unknown.
573-
current_bq_schema (Sequence[google.cloud.bigquery.schema.SchemaField]):
574-
A BigQuery schema for ``dataframe``. The types of some or all of
575-
the fields may be ``None``.
563+
name (str):
564+
the column name of the SchemaField.
565+
series (pandas.Series):
566+
The Series data for which to detect the data type.
576567
Returns:
577-
Optional[Sequence[google.cloud.bigquery.schema.SchemaField]]
568+
Optional[google.cloud.bigquery.schema.SchemaField]:
569+
A ``SchemaField`` for the column, with the detected
570+
BigQuery-compatible type string (e.g., "STRING", "INTEGER",
571+
"TIMESTAMP", "DATETIME", "NUMERIC", "BIGNUMERIC") and mode
572+
("NULLABLE" or "REPEATED").
573+
Returns ``None`` if the type cannot be determined or ``pyarrow``
573+
is not imported.
578574
"""
579-
# pytype: disable=attribute-error
580-
augmented_schema = []
581-
unknown_type_fields = []
582-
for field in current_bq_schema:
583-
if field.field_type is not None:
584-
augmented_schema.append(field)
585-
continue
586-
587-
arrow_table = pyarrow.array(dataframe.reset_index()[field.name])
588575

589-
if pyarrow.types.is_list(arrow_table.type):
590-
# `pyarrow.ListType`
591-
detected_mode = "REPEATED"
592-
detected_type = _pyarrow_helpers.arrow_scalar_ids_to_bq(
593-
arrow_table.values.type.id
594-
)
595-
596-
# For timezone-naive datetimes, pyarrow assumes the UTC timezone and adds
597-
# it to such datetimes, causing them to be recognized as TIMESTAMP type.
598-
# We thus additionally check the actual data to see if we need to overrule
599-
# that and choose DATETIME instead.
600-
# Note that this should only be needed for datetime values inside a list,
601-
# since scalar datetime values have a proper Pandas dtype that allows
602-
# distinguishing between timezone-naive and timezone-aware values before
603-
# even requiring the additional schema augment logic in this method.
604-
if detected_type == "TIMESTAMP":
605-
valid_item = _first_array_valid(dataframe[field.name])
606-
if isinstance(valid_item, datetime) and valid_item.tzinfo is None:
607-
detected_type = "DATETIME"
608-
else:
609-
detected_mode = field.mode
610-
detected_type = _pyarrow_helpers.arrow_scalar_ids_to_bq(arrow_table.type.id)
611-
if detected_type == "NUMERIC" and arrow_table.type.scale > 9:
612-
detected_type = "BIGNUMERIC"
613-
614-
if detected_type is None:
615-
unknown_type_fields.append(field)
616-
continue
576+
if not pyarrow:
577+
return None
617578

618-
new_field = schema.SchemaField(
619-
name=field.name,
620-
field_type=detected_type,
621-
mode=detected_mode,
622-
description=field.description,
623-
fields=field.fields,
624-
)
625-
augmented_schema.append(new_field)
579+
arrow_table = pyarrow.array(series)
580+
if pyarrow.types.is_list(arrow_table.type):
581+
# `pyarrow.ListType`
582+
mode = "REPEATED"
583+
type = _pyarrow_helpers.arrow_scalar_ids_to_bq(arrow_table.values.type.id)
584+
585+
# For timezone-naive datetimes, pyarrow assumes the UTC timezone and adds
586+
# it to such datetimes, causing them to be recognized as TIMESTAMP type.
587+
# We thus additionally check the actual data to see if we need to overrule
588+
# that and choose DATETIME instead.
589+
# Note that this should only be needed for datetime values inside a list,
590+
# since scalar datetime values have a proper Pandas dtype that allows
591+
# distinguishing between timezone-naive and timezone-aware values before
592+
# even requiring the additional schema augment logic in this method.
593+
if type == "TIMESTAMP":
594+
valid_item = _first_array_valid(series)
595+
if isinstance(valid_item, datetime) and valid_item.tzinfo is None:
596+
type = "DATETIME"
597+
else:
598+
mode = "NULLABLE" # default mode
599+
type = _pyarrow_helpers.arrow_scalar_ids_to_bq(arrow_table.type.id)
600+
if type == "NUMERIC" and arrow_table.type.scale > 9:
601+
type = "BIGNUMERIC"
626602

627-
if unknown_type_fields:
628-
warnings.warn(
629-
"Pyarrow could not determine the type of columns: {}.".format(
630-
", ".join(field.name for field in unknown_type_fields)
631-
)
632-
)
603+
if type is not None:
604+
return schema.SchemaField(name, type, mode)
605+
else:
633606
return None
634607

635-
return augmented_schema
636-
# pytype: enable=attribute-error
637-
638608

639609
def dataframe_to_arrow(dataframe, bq_schema):
640610
"""Convert pandas dataframe to Arrow table, using BigQuery schema.

google/cloud/bigquery/schema.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -284,15 +284,13 @@ def name(self):
284284
return self._properties.get("name", "")
285285

286286
@property
287-
def field_type(self):
287+
def field_type(self) -> str:
288288
"""str: The type of the field.
289289
290290
See:
291291
https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#TableFieldSchema.FIELDS.type
292292
"""
293293
type_ = self._properties.get("type")
294-
if type_ is None: # Shouldn't happen, but some unit tests do this.
295-
return None
296294
return cast(str, type_).upper()
297295

298296
@property

tests/unit/test__pandas_helpers.py

Lines changed: 36 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -1568,31 +1568,7 @@ def test_augment_schema_type_detection_succeeds(module_under_test):
15681568
# set to "datetime64[ns]", and pyarrow converts that to pyarrow.TimestampArray.
15691569
# We thus cannot expect to get a DATETIME date when converting back to the
15701570
# BigQuery type.
1571-
1572-
current_schema = (
1573-
schema.SchemaField("bool_field", field_type=None, mode="NULLABLE"),
1574-
schema.SchemaField("int_field", field_type=None, mode="NULLABLE"),
1575-
schema.SchemaField("float_field", field_type=None, mode="NULLABLE"),
1576-
schema.SchemaField("time_field", field_type=None, mode="NULLABLE"),
1577-
schema.SchemaField("timestamp_field", field_type=None, mode="NULLABLE"),
1578-
schema.SchemaField("date_field", field_type=None, mode="NULLABLE"),
1579-
schema.SchemaField("bytes_field", field_type=None, mode="NULLABLE"),
1580-
schema.SchemaField("string_field", field_type=None, mode="NULLABLE"),
1581-
schema.SchemaField("numeric_field", field_type=None, mode="NULLABLE"),
1582-
schema.SchemaField("bignumeric_field", field_type=None, mode="NULLABLE"),
1583-
)
1584-
1585-
with warnings.catch_warnings(record=True) as warned:
1586-
augmented_schema = module_under_test.augment_schema(dataframe, current_schema)
1587-
1588-
# there should be no relevant warnings
1589-
unwanted_warnings = [
1590-
warning for warning in warned if "Pyarrow could not" in str(warning)
1591-
]
1592-
assert not unwanted_warnings
1593-
1594-
# the augmented schema must match the expected
1595-
expected_schema = (
1571+
expected_schemas = (
15961572
schema.SchemaField("bool_field", field_type="BOOL", mode="NULLABLE"),
15971573
schema.SchemaField("int_field", field_type="INT64", mode="NULLABLE"),
15981574
schema.SchemaField("float_field", field_type="FLOAT64", mode="NULLABLE"),
@@ -1607,8 +1583,13 @@ def test_augment_schema_type_detection_succeeds(module_under_test):
16071583
),
16081584
)
16091585

1610-
by_name = operator.attrgetter("name")
1611-
assert sorted(augmented_schema, key=by_name) == sorted(expected_schema, key=by_name)
1586+
for col_name, expected_schema in zip(dataframe, expected_schemas):
1587+
with warnings.catch_warnings(record=True) as warned:
1588+
schema_field = module_under_test._get_schema_by_pyarrow(
1589+
col_name, dataframe[col_name]
1590+
)
1591+
assert warned == []
1592+
assert schema_field == expected_schema
16121593

16131594

16141595
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
@@ -1639,30 +1620,20 @@ def test_augment_schema_repeated_fields(module_under_test):
16391620
]
16401621
)
16411622

1642-
current_schema = (
1643-
schema.SchemaField("string_array", field_type=None, mode="NULLABLE"),
1644-
schema.SchemaField("timestamp_array", field_type=None, mode="NULLABLE"),
1645-
schema.SchemaField("datetime_array", field_type=None, mode="NULLABLE"),
1646-
)
1647-
1648-
with warnings.catch_warnings(record=True) as warned:
1649-
augmented_schema = module_under_test.augment_schema(dataframe, current_schema)
1650-
1651-
# there should be no relevant warnings
1652-
unwanted_warnings = [
1653-
warning for warning in warned if "Pyarrow could not" in str(warning)
1654-
]
1655-
assert not unwanted_warnings
1656-
16571623
# the augmented schema must match the expected
1658-
expected_schema = (
1624+
expected_schemas = (
16591625
schema.SchemaField("string_array", field_type="STRING", mode="REPEATED"),
16601626
schema.SchemaField("timestamp_array", field_type="TIMESTAMP", mode="REPEATED"),
16611627
schema.SchemaField("datetime_array", field_type="DATETIME", mode="REPEATED"),
16621628
)
16631629

1664-
by_name = operator.attrgetter("name")
1665-
assert sorted(augmented_schema, key=by_name) == sorted(expected_schema, key=by_name)
1630+
for col_name, expected_schema in zip(dataframe, expected_schemas):
1631+
with warnings.catch_warnings(record=True) as warned:
1632+
schema_field = module_under_test._get_schema_by_pyarrow(
1633+
col_name, dataframe[col_name]
1634+
)
1635+
assert warned == []
1636+
assert schema_field == expected_schema
16661637

16671638

16681639
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
@@ -1681,48 +1652,36 @@ def test_augment_schema_type_detection_fails(module_under_test):
16811652
},
16821653
]
16831654
)
1684-
current_schema = [
1685-
schema.SchemaField("status", field_type="STRING", mode="NULLABLE"),
1686-
schema.SchemaField("struct_field", field_type=None, mode="NULLABLE"),
1687-
schema.SchemaField("struct_field_2", field_type=None, mode="NULLABLE"),
1688-
]
1689-
1690-
with warnings.catch_warnings(record=True) as warned:
1691-
augmented_schema = module_under_test.augment_schema(dataframe, current_schema)
16921655

1693-
assert augmented_schema is None
1656+
expected_schemas = (
1657+
schema.SchemaField("status", field_type="STRING", mode="NULLABLE"),
1658+
# Could not determine the type of these columns
1659+
None,
1660+
None,
1661+
)
16941662

1695-
expected_warnings = [
1696-
warning for warning in warned if "could not determine" in str(warning)
1697-
]
1698-
assert len(expected_warnings) == 1
1699-
warning_msg = str(expected_warnings[0])
1700-
assert "pyarrow" in warning_msg.lower()
1701-
assert "struct_field" in warning_msg and "struct_field_2" in warning_msg
1663+
for col_name, expected_schema in zip(dataframe, expected_schemas):
1664+
with warnings.catch_warnings(record=True) as warned:
1665+
schema_field = module_under_test._get_schema_by_pyarrow(
1666+
col_name, dataframe[col_name]
1667+
)
1668+
assert warned == []
1669+
assert schema_field == expected_schema
17021670

17031671

17041672
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
17051673
def test_augment_schema_type_detection_fails_array_data(module_under_test):
17061674
dataframe = pandas.DataFrame(
17071675
data=[{"all_none_array": [None, float("NaN")], "empty_array": []}]
17081676
)
1709-
current_schema = [
1710-
schema.SchemaField("all_none_array", field_type=None, mode="NULLABLE"),
1711-
schema.SchemaField("empty_array", field_type=None, mode="NULLABLE"),
1712-
]
1713-
1714-
with warnings.catch_warnings(record=True) as warned:
1715-
augmented_schema = module_under_test.augment_schema(dataframe, current_schema)
17161677

1717-
assert augmented_schema is None
1718-
1719-
expected_warnings = [
1720-
warning for warning in warned if "could not determine" in str(warning)
1721-
]
1722-
assert len(expected_warnings) == 1
1723-
warning_msg = str(expected_warnings[0])
1724-
assert "pyarrow" in warning_msg.lower()
1725-
assert "all_none_array" in warning_msg and "empty_array" in warning_msg
1678+
for col_name in dataframe:
1679+
with warnings.catch_warnings(record=True) as warned:
1680+
schema_field = module_under_test._get_schema_by_pyarrow(
1681+
col_name, dataframe[col_name]
1682+
)
1683+
assert warned == []
1684+
assert schema_field == None
17261685

17271686

17281687
@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`")

tests/unit/test_schema.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -640,11 +640,6 @@ def test___repr__(self):
640640
expected = "SchemaField('field1', 'STRING', 'NULLABLE', None, None, (), None)"
641641
self.assertEqual(repr(field1), expected)
642642

643-
def test___repr__type_not_set(self):
644-
field1 = self._make_one("field1", field_type=None)
645-
expected = "SchemaField('field1', None, 'NULLABLE', None, None, (), None)"
646-
self.assertEqual(repr(field1), expected)
647-
648643
def test___repr__evaluable_no_policy_tags(self):
649644
field = self._make_one("field1", "STRING", "REQUIRED", "Description")
650645
field_repr = repr(field)

0 commit comments

Comments
 (0)
