fix: ensure SchemaField.field_dtype returns a string · googleapis/python-bigquery@adff18d · GitHub
Skip to content

Commit adff18d

Browse files
committed
fix: ensure SchemaField.field_dtype returns a string
1 parent 5805066 commit adff18d

File tree

4 files changed

+103
-181
lines changed

4 files changed

+103
-181
lines changed

google/cloud/bigquery/_pandas_helpers.py

Lines changed: 66 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -508,133 +508,103 @@ def dataframe_to_bq_schema(dataframe, bq_schema):
508508
bq_schema_unused = set()
509509

510510
bq_schema_out = []
511-
unknown_type_fields = []
512-
511+
unknown_type_columns = []
512+
dataframe_reset_index = dataframe.reset_index()
513513
for column, dtype in list_columns_and_indexes(dataframe):
514-
# Use provided type from schema, if present.
514+
# Step 1: use provided type from schema, if present.
515515
bq_field = bq_schema_index.get(column)
516516
if bq_field:
517517
bq_schema_out.append(bq_field)
518518
bq_schema_unused.discard(bq_field.name)
519519
continue
520520

521-
# Otherwise, try to automatically determine the type based on the
521+
# Step 2: try to automatically determine the type based on the
522522
# pandas dtype.
523+
bq_mode = "NULLABLE"
523524
bq_type = _PANDAS_DTYPE_TO_BQ.get(dtype.name)
524525
if bq_type is None:
525-
sample_data = _first_valid(dataframe.reset_index()[column])
526+
sample_data = _first_valid(dataframe_reset_index[column])
526527
if (
527528
isinstance(sample_data, _BaseGeometry)
528529
and sample_data is not None # Paranoia
529530
):
530531
bq_type = "GEOGRAPHY"
531-
bq_field = schema.SchemaField(column, bq_type)
532-
bq_schema_out.append(bq_field)
533-
534-
if bq_field.field_type is None:
535-
unknown_type_fields.append(bq_field)
532+
if bq_type is not None:
533+
bq_schema_out.append(schema.SchemaField(column, bq_type))
534+
continue
536535

537-
# Catch any schema mismatch. The developer explicitly asked to serialize a
538-
# column, but it was not found.
539-
if bq_schema_unused:
540-
raise ValueError(
541-
"bq_schema contains fields not present in dataframe: {}".format(
542-
bq_schema_unused
543-
)
544-
)
536+
# Step 3: try with pyarrow if available
537+
bq_field = _get_schema_by_pyarrow(column, dataframe_reset_index[column])
538+
if bq_field is not None:
539+
bq_schema_out.append(bq_field)
540+
continue
545541

546-
# If schema detection was not successful for all columns, also try with
547-
# pyarrow, if available.
548-
if unknown_type_fields:
549-
if not pyarrow:
550-
msg = "Could not determine the type of columns: {}".format(
551-
", ".join(field.name for field in unknown_type_fields)
552-
)
553-
warnings.warn(msg)
554-
return None # We cannot detect the schema in full.
542+
unknown_type_columns.append(column)
555543

556-
# The augment_schema() helper itself will also issue unknown type
557-
# warnings if detection still fails for any of the fields.
558-
bq_schema_out = augment_schema(dataframe, bq_schema_out)
544+
if unknown_type_columns != []:
545+
msg = "Could not determine the type of columns: {}".format(
546+
", ".join(unknown_type_columns)
547+
)
548+
warnings.warn(msg)
549+
return None # We cannot detect the schema in full.
559550

560-
return tuple(bq_schema_out) if bq_schema_out else None
551+
return tuple(bq_schema_out)
561552

562553

563-
def augment_schema(dataframe, current_bq_schema):
564-
"""Try to deduce the unknown field types and return an improved schema.
554+
def _get_schema_by_pyarrow(name, series):
555+
"""Attempt to detect the type of the given series by leveraging PyArrow's
556+
type detection capabilities.
565557
566-
This function requires ``pyarrow`` to run. If all the missing types still
567-
cannot be detected, ``None`` is returned. If all types are already known,
568-
a shallow copy of the given schema is returned.
558+
This function requires the ``pyarrow`` library to be installed and
559+
available. If the series type cannot be determined or ``pyarrow`` is not
560+
available, ``None`` is returned.
569561
570562
Args:
571-
dataframe (pandas.DataFrame):
572-
DataFrame for which some of the field types are still unknown.
573-
current_bq_schema (Sequence[google.cloud.bigquery.schema.SchemaField]):
574-
A BigQuery schema for ``dataframe``. The types of some or all of
575-
the fields may be ``None``.
563+
name (str):
564+
the column name of the SchemaField.
565+
series (pandas.Series):
566+
The Series data for which to detect the data type.
576567
Returns:
577-
Optional[Sequence[google.cloud.bigquery.schema.SchemaField]]
568+
Optional[google.cloud.bigquery.schema.SchemaField]:
569+
A ``SchemaField`` for the column, with the detected
570+
BigQuery-compatible type string (e.g., "STRING", "INTEGER",
571+
"TIMESTAMP", "DATETIME", "NUMERIC", "BIGNUMERIC") and mode
572+
("NULLABLE" or "REPEATED").
573+
Returns ``None`` if the type cannot be determined or ``pyarrow``
573+
is not imported.
578574
"""
579-
# pytype: disable=attribute-error
580-
augmented_schema = []
581-
unknown_type_fields = []
582-
for field in current_bq_schema:
583-
if field.field_type is not None:
584-
augmented_schema.append(field)
585-
continue
586-
587-
arrow_table = pyarrow.array(dataframe.reset_index()[field.name])
588575

589-
if pyarrow.types.is_list(arrow_table.type):
590-
# `pyarrow.ListType`
591-
detected_mode = "REPEATED"
592-
detected_type = _pyarrow_helpers.arrow_scalar_ids_to_bq(
593-
arrow_table.values.type.id
594-
)
595-
596-
# For timezone-naive datetimes, pyarrow assumes the UTC timezone and adds
597-
# it to such datetimes, causing them to be recognized as TIMESTAMP type.
598-
# We thus additionally check the actual data to see if we need to overrule
599-
# that and choose DATETIME instead.
600-
# Note that this should only be needed for datetime values inside a list,
601-
# since scalar datetime values have a proper Pandas dtype that allows
602-
# distinguishing between timezone-naive and timezone-aware values before
603-
# even requiring the additional schema augment logic in this method.
604-
if detected_type == "TIMESTAMP":
605-
valid_item = _first_array_valid(dataframe[field.name])
606-
if isinstance(valid_item, datetime) and valid_item.tzinfo is None:
607-
detected_type = "DATETIME"
608-
else:
609-
detected_mode = field.mode
610-
detected_type = _pyarrow_helpers.arrow_scalar_ids_to_bq(arrow_table.type.id)
611-
if detected_type == "NUMERIC" and arrow_table.type.scale > 9:
612-
detected_type = "BIGNUMERIC"
613-
614-
if detected_type is None:
615-
unknown_type_fields.append(field)
616-
continue
576+
if not pyarrow:
577+
return None
617578

618-
new_field = schema.SchemaField(
619-
name=field.name,
620-
field_type=detected_type,
621-
mode=detected_mode,
622-
description=field.description,
623-
fields=field.fields,
624-
)
625-
augmented_schema.append(new_field)
579+
arrow_table = pyarrow.array(series)
580+
if pyarrow.types.is_list(arrow_table.type):
581+
# `pyarrow.ListType`
582+
mode = "REPEATED"
583+
type = _pyarrow_helpers.arrow_scalar_ids_to_bq(arrow_table.values.type.id)
584+
585+
# For timezone-naive datetimes, pyarrow assumes the UTC timezone and adds
586+
# it to such datetimes, causing them to be recognized as TIMESTAMP type.
587+
# We thus additionally check the actual data to see if we need to overrule
588+
# that and choose DATETIME instead.
589+
# Note that this should only be needed for datetime values inside a list,
590+
# since scalar datetime values have a proper Pandas dtype that allows
591+
# distinguishing between timezone-naive and timezone-aware values before
592+
# even requiring the additional schema augment logic in this method.
593+
if type == "TIMESTAMP":
594+
valid_item = _first_array_valid(series)
595+
if isinstance(valid_item, datetime) and valid_item.tzinfo is None:
596+
type = "DATETIME"
597+
else:
598+
mode = "NULLABLE" # default mode
599+
type = _pyarrow_helpers.arrow_scalar_ids_to_bq(arrow_table.type.id)
600+
if type == "NUMERIC" and arrow_table.type.scale > 9:
601+
type = "BIGNUMERIC"
626602

627-
if unknown_type_fields:
628-
warnings.warn(
629-
"Pyarrow could not determine the type of columns: {}.".format(
630-
", ".join(field.name for field in unknown_type_fields)
631-
)
632-
)
603+
if type is not None:
604+
return schema.SchemaField(name, type, mode)
605+
else:
633606
return None
634607

635-
return augmented_schema
636-
# pytype: enable=attribute-error
637-
638608

639609
def dataframe_to_arrow(dataframe, bq_schema):
640610
"""Convert pandas dataframe to Arrow table, using BigQuery schema.

google/cloud/bigquery/schema.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -284,15 +284,13 @@ def name(self):
284284
return self._properties.get("name", "")
285285

286286
@property
287-
def field_type(self):
287+
def field_type(self) -> str:
288288
"""str: The type of the field.
289289
290290
See:
291291
https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#TableFieldSchema.FIELDS.type
292292
"""
293293
type_ = self._properties.get("type")
294-
if type_ is None: # Shouldn't happen, but some unit tests do this.
295-
return None
296294
return cast(str, type_).upper()
297295

298296
@property

tests/unit/test__pandas_helpers.py

Lines changed: 36 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -1568,31 +1568,7 @@ def test_augment_schema_type_detection_succeeds(module_under_test):
15681568
# set to "datetime64[ns]", and pyarrow converts that to pyarrow.TimestampArray.
15691569
# We thus cannot expect to get a DATETIME date when converting back to the
15701570
# BigQuery type.
1571-
1572-
current_schema = (
1573-
schema.SchemaField("bool_field", field_type=None, mode="NULLABLE"),
1574-
schema.SchemaField("int_field", field_type=None, mode="NULLABLE"),
1575-
schema.SchemaField("float_field", field_type=None, mode="NULLABLE"),
1576-
schema.SchemaField("time_field", field_type=None, mode="NULLABLE"),
1577-
schema.SchemaField("timestamp_field", field_type=None, mode="NULLABLE"),
1578-
schema.SchemaField("date_field", field_type=None, mode="NULLABLE"),
1579-
schema.SchemaField("bytes_field", field_type=None, mode="NULLABLE"),
1580-
schema.SchemaField("string_field", field_type=None, mode="NULLABLE"),
1581-
schema.SchemaField("numeric_field", field_type=None, mode="NULLABLE"),
1582-
schema.SchemaField("bignumeric_field", field_type=None, mode="NULLABLE"),
1583-
)
1584-
1585-
with warnings.catch_warnings(record=True) as warned:
1586-
augmented_schema = module_under_test.augment_schema(dataframe, current_schema)
1587-
1588-
# there should be no relevant warnings
1589-
unwanted_warnings = [
1590-
warning for warning in warned if "Pyarrow could not" in str(warning)
1591-
]
1592-
assert not unwanted_warnings
1593-
1594-
# the augmented schema must match the expected
1595-
expected_schema = (
1571+
expected_schemas = (
15961572
schema.SchemaField("bool_field", field_type="BOOL", mode="NULLABLE"),
15971573
schema.SchemaField("int_field", field_type="INT64", mode="NULLABLE"),
15981574
schema.SchemaField("float_field", field_type="FLOAT64", mode="NULLABLE"),
@@ -1607,8 +1583,13 @@ def test_augment_schema_type_detection_succeeds(module_under_test):
16071583
),
16081584
)
16091585

1610-
by_name = operator.attrgetter("name")
1611-
assert sorted(augmented_schema, key=by_name) == sorted(expected_schema, key=by_name)
1586+
for col_name, expected_schema in zip(dataframe, expected_schemas):
1587+
with warnings.catch_warnings(record=True) as warned:
1588+
schema_field = module_under_test._get_schema_by_pyarrow(
1589+
col_name, dataframe[col_name]
1590+
)
1591+
assert warned == []
1592+
assert schema_field == expected_schema
16121593

16131594

16141595
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
@@ -1639,30 +1620,20 @@ def test_augment_schema_repeated_fields(module_under_test):
16391620
]
16401621
)
16411622

1642-
current_schema = (
1643-
schema.SchemaField("string_array", field_type=None, mode="NULLABLE"),
1644-
schema.SchemaField("timestamp_array", field_type=None, mode="NULLABLE"),
1645-
schema.SchemaField("datetime_array", field_type=None, mode="NULLABLE"),
1646-
)
1647-
1648-
with warnings.catch_warnings(record=True) as warned:
1649-
augmented_schema = module_under_test.augment_schema(dataframe, current_schema)
1650-
1651-
# there should be no relevant warnings
1652-
unwanted_warnings = [
1653-
warning for warning in warned if "Pyarrow could not" in str(warning)
1654-
]
1655-
assert not unwanted_warnings
1656-
16571623
# the augmented schema must match the expected
1658-
expected_schema = (
1624+
expected_schemas = (
16591625
schema.SchemaField("string_array", field_type="STRING", mode="REPEATED"),
16601626
schema.SchemaField("timestamp_array", field_type="TIMESTAMP", mode="REPEATED"),
16611627
schema.SchemaField("datetime_array", field_type="DATETIME", mode="REPEATED"),
16621628
)
16631629

1664-
by_name = operator.attrgetter("name")
1665-
assert sorted(augmented_schema, key=by_name) == sorted(expected_schema, key=by_name)
1630+
for col_name, expected_schema in zip(dataframe, expected_schemas):
1631+
with warnings.catch_warnings(record=True) as warned:
1632+
schema_field = module_under_test._get_schema_by_pyarrow(
1633+
col_name, dataframe[col_name]
1634+
)
1635+
assert warned == []
1636+
assert schema_field == expected_schema
16661637

16671638

16681639
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
@@ -1681,48 +1652,36 @@ def test_augment_schema_type_detection_fails(module_under_test):
16811652
},
16821653
]
16831654
)
1684-
current_schema = [
1685-
schema.SchemaField("status", field_type="STRING", mode="NULLABLE"),
1686-
schema.SchemaField("struct_field", field_type=None, mode="NULLABLE"),
1687-
schema.SchemaField("struct_field_2", field_type=None, mode="NULLABLE"),
1688-
]
1689-
1690-
with warnings.catch_warnings(record=True) as warned:
1691-
augmented_schema = module_under_test.augment_schema(dataframe, current_schema)
16921655

1693-
assert augmented_schema is None
1656+
expected_schemas = (
1657+
schema.SchemaField("status", field_type="STRING", mode="NULLABLE"),
1658+
# Could not determine the type of these columns
1659+
None,
1660+
None,
1661+
)
16941662

1695-
expected_warnings = [
1696-
warning for warning in warned if "could not determine" in str(warning)
1697-
]
1698-
assert len(expected_warnings) == 1
1699-
warning_msg = str(expected_warnings[0])
1700-
assert "pyarrow" in warning_msg.lower()
1701-
assert "struct_field" in warning_msg and "struct_field_2" in warning_msg
1663+
for col_name, expected_schema in zip(dataframe, expected_schemas):
1664+
with warnings.catch_warnings(record=True) as warned:
1665+
schema_field = module_under_test._get_schema_by_pyarrow(
1666+
col_name, dataframe[col_name]
1667+
)
1668+
assert warned == []
1669+
assert schema_field == expected_schema
17021670

17031671

17041672
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
17051673
def test_augment_schema_type_detection_fails_array_data(module_under_test):
17061674
dataframe = pandas.DataFrame(
17071675
data=[{"all_none_array": [None, float("NaN")], "empty_array": []}]
17081676
)
1709-
current_schema = [
1710-
schema.SchemaField("all_none_array", field_type=None, mode="NULLABLE"),
1711-
schema.SchemaField("empty_array", field_type=None, mode="NULLABLE"),
1712-
]
1713-
1714-
with warnings.catch_warnings(record=True) as warned:
1715-
augmented_schema = module_under_test.augment_schema(dataframe, current_schema)
17161677

1717-
assert augmented_schema is None
1718-
1719-
expected_warnings = [
1720-
warning for warning in warned if "could not determine" in str(warning)
1721-
]
1722-
assert len(expected_warnings) == 1
1723-
warning_msg = str(expected_warnings[0])
1724-
assert "pyarrow" in warning_msg.lower()
1725-
assert "all_none_array" in warning_msg and "empty_array" in warning_msg
1678+
for col_name in dataframe:
1679+
with warnings.catch_warnings(record=True) as warned:
1680+
schema_field = module_under_test._get_schema_by_pyarrow(
1681+
col_name, dataframe[col_name]
1682+
)
1683+
assert warned == []
1684+
assert schema_field == None
17261685

17271686

17281687
@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`")

tests/unit/test_schema.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -640,11 +640,6 @@ def test___repr__(self):
640640
expected = "SchemaField('field1', 'STRING', 'NULLABLE', None, None, (), None)"
641641
self.assertEqual(repr(field1), expected)
642642

643-
def test___repr__type_not_set(self):
644-
field1 = self._make_one("field1", field_type=None)
645-
expected = "SchemaField('field1', None, 'NULLABLE', None, None, (), None)"
646-
self.assertEqual(repr(field1), expected)
647-
648643
def test___repr__evaluable_no_policy_tags(self):
649644
field = self._make_one("field1", "STRING", "REQUIRED", "Description")
650645
field_repr = repr(field)

0 commit comments

Comments
 (0)
