From 487ca3e7d5a61acbeab833ad405ad77494bea2e9 Mon Sep 17 00:00:00 2001 From: Linchin Date: Wed, 8 May 2024 17:25:55 -0700 Subject: [PATCH 1/2] fix: add pyarrow version check for range support --- google/cloud/bigquery/_versions_helpers.py | 10 +++++ google/cloud/bigquery/table.py | 48 +++++++--------------- 2 files changed, 25 insertions(+), 33 deletions(-) diff --git a/google/cloud/bigquery/_versions_helpers.py b/google/cloud/bigquery/_versions_helpers.py index 50d5961b3..9e534db51 100644 --- a/google/cloud/bigquery/_versions_helpers.py +++ b/google/cloud/bigquery/_versions_helpers.py @@ -26,6 +26,9 @@ _BQ_STORAGE_OPTIONAL_READ_SESSION_VERSION = packaging.version.Version("2.6.0") _MIN_PANDAS_VERSION = packaging.version.Version("1.1.0") +_MIN_PANDAS_VERSION_RANGE = packaging.version.Version("1.5.0") +_MIN_PYARROW_VERSION_RANGE = packaging.version.Version("10.0.1") + class PyarrowVersions: """Version comparisons for pyarrow package.""" @@ -234,3 +237,10 @@ def try_import(self, raise_if_error: bool = False) -> Any: PANDAS_VERSIONS = PandasVersions() + +SUPPORTS_RANGE_PYARROW = ( + PANDAS_VERSIONS.try_import() is not None + and PANDAS_VERSIONS.installed_version >= _MIN_PANDAS_VERSION_RANGE + and PYARROW_VERSIONS.try_import() is not None + and PYARROW_VERSIONS.installed_version >= _MIN_PYARROW_VERSION_RANGE +) diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 2f07bcc78..ad1253195 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -100,6 +100,12 @@ "because the necessary `__from_arrow__` attribute is missing." ) +_RANGE_PYARROW_WARNING = ( + "Unable to represent RANGE schema as struct using pandas ArrowDtype. Using " + "`object` instead. To use ArrowDtype, use pandas >= 1.5 and " + "pyarrow >= 10.0.1." +) + # How many of the total rows need to be downloaded already for us to skip # calling the BQ Storage API? ALMOST_COMPLETELY_CACHED_RATIO = 0.333 @@ -2279,26 +2285,18 @@ def to_dataframe( time_dtype = db_dtypes.TimeDtype() if range_date_dtype is DefaultPandasDTypes.RANGE_DATE_DTYPE: - try: + if _versions_helpers.SUPPORTS_RANGE_PYARROW: range_date_dtype = pandas.ArrowDtype( pyarrow.struct( [("start", pyarrow.date32()), ("end", pyarrow.date32())] ) ) - except AttributeError: - # pandas.ArrowDtype was introduced in pandas 1.5, but python 3.7 - # only supports upto pandas 1.3. If pandas.ArrowDtype is not - # present, we raise a warning and set range_date_dtype to None. - msg = ( - "Unable to find class ArrowDtype in pandas, setting " - "range_date_dtype to be None. To use ArrowDtype, please " - "use pandas >= 1.5 and python >= 3.8." - ) - warnings.warn(msg) + else: + warnings.warn(_RANGE_PYARROW_WARNING) range_date_dtype = None if range_datetime_dtype is DefaultPandasDTypes.RANGE_DATETIME_DTYPE: - try: + if _versions_helpers.SUPPORTS_RANGE_PYARROW: range_datetime_dtype = pandas.ArrowDtype( pyarrow.struct( [ @@ -2307,20 +2305,12 @@ def to_dataframe( ] ) ) - except AttributeError: - # pandas.ArrowDtype was introduced in pandas 1.5, but python 3.7 - # only supports upto pandas 1.3. If pandas.ArrowDtype is not - # present, we raise a warning and set range_datetime_dtype to None. - msg = ( - "Unable to find class ArrowDtype in pandas, setting " - "range_datetime_dtype to be None. To use ArrowDtype, " - "please use pandas >= 1.5 and python >= 3.8." - ) - warnings.warn(msg) + else: + warnings.warn(_RANGE_PYARROW_WARNING) range_datetime_dtype = None if range_timestamp_dtype is DefaultPandasDTypes.RANGE_TIMESTAMP_DTYPE: - try: + if _versions_helpers.SUPPORTS_RANGE_PYARROW: range_timestamp_dtype = pandas.ArrowDtype( pyarrow.struct( [ @@ -2329,16 +2319,8 @@ def to_dataframe( ] ) ) - except AttributeError: - # pandas.ArrowDtype was introduced in pandas 1.5, but python 3.7 - # only supports upto pandas 1.3. If pandas.ArrowDtype is not - # present, we raise a warning and set range_timestamp_dtype to None. - msg = ( - "Unable to find class ArrowDtype in pandas, setting " - "range_timestamp_dtype to be None. To use ArrowDtype, " - "please use pandas >= 1.5 and python >= 3.8." - ) - warnings.warn(msg) + else: + warnings.warn(_RANGE_PYARROW_WARNING) range_timestamp_dtype = None if bool_dtype is not None and not hasattr(bool_dtype, "__from_arrow__"): From dab8af5157f21dad5757f7b117393a0d9ec98914 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Thu, 9 May 2024 09:28:03 -0500 Subject: [PATCH 2/2] add comment why we are making a separate constant --- google/cloud/bigquery/_versions_helpers.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/google/cloud/bigquery/_versions_helpers.py b/google/cloud/bigquery/_versions_helpers.py index 9e534db51..72d4c921d 100644 --- a/google/cloud/bigquery/_versions_helpers.py +++ b/google/cloud/bigquery/_versions_helpers.py @@ -238,6 +238,10 @@ def try_import(self, raise_if_error: bool = False) -> Any: PANDAS_VERSIONS = PandasVersions() +# Since RANGE support in pandas requires specific versions +# of both pyarrow and pandas, we make this a separate +# constant instead of as a property of PANDAS_VERSIONS +# or PYARROW_VERSIONS. SUPPORTS_RANGE_PYARROW = ( PANDAS_VERSIONS.try_import() is not None and PANDAS_VERSIONS.installed_version >= _MIN_PANDAS_VERSION_RANGE