diff --git a/doc/progress.rst b/doc/progress.rst index e2472f749..e599a0ad3 100644 --- a/doc/progress.rst +++ b/doc/progress.rst @@ -9,7 +9,6 @@ Changelog 0.13.1 ~~~~~~ - * ADD #1081 #1132: Add additional options for (not) downloading datasets ``openml.datasets.get_dataset`` and cache management. * ADD #1028: Add functions to delete runs, flows, datasets, and tasks (e.g., ``openml.datasets.delete_dataset``). * ADD #1144: Add locally computed results to the ``OpenMLRun`` object's representation if the run was created locally and not downloaded from the server. * ADD #1180: Improve the error message when the checksum of a downloaded dataset does not match the checksum provided by the API. diff --git a/openml/datasets/data_feature.py b/openml/datasets/data_feature.py index 06da3aec8..a1e2556be 100644 --- a/openml/datasets/data_feature.py +++ b/openml/datasets/data_feature.py @@ -62,11 +62,5 @@ def __init__( def __repr__(self): return "[%d - %s (%s)]" % (self.index, self.name, self.data_type) - def __eq__(self, other): - if not isinstance(other, OpenMLDataFeature): - return False - - return self.__dict__ == other.__dict__ - def _repr_pretty_(self, pp, cycle): pp.text(str(self)) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index d7ebbd0d6..a506ca450 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -7,7 +7,6 @@ import os import pickle from typing import List, Optional, Union, Tuple, Iterable, Dict -import warnings import arff import numpy as np @@ -19,6 +18,7 @@ from .data_feature import OpenMLDataFeature from ..exceptions import PyOpenMLError + logger = logging.getLogger(__name__) @@ -212,22 +212,17 @@ def find_invalid_characters(string, pattern): self._dataset = dataset self._minio_url = minio_url - self._features = None # type: Optional[Dict[int, OpenMLDataFeature]] - self._qualities = None # type: Optional[Dict[str, float]] - self._no_qualities_found = False - if features_file is not None: - self._features = _read_features(features_file) - - if qualities_file == "": - # TODO(0.15): to switch to "qualities_file is not None" below and remove warning - warnings.warn( - "Starting from Version 0.15 `qualities_file` must be None and not an empty string.", - FutureWarning, - ) + self.features = _read_features( + features_file + ) # type: Optional[Dict[int, OpenMLDataFeature]] + else: + self.features = None if qualities_file: - self._qualities = _read_qualities(qualities_file) + self.qualities = _read_qualities(qualities_file) # type: Optional[Dict[str, float]] + else: + self.qualities = None if data_file is not None: rval = self._compressed_cache_file_paths(data_file) @@ -239,36 +234,12 @@ def find_invalid_characters(string, pattern): self.data_feather_file = None self.feather_attribute_file = None - @property - def features(self): - # Lazy loading of features - if self._features is None: - self._load_metadata(features=True) - - return self._features - - @property - def qualities(self): - # Lazy loading of qualities - # We have to check `_no_qualities_found` as there might not be qualities for a dataset - if self._qualities is None and (not self._no_qualities_found): - self._load_metadata(qualities=True) - - return self._qualities - @property def id(self) -> Optional[int]: return self.dataset_id def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: """Collect all information to display in the __repr__ body.""" - - # Obtain number of features in accordance with lazy loading. 
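# Illustrative sketch (not part of the patch): with lazy loading removed,
# `features` and `qualities` are plain attributes populated once in __init__.
# A minimal usage sketch; the dataset id (2, "anneal") and the printed values
# are examples only:
import openml

dataset = openml.datasets.get_dataset(2)  # metadata now loads eagerly
feature = dataset.features[0]  # features: Dict[int, OpenMLDataFeature]
print(feature)  # e.g. "[0 - family (nominal)]", per OpenMLDataFeature.__repr__
print(dataset.qualities["NumberOfInstances"])  # qualities: Dict[str, float]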
- if self._qualities is not None and self._qualities["NumberOfFeatures"] is not None: - n_features = int(self._qualities["NumberOfFeatures"]) # type: Optional[int] - else: - n_features = len(self._features) if self._features is not None else None - fields = { "Name": self.name, "Version": self.version, @@ -277,14 +248,14 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: "Download URL": self.url, "Data file": self.data_file, "Pickle file": self.data_pickle_file, - "# of features": n_features, + "# of features": len(self.features) if self.features is not None else None, } if self.upload_date is not None: fields["Upload Date"] = self.upload_date.replace("T", " ") if self.dataset_id is not None: fields["OpenML URL"] = self.openml_url - if self._qualities is not None and self._qualities["NumberOfInstances"] is not None: - fields["# of instances"] = int(self._qualities["NumberOfInstances"]) + if self.qualities is not None and self.qualities["NumberOfInstances"] is not None: + fields["# of instances"] = int(self.qualities["NumberOfInstances"]) # determines the order in which the information will be printed order = [ @@ -802,40 +773,6 @@ def get_data( return data, targets, categorical, attribute_names - def _load_metadata(self, features: bool = False, qualities: bool = False): - """Load the missing metadata information from the server and store it in the - dataset object. - - The purpose of the function is to support lazy loading. - - Parameters - ---------- - features : bool (default=False) - If True, load the `self.features` data if not already loaded. - qualities: bool (default=False) - If True, load the `self.qualities` data if not already loaded. - """ - # Delayed Import to avoid circular imports or having to import all of dataset.functions to - # import OpenMLDataset - from openml.datasets.functions import _get_dataset_metadata - - if self.dataset_id is None: - raise ValueError( - """No dataset id specified. Please set the dataset id. - Otherwise we cannot load metadata.""" - ) - - features_file, qualities_file = _get_dataset_metadata( - self.dataset_id, features=features, qualities=qualities - ) - - if features_file is not None: - self._features = _read_features(features_file) - - if qualities_file is not None: - self._qualities = _read_qualities(qualities_file) - self._no_qualities_found = self._qualities is None - def retrieve_class_labels(self, target_name: str = "class") -> Union[None, List[str]]: """Reads the datasets arff to determine the class-labels. @@ -853,6 +790,10 @@ def retrieve_class_labels(self, target_name: str = "class") -> Union[None, List[ ------- list """ + if self.features is None: + raise ValueError( + "retrieve_class_labels can only be called if feature information is available." 
+ ) for feature in self.features.values(): if (feature.name == target_name) and (feature.data_type == "nominal"): return feature.nominal_values @@ -981,7 +922,6 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": return data_container -# -- Code for Features Property def _read_features(features_file: str) -> Dict[int, OpenMLDataFeature]: features_pickle_file = _get_features_pickle_file(features_file) try: @@ -990,41 +930,35 @@ def _read_features(features_file: str) -> Dict[int, OpenMLDataFeature]: except: # noqa E722 with open(features_file, encoding="utf8") as fh: features_xml_string = fh.read() - - features = _parse_features_xml(features_xml_string) + xml_dict = xmltodict.parse( + features_xml_string, force_list=("oml:feature", "oml:nominal_value") + ) + features_xml = xml_dict["oml:data_features"] + + features = {} + for idx, xmlfeature in enumerate(features_xml["oml:feature"]): + nr_missing = xmlfeature.get("oml:number_of_missing_values", 0) + feature = OpenMLDataFeature( + int(xmlfeature["oml:index"]), + xmlfeature["oml:name"], + xmlfeature["oml:data_type"], + xmlfeature.get("oml:nominal_value"), + int(nr_missing), + ) + if idx != feature.index: + raise ValueError("Data features not provided in right order") + features[feature.index] = feature with open(features_pickle_file, "wb") as fh_binary: pickle.dump(features, fh_binary) return features -def _parse_features_xml(features_xml_string): - xml_dict = xmltodict.parse(features_xml_string, force_list=("oml:feature", "oml:nominal_value")) - features_xml = xml_dict["oml:data_features"] - - features = {} - for idx, xmlfeature in enumerate(features_xml["oml:feature"]): - nr_missing = xmlfeature.get("oml:number_of_missing_values", 0) - feature = OpenMLDataFeature( - int(xmlfeature["oml:index"]), - xmlfeature["oml:name"], - xmlfeature["oml:data_type"], - xmlfeature.get("oml:nominal_value"), - int(nr_missing), - ) - if idx != feature.index: - raise ValueError("Data features not provided in right order") - features[feature.index] = feature - - return features - - def _get_features_pickle_file(features_file: str) -> str: """This function only exists so it can be mocked during unit testing""" return features_file + ".pkl" -# -- Code for Qualities Property def _read_qualities(qualities_file: str) -> Dict[str, float]: qualities_pickle_file = _get_qualities_pickle_file(qualities_file) try: @@ -1033,12 +967,19 @@ def _read_qualities(qualities_file: str) -> Dict[str, float]: except: # noqa E722 with open(qualities_file, encoding="utf8") as fh: qualities_xml = fh.read() - qualities = _parse_qualities_xml(qualities_xml) + xml_as_dict = xmltodict.parse(qualities_xml, force_list=("oml:quality",)) + qualities = xml_as_dict["oml:data_qualities"]["oml:quality"] + qualities = _check_qualities(qualities) with open(qualities_pickle_file, "wb") as fh_binary: pickle.dump(qualities, fh_binary) return qualities +def _get_qualities_pickle_file(qualities_file: str) -> str: + """This function only exists so it can be mocked during unit testing""" + return qualities_file + ".pkl" + + def _check_qualities(qualities: List[Dict[str, str]]) -> Dict[str, float]: qualities_ = {} for xmlquality in qualities: @@ -1051,14 +992,3 @@ def _check_qualities(qualities: List[Dict[str, str]]) -> Dict[str, float]: value = float(xmlquality["oml:value"]) qualities_[name] = value return qualities_ - - -def _parse_qualities_xml(qualities_xml): - xml_as_dict = xmltodict.parse(qualities_xml, force_list=("oml:quality",)) - qualities = xml_as_dict["oml:data_qualities"]["oml:quality"] - 
return _check_qualities(qualities)
-
-
-def _get_qualities_pickle_file(qualities_file: str) -> str:
-    """This function only exists so it can be mocked during unit testing"""
-    return qualities_file + ".pkl"
diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
index e8b7992e2..8847f4d04 100644
--- a/openml/datasets/functions.py
+++ b/openml/datasets/functions.py
@@ -4,7 +4,7 @@
 import logging
 import os
 from pyexpat import ExpatError
-from typing import List, Dict, Union, Optional, cast, Tuple
+from typing import List, Dict, Union, Optional, cast
 import warnings
 
 import numpy as np
@@ -25,12 +25,15 @@
     OpenMLServerException,
     OpenMLPrivateDatasetError,
 )
-from ..utils import _remove_cache_dir_for_id, _create_cache_directory_for_id, _get_cache_dir_for_id
+from ..utils import (
+    _remove_cache_dir_for_id,
+    _create_cache_directory_for_id,
+)
+
 
 DATASETS_CACHE_DIR_NAME = "datasets"
 logger = logging.getLogger(__name__)
-
 
 ############################################################################
 # Local getters/accessors to the cache directory
 
 
@@ -347,28 +350,18 @@ def get_datasets(
 @openml.utils.thread_safe_if_oslo_installed
 def get_dataset(
     dataset_id: Union[int, str],
-    download_data: Optional[bool] = None,  # Optional for deprecation warning; later again only bool
+    download_data: bool = True,
     version: Optional[int] = None,
     error_if_multiple: bool = False,
     cache_format: str = "pickle",
-    download_qualities: Optional[bool] = None,  # Same as above
-    download_features_meta_data: Optional[bool] = None,  # Same as above
+    download_qualities: bool = True,
     download_all_files: bool = False,
-    force_refresh_cache: bool = False,
 ) -> OpenMLDataset:
     """Download the OpenML dataset representation, optionally also download actual data file.
 
-    This function is by default NOT thread/multiprocessing safe, as this function uses caching.
-    A check will be performed to determine if the information has previously been downloaded to a
-    cache, and if so be loaded from disk instead of retrieved from the server.
-
-    To make this function thread safe, you can install the python package ``oslo.concurrency``.
-    If ``oslo.concurrency`` is installed `get_dataset` becomes thread safe.
-
-    Alternatively, to make this function thread/multiprocessing safe initialize the cache first by
-    calling `get_dataset(args)` once before calling `get_datasett(args)` many times in parallel.
-    This will initialize the cache and later calls will use the cache in a thread/multiprocessing
-    safe way.
+    This function is thread/multiprocessing safe if ``oslo.concurrency`` is installed.
+    This function uses caching: a check is performed to determine whether the information has
+    previously been downloaded, and if so it is loaded from disk instead of retrieved from the server.
 
     If dataset is retrieved by name, a version may be specified.
     If no version is specified and multiple versions of the dataset exist,
@@ -390,55 +383,21 @@
         If no version is specified, retrieve the least recent still active version.
     error_if_multiple : bool (default=False)
         If ``True`` raise an error if multiple datasets are found with matching criteria.
-    cache_format : str (default='pickle') in {'pickle', 'feather'}
+    cache_format : str (default='pickle')
         Format for caching the dataset - may be feather or pickle
         Note that the default 'pickle' option may load slower than feather
         when no.of.rows is very high.
     download_qualities : bool (default=True)
         Option to download 'qualities' meta-data in addition to the minimal dataset description.
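# Illustrative sketch (not part of the patch) of the cache-or-parse pattern
# shared by _read_features and _read_qualities above: try the pickled parse
# result first, fall back to the XML, then refresh the pickle. The file names
# and the force_list key below are only examples:
import pickle

import xmltodict

def _read_cached_xml(xml_file: str) -> dict:
    pickle_file = xml_file + ".pkl"  # same naming scheme as the helpers above
    try:
        with open(pickle_file, "rb") as fh:
            return pickle.load(fh)  # fast path: reuse the cached parse result
    except Exception:  # missing or corrupt pickle: re-parse the XML
        with open(xml_file, encoding="utf8") as fh:
            parsed = xmltodict.parse(fh.read(), force_list=("oml:quality",))
        with open(pickle_file, "wb") as fh:
            pickle.dump(parsed, fh)  # refresh the cache for the next call
        return parsed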
- If True, download and cache the qualities file. - If False, create the OpenMLDataset without qualities metadata. The data may later be added - to the OpenMLDataset through the `OpenMLDataset.load_metadata(qualities=True)` method. - download_features_meta_data : bool (default=True) - Option to download 'features' meta-data in addition to the minimal dataset description. - If True, download and cache the features file. - If False, create the OpenMLDataset without features metadata. The data may later be added - to the OpenMLDataset through the `OpenMLDataset.load_metadata(features=True)` method. download_all_files: bool (default=False) EXPERIMENTAL. Download all files related to the dataset that reside on the server. Useful for datasets which refer to auxiliary files (e.g., meta-album). - force_refresh_cache : bool (default=False) - Force the cache to refreshed by deleting the cache directory and re-downloading the data. - Note, if `force_refresh_cache` is True, `get_dataset` is NOT thread/multiprocessing safe, - because this creates a race condition to creating and deleting the cache; as in general with - the cache. Returns ------- dataset : :class:`openml.OpenMLDataset` The downloaded dataset. """ - # TODO(0.15): Remove the deprecation warning and make the default False; adjust types above - # and documentation. Also remove None-to-True-cases below - if any( - download_flag is None - for download_flag in [download_data, download_qualities, download_features_meta_data] - ): - warnings.warn( - "Starting from Version 0.15 `download_data`, `download_qualities`, and `download_featu" - "res_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy " - "loading. To disable this message until version 0.15 explicitly set `download_data`, " - "`download_qualities`, and `download_features_meta_data` to a bool while calling " - "`get_dataset`.", - FutureWarning, - ) - - download_data = True if download_data is None else download_data - download_qualities = True if download_qualities is None else download_qualities - download_features_meta_data = ( - True if download_features_meta_data is None else download_features_meta_data - ) - if download_all_files: warnings.warn( "``download_all_files`` is experimental and is likely to break with new releases." @@ -460,15 +419,6 @@ def get_dataset( "`dataset_id` must be one of `str` or `int`, not {}.".format(type(dataset_id)) ) - # Developer Documentation: we could also (quite heavily) re-implement the below to only download - # the data and do not cache the data at all. This would always be thread/multiprocessing safe. - # However, this would likely drastically increase the strain on the server and make working with - # OpenML really slow. Hence, we stick to the alternatives mentioned in the docstring. 
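# Illustrative sketch (not part of the patch): usage of the simplified flags
# documented above; dataset id 2 is an example. Skipping qualities now leaves
# the attribute as None, because there is no lazy re-fetch anymore:
import openml

dataset = openml.datasets.get_dataset(2, download_data=False, download_qualities=False)
assert dataset.qualities is None  # mirrors test__get_dataset_skip_download below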
- if force_refresh_cache: - did_cache_dir = _get_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, dataset_id) - if os.path.exists(did_cache_dir): - _remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, did_cache_dir) - did_cache_dir = _create_cache_directory_for_id( DATASETS_CACHE_DIR_NAME, dataset_id, @@ -477,10 +427,19 @@ def get_dataset( remove_dataset_cache = True try: description = _get_dataset_description(did_cache_dir, dataset_id) + features_file = _get_dataset_features_file(did_cache_dir, dataset_id) - features_file, qualities_file = _get_dataset_metadata( - dataset_id, download_features_meta_data, download_qualities, did_cache_dir - ) + try: + if download_qualities: + qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id) + else: + qualities_file = "" + except OpenMLServerException as e: + if e.code == 362 and str(e) == "No qualities found - None": + logger.warning("No qualities found for dataset {}".format(dataset_id)) + qualities_file = None + else: + raise arff_file = _get_dataset_arff(description) if download_data else None if "oml:minio_url" in description and download_data: @@ -1142,11 +1101,6 @@ def _get_dataset_arff( return output_file_path -def _get_features_xml(dataset_id): - url_extension = "data/features/{}".format(dataset_id) - return openml._api_calls._perform_api_call(url_extension, "get") - - def _get_dataset_features_file(did_cache_dir: str, dataset_id: int) -> str: """API call to load dataset features. Loads from cache or downloads them. @@ -1172,18 +1126,14 @@ def _get_dataset_features_file(did_cache_dir: str, dataset_id: int) -> str: # Dataset features aren't subject to change... if not os.path.isfile(features_file): - features_xml = _get_features_xml(dataset_id) + url_extension = "data/features/{}".format(dataset_id) + features_xml = openml._api_calls._perform_api_call(url_extension, "get") with io.open(features_file, "w", encoding="utf8") as fh: fh.write(features_xml) return features_file -def _get_qualities_xml(dataset_id): - url_extension = "data/qualities/{}".format(dataset_id) - return openml._api_calls._perform_api_call(url_extension, "get") - - def _get_dataset_qualities_file(did_cache_dir, dataset_id): """API call to load dataset qualities. Loads from cache or downloads them. @@ -1212,67 +1162,17 @@ def _get_dataset_qualities_file(did_cache_dir, dataset_id): with io.open(qualities_file, encoding="utf8") as fh: qualities_xml = fh.read() except (OSError, IOError): - qualities_xml = _get_qualities_xml(dataset_id) + url_extension = "data/qualities/{}".format(dataset_id) + qualities_xml = openml._api_calls._perform_api_call(url_extension, "get") with io.open(qualities_file, "w", encoding="utf8") as fh: fh.write(qualities_xml) return qualities_file -def _get_dataset_metadata( - dataset_id: int, features: bool, qualities: bool, did_cache_dir: Optional[str] = None -) -> Tuple[Union[str, None], Union[str, None]]: - """Download the files and initialize the cache for the metadata for a dataset. If the cache is - already initialized, the files are only loaded from the cache. - - This includes the features and qualities of the dataset. - - Parameters - ---------- - dataset_id: int - ID of the dataset for which the metadata is requested. - features: bool - Whether to return the features in the metadata. - qualities - - did_cache_dir - - Returns - ------- - features_file: str or None - Path to the features file. None if features=False. - qualities_file: str or None - Path to the qualities file. None if qualities=False. 
- """ - - # Init cache directory if needed - if did_cache_dir is None: - did_cache_dir = _create_cache_directory_for_id( - DATASETS_CACHE_DIR_NAME, - dataset_id, - ) - features_file = None - qualities_file = None - - if features: - features_file = _get_dataset_features_file(did_cache_dir, dataset_id) - - if qualities: - try: - qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id) - except OpenMLServerException as e: - if e.code == 362 and str(e) == "No qualities found - None": - # quality file stays as None - logger.warning("No qualities found for dataset {}".format(dataset_id)) - else: - raise - - return features_file, qualities_file - - def _create_dataset_from_description( description: Dict[str, str], - features_file: Optional[str] = None, - qualities_file: Optional[str] = None, + features_file: str, + qualities_file: str, arff_file: Optional[str] = None, parquet_file: Optional[str] = None, cache_format: str = "pickle", diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 1e5f519df..8ca0b0651 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -102,8 +102,8 @@ def run_model_on_task( warnings.warn( "avoid_duplicate_runs is set to True, but no API key is set. " "Please set your API key in the OpenML configuration file, see" - "https://openml.github.io/openml-python/main/examples/20_basic/introduction_tutorial" - + ".html#authentication for more information on authentication.", + "https://openml.github.io/openml-python/main/examples/20_basic/introduction_tutorial.html#authentication" + "for more information on authentication.", ) # TODO: At some point in the future do not allow for arguments in old order (6-2018). diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index 798318d2f..8ee372141 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -23,6 +23,7 @@ import openml.utils import openml._api_calls + TASKS_CACHE_DIR_NAME = "tasks" @@ -326,54 +327,31 @@ def get_tasks( @openml.utils.thread_safe_if_oslo_installed def get_task( - task_id: int, *dataset_args, download_splits: Optional[bool] = None, **get_dataset_kwargs + task_id: int, download_data: bool = True, download_qualities: bool = True ) -> OpenMLTask: """Download OpenML task for a given task ID. - Downloads the task representation. By default, this will also download the data splits and - the dataset. From version 0.15.0 onwards, the splits will not be downloaded by default - nor the dataset. - - Use the `download_splits` parameter to control whether the splits are downloaded. - Moreover, you may pass additional parameter (args or kwargs) that are passed to - :meth:`openml.datasets.get_dataset`. - For backwards compatibility, if `download_data` is passed as an additional parameter and - `download_splits` is not explicitly set, `download_data` also overrules `download_splits`'s - value (deprecated from Version 0.15.0 onwards). + Downloads the task representation, while the data splits can be + downloaded optionally based on the additional parameter. Else, + splits will either way be downloaded when the task is being used. Parameters ---------- task_id : int The OpenML task id of the task to download. - download_splits: bool (default=True) - Whether to download the splits as well. From version 0.15.0 onwards this is independent - of download_data and will default to ``False``. - dataset_args, get_dataset_kwargs : - Args and kwargs can be used pass optional parameters to :meth:`openml.datasets.get_dataset`. - This includes `download_data`. 
If set to True the splits are downloaded as well - (deprecated from Version 0.15.0 onwards). The args are only present for backwards - compatibility and will be removed from version 0.15.0 onwards. + download_data : bool (default=True) + Option to trigger download of data along with the meta data. + download_qualities : bool (default=True) + Option to download 'qualities' meta-data in addition to the minimal dataset description. Returns ------- - task: OpenMLTask + task """ - if download_splits is None: - # TODO(0.15): Switch download splits to False by default, adjust typing above, adjust - # documentation above, and remove warning. - warnings.warn( - "Starting from Version 0.15.0 `download_splits` will default to ``False`` instead " - "of ``True`` and be independent from `download_data`. To disable this message until " - "version 0.15 explicitly set `download_splits` to a bool.", - FutureWarning, - ) - download_splits = get_dataset_kwargs.get("download_data", True) - if not isinstance(task_id, int): - # TODO(0.15): Remove warning warnings.warn( "Task id must be specified as `int` from 0.14.0 onwards.", - FutureWarning, + DeprecationWarning, ) try: @@ -388,15 +366,15 @@ def get_task( try: task = _get_task_description(task_id) - dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs) - # List of class labels available in dataset description + dataset = get_dataset(task.dataset_id, download_data, download_qualities=download_qualities) + # List of class labels availaible in dataset description # Including class labels as part of task meta data handles # the case where data download was initially disabled if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)): task.class_labels = dataset.retrieve_class_labels(task.target_name) # Clustering tasks do not have class labels # and do not offer download_split - if download_splits: + if download_data: if isinstance(task, OpenMLSupervisedTask): task.download_split() except Exception as e: diff --git a/openml/utils.py b/openml/utils.py index ffcc308dd..7f99fbba2 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -18,6 +18,7 @@ if TYPE_CHECKING: from openml.base import OpenMLBase + oslo_installed = False try: # Currently, importing oslo raises a lot of warning that it will stop working @@ -302,33 +303,18 @@ def _list_all(listing_call, output_format="dict", *args, **filters): return result -def _get_cache_dir_for_key(key): - cache = config.get_cache_directory() - return os.path.join(cache, key) - - def _create_cache_directory(key): - cache_dir = _get_cache_dir_for_key(key) - + cache = config.get_cache_directory() + cache_dir = os.path.join(cache, key) try: os.makedirs(cache_dir, exist_ok=True) except Exception as e: raise openml.exceptions.OpenMLCacheException( f"Cannot create cache directory {cache_dir}." ) from e - return cache_dir -def _get_cache_dir_for_id(key, id_, create=False): - if create: - cache_dir = _create_cache_directory(key) - else: - cache_dir = _get_cache_dir_for_key(key) - - return os.path.join(cache_dir, str(id_)) - - def _create_cache_directory_for_id(key, id_): """Create the cache directory for a specific ID @@ -350,7 +336,7 @@ def _create_cache_directory_for_id(key, id_): str Path of the created dataset cache directory. 
""" - cache_dir = _get_cache_dir_for_id(key, id_, create=True) + cache_dir = os.path.join(_create_cache_directory(key), str(id_)) if os.path.isdir(cache_dir): pass elif os.path.exists(cache_dir): diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 964a41294..f288f152a 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -262,36 +262,6 @@ def test_get_data_corrupt_pickle(self): self.assertIsInstance(xy, pd.DataFrame) self.assertEqual(xy.shape, (150, 5)) - def test_lazy_loading_metadata(self): - # Initial Setup - did_cache_dir = openml.utils._create_cache_directory_for_id( - openml.datasets.functions.DATASETS_CACHE_DIR_NAME, 2 - ) - _compare_dataset = openml.datasets.get_dataset( - 2, download_data=False, download_features_meta_data=True, download_qualities=True - ) - change_time = os.stat(did_cache_dir).st_mtime - - # Test with cache - _dataset = openml.datasets.get_dataset( - 2, download_data=False, download_features_meta_data=False, download_qualities=False - ) - self.assertEqual(change_time, os.stat(did_cache_dir).st_mtime) - self.assertEqual(_dataset.features, _compare_dataset.features) - self.assertEqual(_dataset.qualities, _compare_dataset.qualities) - - # -- Test without cache - openml.utils._remove_cache_dir_for_id( - openml.datasets.functions.DATASETS_CACHE_DIR_NAME, did_cache_dir - ) - - _dataset = openml.datasets.get_dataset( - 2, download_data=False, download_features_meta_data=False, download_qualities=False - ) - self.assertNotEqual(change_time, os.stat(did_cache_dir).st_mtime) - self.assertEqual(_dataset.features, _compare_dataset.features) - self.assertEqual(_dataset.qualities, _compare_dataset.qualities) - class OpenMLDatasetTestOnTestServer(TestBase): def setUp(self): diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 749a1c6c0..2aa792b91 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -546,47 +546,8 @@ def test__get_dataset_qualities(self): self.assertTrue(os.path.exists(qualities_xml_path)) def test__get_dataset_skip_download(self): - dataset = openml.datasets.get_dataset( - 2, download_qualities=False, download_features_meta_data=False - ) - # Internal representation without lazy loading - self.assertIsNone(dataset._qualities) - self.assertIsNone(dataset._features) - # External representation with lazy loading - self.assertIsNotNone(dataset.qualities) - self.assertIsNotNone(dataset.features) - - def test_get_dataset_force_refresh_cache(self): - did_cache_dir = _create_cache_directory_for_id( - DATASETS_CACHE_DIR_NAME, - 2, - ) - openml.datasets.get_dataset(2) - change_time = os.stat(did_cache_dir).st_mtime - - # Test default - openml.datasets.get_dataset(2) - self.assertEqual(change_time, os.stat(did_cache_dir).st_mtime) - - # Test refresh - openml.datasets.get_dataset(2, force_refresh_cache=True) - self.assertNotEqual(change_time, os.stat(did_cache_dir).st_mtime) - - # Clean up - openml.utils._remove_cache_dir_for_id( - DATASETS_CACHE_DIR_NAME, - did_cache_dir, - ) - - # Test clean start - openml.datasets.get_dataset(2, force_refresh_cache=True) - self.assertTrue(os.path.exists(did_cache_dir)) - - # Final clean up - openml.utils._remove_cache_dir_for_id( - DATASETS_CACHE_DIR_NAME, - did_cache_dir, - ) + qualities = openml.datasets.get_dataset(2, download_qualities=False).qualities + self.assertIsNone(qualities) def test_deletion_of_cache_dir(self): # Simple 
removal
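# Illustrative sketch (not part of the patch): the inlined cache-path logic in
# openml/utils.py reduces to one os.path.join per id. A standalone sketch,
# assuming a cache root such as ~/.openml (the real root comes from
# config.get_cache_directory()):
import os

def cache_dir_for_id(cache_root: str, key: str, id_: int) -> str:
    cache_dir = os.path.join(cache_root, key)  # mirrors _create_cache_directory
    os.makedirs(cache_dir, exist_ok=True)
    return os.path.join(cache_dir, str(id_))  # mirrors _create_cache_directory_for_id

print(cache_dir_for_id(os.path.expanduser("~/.openml"), "datasets", 2))
# -> e.g. /home/user/.openml/datasets/2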