From 01bf45932fd3a70b972694f7bc3634c55fb476f2 Mon Sep 17 00:00:00 2001 From: Saesha <85289569+psaesha@users.noreply.github.com> Date: Thu, 18 Jun 2026 01:12:28 +0530 Subject: [PATCH] Revert "Dev" --- .gitignore | 2 - api/models/Resource.py | 7 +- api/schema/access_model_schema.py | 28 +- api/schema/aimodel_schema.py | 14 +- api/schema/dataset_schema.py | 1 - api/schema/resource_chart_schema.py | 22 +- api/schema/resource_schema.py | 74 ++-- api/schema/usecase_schema.py | 39 +- api/types/type_aimodel.py | 2 - api/types/type_collaborative.py | 22 +- api/types/type_collaborative_organization.py | 2 - api/types/type_metadata.py | 2 - api/types/type_resource_chart.py | 14 +- api/types/type_resource_chart_image.py | 2 - api/types/type_usecase.py | 18 +- api/types/type_usecase_organization.py | 2 - api/urls.py | 16 - api/utils/data_indexing.py | 224 +----------- api/utils/keycloak_utils.py | 6 +- api/views/dataset_data.py | 359 ------------------- dataspace_sdk/__version__.py | 2 +- dataspace_sdk/resources/datasets.py | 201 +---------- docs/dataset_data_api.md | 266 -------------- docs/sdk/README.md | 44 --- tests/test_data_indexing_filters.py | 174 --------- tests/test_datasets.py | 112 ------ tests/test_settings.py | 53 +-- 27 files changed, 178 insertions(+), 1530 deletions(-) delete mode 100644 api/views/dataset_data.py delete mode 100644 docs/dataset_data_api.md delete mode 100644 tests/test_data_indexing_filters.py diff --git a/.gitignore b/.gitignore index 18886c7e..8fd0cefe 100644 --- a/.gitignore +++ b/.gitignore @@ -168,5 +168,3 @@ files/public/* #dvc files dvc dvc/* - -.DS_Store diff --git a/api/models/Resource.py b/api/models/Resource.py index 745878f3..35562f26 100644 --- a/api/models/Resource.py +++ b/api/models/Resource.py @@ -188,12 +188,7 @@ def version_resource_with_dvc(sender, instance: ResourceFileDetails, created, ** # Create a temporary directory for the previous version with tempfile.TemporaryDirectory() as temp_dir: # Get the previous version file path - file_name = instance.file.name - if not file_name: - raise ValueError("File name is missing") - prev_file_name = ( - f"prev_version_{instance.resource.id}.{file_name.split('.')[-1]}" - ) + prev_file_name = f"prev_version_{instance.resource.id}.{instance.file.name.split('.')[-1]}" prev_file_path = os.path.join(temp_dir, prev_file_name) # Use DVC to get the previous version diff --git a/api/schema/access_model_schema.py b/api/schema/access_model_schema.py index d5dece23..84ed8b09 100644 --- a/api/schema/access_model_schema.py +++ b/api/schema/access_model_schema.py @@ -1,5 +1,3 @@ -# mypy: disable-error-code="valid-type" - import uuid from enum import Enum from typing import Any, Dict, List, Optional, Union @@ -51,7 +49,9 @@ class EditAccessModelInput: @strawberry.type(name="Query") class Query: @strawberry_django.field - def access_model_resources(self, info: Info, dataset_id: uuid.UUID) -> List[TypeAccessModel]: + def access_model_resources( + self, info: Info, dataset_id: uuid.UUID + ) -> List[TypeAccessModel]: models = AccessModel.objects.filter(dataset_id=dataset_id) return [TypeAccessModel.from_django(model) for model in models] @@ -88,12 +88,16 @@ def _add_update_access_model_resources( try: dataset_resource = Resource.objects.get(id=resource_input.resource) except Resource.DoesNotExist as e: - raise ValueError(f"Resource with ID {resource_input.resource} does not exist.") + raise ValueError( + f"Resource with ID {resource_input.resource} does not exist." + ) access_model_resource = AccessModelResource.objects.create( access_model=access_model, resource=dataset_resource ) - _add_resource_fields(access_model_resource, dataset_resource, resource_input.fields) + _add_resource_fields( + access_model_resource, dataset_resource, resource_input.fields + ) def _update_access_model_fields( @@ -118,13 +122,15 @@ def create_access_model( try: dataset = Dataset.objects.get(id=access_model_input.dataset) except Dataset.DoesNotExist: - raise ValueError(f"Dataset with ID {access_model_input.dataset} does not exist.") + raise ValueError( + f"Dataset with ID {access_model_input.dataset} does not exist." + ) access_model = AccessModel.objects.create( dataset=dataset, name=access_model_input.name, description=access_model_input.description, - type=access_model_input.type.value, # type: ignore[attr-defined] + type=access_model_input.type.value, ) _update_access_model_fields(access_model, access_model_input) @@ -139,11 +145,15 @@ def edit_access_model( try: dataset = Dataset.objects.get(id=access_model_input.dataset) except Dataset.DoesNotExist as e: - raise ValueError(f"Dataset with ID {access_model_input.dataset} does not exist.") + raise ValueError( + f"Dataset with ID {access_model_input.dataset} does not exist." + ) access_model = AccessModel.objects.create(dataset=dataset) else: try: - access_model = AccessModel.objects.get(id=access_model_input.access_model_id) + access_model = AccessModel.objects.get( + id=access_model_input.access_model_id + ) except AccessModel.DoesNotExist as e: raise ValueError( f"Access Model with ID {access_model_input.access_model_id} does not exist." diff --git a/api/schema/aimodel_schema.py b/api/schema/aimodel_schema.py index 0b059a45..f2075a22 100644 --- a/api/schema/aimodel_schema.py +++ b/api/schema/aimodel_schema.py @@ -1,9 +1,9 @@ """GraphQL schema for AI Model.""" -# mypy: disable-error-code="union-attr,misc,valid-type" +# mypy: disable-error-code="union-attr,misc" import datetime -from typing import Any, Dict, List, Optional +from typing import List, Optional import strawberry import strawberry_django @@ -419,14 +419,14 @@ def create_ai_model( description = input.description or "" # Prepare supported_languages - supported_languages: List[str] = input.supported_languages or [] + supported_languages = input.supported_languages or [] # Prepare schemas - input_schema: Any = input.input_schema or {} - output_schema: Any = input.output_schema or {} + input_schema = input.input_schema or {} + output_schema = input.output_schema or {} # Prepare metadata - metadata: Any = input.metadata or {} + metadata = input.metadata or {} try: model = AIModel.objects.create( @@ -802,7 +802,7 @@ def create_ai_model_version( ai_model=model, version=input.version, version_notes=input.version_notes or "", - lifecycle_stage=input.lifecycle_stage.value if input.lifecycle_stage else "DEVELOPMENT", # type: ignore[attr-defined] + lifecycle_stage=input.lifecycle_stage.value if input.lifecycle_stage else "DEVELOPMENT", # type: ignore[misc] supports_streaming=input.supports_streaming, max_tokens=input.max_tokens, supported_languages=input.supported_languages or [], diff --git a/api/schema/dataset_schema.py b/api/schema/dataset_schema.py index d70d1cb2..e0adc049 100644 --- a/api/schema/dataset_schema.py +++ b/api/schema/dataset_schema.py @@ -1,5 +1,4 @@ # mypy: disable-error-code=union-attr -# mypy: disable-error-code=valid-type import datetime import uuid from typing import Any, List, Optional, Union diff --git a/api/schema/resource_chart_schema.py b/api/schema/resource_chart_schema.py index bf8d04da..acb34b78 100644 --- a/api/schema/resource_chart_schema.py +++ b/api/schema/resource_chart_schema.py @@ -1,5 +1,3 @@ -# mypy: disable-error-code=valid-type - import datetime import uuid from typing import Any, Dict, List, Optional @@ -21,12 +19,16 @@ @strawberry.type(name="Query") class Query: @strawberry_django.field - def charts_details(self, info: Info, dataset_id: uuid.UUID) -> List[TypeResourceChart]: + def charts_details( + self, info: Info, dataset_id: uuid.UUID + ) -> List[TypeResourceChart]: charts = ResourceChartDetails.objects.filter(resource__dataset_id=dataset_id) return [TypeResourceChart.from_django(chart) for chart in charts] @strawberry_django.field - def resource_chart(self, info: Info, chart_details_id: uuid.UUID) -> TypeResourceChart: + def resource_chart( + self, info: Info, chart_details_id: uuid.UUID + ) -> TypeResourceChart: chart = ResourceChartDetails.objects.get(id=chart_details_id) return TypeResourceChart.from_django(chart) @@ -186,10 +188,14 @@ def _update_chart_fields( if value: # Only process if list is not empty options[field_name] = [ { - "field": ResourceSchema.objects.get(id=column.field_name), + "field": ResourceSchema.objects.get( + id=column.field_name + ), "label": column.label, "color": column.color, - "value_mapping": _update_value_mapping(column.value_mapping), + "value_mapping": _update_value_mapping( + column.value_mapping + ), } for column in value ] @@ -289,7 +295,9 @@ def create_resource_chart( ) ], ) - def edit_resource_chart(self, info: Info, chart_input: ResourceChartInput) -> TypeResourceChart: + def edit_resource_chart( + self, info: Info, chart_input: ResourceChartInput + ) -> TypeResourceChart: if not chart_input.chart_id: chart = ResourceChartDetails() else: diff --git a/api/schema/resource_schema.py b/api/schema/resource_schema.py index 85fbc965..06414f71 100644 --- a/api/schema/resource_schema.py +++ b/api/schema/resource_schema.py @@ -3,7 +3,6 @@ from enum import Enum # mypy: disable-error-code=operator -# mypy: disable-error-code=valid-type from typing import List, Optional import strawberry @@ -112,7 +111,9 @@ class Query: @strawberry_django.field @trace_resolver(name="get_dataset_resources", attributes={"component": "resource"}) - def dataset_resources(self, info: Info, dataset_id: uuid.UUID) -> List[TypeResource]: + def dataset_resources( + self, info: Info, dataset_id: uuid.UUID + ) -> List[TypeResource]: """Get resources for a dataset.""" resources = Resource.objects.filter(dataset_id=dataset_id) return [TypeResource.from_django(resource) for resource in resources] @@ -161,17 +162,23 @@ def _reset_file_resource_schema(resource: Resource) -> None: data_table = index_resource_data(resource) -def _update_file_resource_schema(resource: Resource, updated_schema: List[SchemaUpdate]) -> None: +def _update_file_resource_schema( + resource: Resource, updated_schema: List[SchemaUpdate] +) -> None: """Update file resource schema and re-index if necessary.""" # Check if we need to re-index after schema update format_changes = False # Update schema fields - existing_schema: QuerySet[ResourceSchema] = ResourceSchema.objects.filter(resource=resource) + existing_schema: QuerySet[ResourceSchema] = ResourceSchema.objects.filter( + resource=resource + ) for schema in existing_schema: # type: ResourceSchema try: - schema_change = next(item for item in updated_schema if item.id == str(schema.id)) + schema_change = next( + item for item in updated_schema if item.id == str(schema.id) + ) # Check if format is changing, which might require re-indexing if schema.format != schema_change.format.value: format_changes = True @@ -181,7 +188,9 @@ def _update_file_resource_schema(resource: Resource, updated_schema: List[Schema schema.format = schema_change.format.value schema.save() - logger.info(f"Updated schema field {schema.field_name} for resource {resource.id}") + logger.info( + f"Updated schema field {schema.field_name} for resource {resource.id}" + ) except StopIteration: continue @@ -201,8 +210,12 @@ def _update_resource_preview_details( if file_resource_input.preview_details: # If preview_details already exists, update it if preview_details: - preview_details.is_all_entries = file_resource_input.preview_details.is_all_entries - preview_details.start_entry = file_resource_input.preview_details.start_entry + preview_details.is_all_entries = ( + file_resource_input.preview_details.is_all_entries + ) + preview_details.start_entry = ( + file_resource_input.preview_details.start_entry + ) preview_details.end_entry = file_resource_input.preview_details.end_entry preview_details.save() # Otherwise, create a new one @@ -247,9 +260,9 @@ def create_file_resources( raise ValueError(f"Dataset with ID {dataset_id} does not exist.") for file in file_resource_input.files: - resource = Resource.objects.create(name=file.name, dataset=dataset) # type: ignore[attr-defined] + resource = Resource.objects.create(name=file.name, dataset=dataset) ResourceFileDetails.objects.create( - file=file, size=file.size, resource=resource # type: ignore[attr-defined] + file=file, size=file.size, resource=resource ) _validate_file_details_and_update_format(resource) _create_file_resource_schema(resource) @@ -292,7 +305,11 @@ def create_file_resource( "resource_id": str(result.id), "resource_name": result.name, "updated_fields": { - "name": (file_resource_input.name if file_resource_input.name else None), + "name": ( + file_resource_input.name + if file_resource_input.name + else None + ), "description": ( file_resource_input.description if file_resource_input.description is not None @@ -300,7 +317,8 @@ def create_file_resource( ), "preview_enabled": file_resource_input.preview_enabled, "file_updated": file_resource_input.file is not None, - "preview_details_updated": file_resource_input.preview_details is not None, + "preview_details_updated": file_resource_input.preview_details + is not None, }, }, ) @@ -314,7 +332,9 @@ def update_file_resource( try: resource = Resource.objects.get(id=file_resource_input.id) except Resource.DoesNotExist as e: - raise ValueError(f"Resource with ID {file_resource_input.id} does not exist.") + raise ValueError( + f"Resource with ID {file_resource_input.id} does not exist." + ) if file_resource_input.name: resource.name = file_resource_input.name @@ -327,12 +347,12 @@ def update_file_resource( file_details = getattr(resource, "resourcefiledetails", None) if file_details: file_details.file = file_resource_input.file - file_details.size = file_resource_input.file.size # type: ignore[attr-defined] + file_details.size = file_resource_input.file.size file_details.save() else: ResourceFileDetails.objects.create( file=file_resource_input.file, - size=file_resource_input.file.size, # type: ignore[attr-defined] + size=file_resource_input.file.size, resource=resource, ) _validate_file_details_and_update_format(resource) @@ -344,7 +364,9 @@ def update_file_resource( return TypeResource.from_django(resource) @strawberry_django.mutation(handle_django_errors=True) - @trace_resolver(name="update_file_resource_schema", attributes={"component": "resource"}) + @trace_resolver( + name="update_file_resource_schema", attributes={"component": "resource"} + ) def update_file_resource_schema( self, info: Info, schema_update_input: SchemaUpdateInput ) -> TypeResource: @@ -352,14 +374,20 @@ def update_file_resource_schema( try: resource = Resource.objects.get(id=schema_update_input.resource) except Resource.DoesNotExist as e: - raise ValueError(f"Resource with ID {schema_update_input.resource} does not exist.") + raise ValueError( + f"Resource with ID {schema_update_input.resource} does not exist." + ) _update_file_resource_schema(resource, schema_update_input.updates) return TypeResource.from_django(resource) @strawberry_django.mutation(handle_django_errors=True) - @trace_resolver(name="reset_file_resource_schema", attributes={"component": "resource"}) - def reset_file_resource_schema(self, info: Info, resource_id: uuid.UUID) -> TypeResource: + @trace_resolver( + name="reset_file_resource_schema", attributes={"component": "resource"} + ) + def reset_file_resource_schema( + self, info: Info, resource_id: uuid.UUID + ) -> TypeResource: """Reset file resource schema.""" try: resource = Resource.objects.get(id=resource_id) @@ -406,7 +434,9 @@ def delete_file_resource(self, info: Info, resource_id: uuid.UUID) -> bool: ], ) @trace_resolver(name="create_major_version", attributes={"component": "resource"}) - def create_major_version(self, info: Info, input: CreateMajorVersionInput) -> TypeResource: + def create_major_version( + self, info: Info, input: CreateMajorVersionInput + ) -> TypeResource: """Create a major version for a resource. This should be used when significant changes are made to the resource data structure, @@ -432,7 +462,9 @@ def create_major_version(self, info: Info, input: CreateMajorVersionInput) -> Ty new_version = "v1.0.0" else: # Increment major version - new_version = _increment_version(last_version.version_number, increment_type="major") + new_version = _increment_version( + last_version.version_number, increment_type="major" + ) # Initialize DVC manager dvc = DVCManager(settings.DVC_REPO_PATH) diff --git a/api/schema/usecase_schema.py b/api/schema/usecase_schema.py index a4c905f9..e49b829b 100644 --- a/api/schema/usecase_schema.py +++ b/api/schema/usecase_schema.py @@ -1,7 +1,6 @@ """Schema definitions for use cases.""" # mypy: disable-error-code=operator -# mypy: disable-error-code=valid-type import datetime import uuid @@ -169,14 +168,18 @@ def published_use_cases( return TypeUseCase.from_django_list(results) @strawberry_django.field - @trace_resolver(name="get_datasets_by_use_case", attributes={"component": "usecase"}) + @trace_resolver( + name="get_datasets_by_use_case", attributes={"component": "usecase"} + ) def dataset_by_use_case(self, info: Info, use_case_id: str) -> list[TypeDataset]: """Get datasets by use case.""" queryset = Dataset.objects.filter(usecase__id=use_case_id) return TypeDataset.from_django_list(queryset) @strawberry_django.field - @trace_resolver(name="get_contributors_by_use_case", attributes={"component": "usecase"}) + @trace_resolver( + name="get_contributors_by_use_case", attributes={"component": "usecase"} + ) def contributors_by_use_case(self, info: Info, use_case_id: str) -> list[TypeUser]: """Get contributors by use case.""" try: @@ -191,7 +194,9 @@ def contributors_by_use_case(self, info: Info, use_case_id: str) -> list[TypeUse def _update_usecase_tags(usecase: UseCase, tags: List[str]) -> None: usecase.tags.clear() for tag in tags: - usecase.tags.add(Tag.objects.get_or_create(defaults={"value": tag}, value__iexact=tag)[0]) + usecase.tags.add( + Tag.objects.get_or_create(defaults={"value": tag}, value__iexact=tag)[0] + ) usecase.save() @@ -235,7 +240,9 @@ def _add_update_usecase_metadata( metadata_field = Metadata.objects.get(id=metadata_input_item.id) if not metadata_field.enabled: _delete_existing_metadata(usecase) - raise ValueError(f"Metadata with ID {metadata_input_item.id} is not enabled.") + raise ValueError( + f"Metadata with ID {metadata_input_item.id} is not enabled." + ) uc_metadata = UseCaseMetadata( usecase=usecase, metadata_item=metadata_field, @@ -244,7 +251,9 @@ def _add_update_usecase_metadata( uc_metadata.save() except Metadata.DoesNotExist: _delete_existing_metadata(usecase) - raise ValueError(f"Metadata with ID {metadata_input_item.id} does not exist.") + raise ValueError( + f"Metadata with ID {metadata_input_item.id} does not exist." + ) @trace_resolver(name="delete_existing_metadata", attributes={"component": "usecase"}) @@ -320,7 +329,10 @@ def add_use_case(self, info: Info) -> TypeUseCase: else None ), "sectors": ( - [str(sector_id) for sector_id in update_metadata_input.sectors] + [ + str(sector_id) + for sector_id in update_metadata_input.sectors + ] if update_metadata_input.sectors else [] ), @@ -383,7 +395,10 @@ def update_use_case(self, info: Info, data: UseCaseInputPartial) -> TypeUseCase: usecase.started_on = data.started_on if data.completed_on is not None and data.completed_on is not strawberry.UNSET: usecase.completed_on = data.completed_on - if data.running_status is not None and data.running_status is not strawberry.UNSET: + if ( + data.running_status is not None + and data.running_status is not strawberry.UNSET + ): usecase.running_status = data.running_status if data.logo is not None and data.logo is not strawberry.UNSET: usecase.logo = data.logo @@ -395,7 +410,9 @@ def update_use_case(self, info: Info, data: UseCaseInputPartial) -> TypeUseCase: extensions=[ TrackActivity( verb="deleted", - get_data=lambda info, use_case_id, **kwargs: {"usecase_id": use_case_id}, + get_data=lambda info, use_case_id, **kwargs: { + "usecase_id": use_case_id + }, ) ], ) @@ -593,7 +610,9 @@ def remove_contributor_from_use_case( get_data=lambda result, use_case_id, user_ids, **kwargs: { "usecase_id": use_case_id, "usecase_title": result.title, - "updated_fields": {"contributors": [str(user_id) for user_id in user_ids]}, + "updated_fields": { + "contributors": [str(user_id) for user_id in user_ids] + }, }, ) ], diff --git a/api/types/type_aimodel.py b/api/types/type_aimodel.py index 60c7c7b7..7870d8cb 100644 --- a/api/types/type_aimodel.py +++ b/api/types/type_aimodel.py @@ -1,5 +1,3 @@ -# mypy: disable-error-code="valid-type" - """GraphQL types for AI Model.""" import uuid diff --git a/api/types/type_collaborative.py b/api/types/type_collaborative.py index 89a3cf9e..11805ad4 100644 --- a/api/types/type_collaborative.py +++ b/api/types/type_collaborative.py @@ -1,5 +1,3 @@ -# mypy: disable-error-code="valid-type" - from typing import List, Optional import strawberry @@ -67,7 +65,9 @@ class TypeCollaborative(BaseType): description="URL of the platform where this collaborative is published" ) - @strawberry.field(description="Check if this collaborative is created by an individual user.") + @strawberry.field( + description="Check if this collaborative is created by an individual user." + ) def is_individual_collaborative(self) -> bool: """Check if this collaborative is created by an individual user.""" return self.organization is None @@ -106,7 +106,9 @@ def use_cases(self) -> Optional[List["TypeUseCase"]]: except Exception: return [] - @strawberry.field(description="Get the count of datasets associated with this collaborative.") + @strawberry.field( + description="Get the count of datasets associated with this collaborative." + ) def dataset_count(self: "TypeCollaborative", info: Info) -> int: """Get the count of datasets associated with this collaborative.""" try: @@ -114,7 +116,9 @@ def dataset_count(self: "TypeCollaborative", info: Info) -> int: except Exception: return 0 - @strawberry.field(description="Get the count of use cases associated with this collaborative.") + @strawberry.field( + description="Get the count of use cases associated with this collaborative." + ) def use_case_count(self: "TypeCollaborative", info: Info) -> int: """Get the count of use cases associated with this collaborative.""" try: @@ -177,7 +181,9 @@ def metadata(self) -> Optional[List["TypeCollaborativeMetadata"]]: except Exception: return [] - @strawberry.field(description="Get contributors associated with this collaborative.") + @strawberry.field( + description="Get contributors associated with this collaborative." + ) def contributors(self) -> Optional[List["TypeUser"]]: """Get contributors associated with this collaborative.""" try: @@ -203,7 +209,9 @@ def organization_relationships( except Exception: return [] - @strawberry.field(description="Get supporting organizations for this collaborative.") + @strawberry.field( + description="Get supporting organizations for this collaborative." + ) def supporting_organizations(self) -> Optional[List["TypeOrganization"]]: """Get supporting organizations for this collaborative.""" try: diff --git a/api/types/type_collaborative_organization.py b/api/types/type_collaborative_organization.py index e6cfecfe..662430f7 100644 --- a/api/types/type_collaborative_organization.py +++ b/api/types/type_collaborative_organization.py @@ -1,5 +1,3 @@ -# mypy: disable-error-code="valid-type" - """GraphQL type for UseCase-Organization relationship.""" from typing import Optional diff --git a/api/types/type_metadata.py b/api/types/type_metadata.py index 132cc598..304b110a 100644 --- a/api/types/type_metadata.py +++ b/api/types/type_metadata.py @@ -1,5 +1,3 @@ -# mypy: disable-error-code="valid-type" - from enum import Enum from typing import List, Optional diff --git a/api/types/type_resource_chart.py b/api/types/type_resource_chart.py index 51e4161b..45e62485 100644 --- a/api/types/type_resource_chart.py +++ b/api/types/type_resource_chart.py @@ -1,5 +1,3 @@ -# mypy: disable-error-code="valid-type" - import json import uuid from datetime import datetime @@ -316,7 +314,9 @@ def chart_options(self) -> Optional[ChartOptionsType]: point_size=options_dict.get("point_size"), # Geospatial Map Chart options geospatial_field=( - ensure_type(options_dict.get("geospatial_field"), TypeResourceSchema) + ensure_type( + options_dict.get("geospatial_field"), TypeResourceSchema + ) if options_dict.get("geospatial_field") else None ), @@ -332,7 +332,9 @@ def chart_filters(self) -> List[FilterType]: return [ FilterType( column=( - ensure_type(f["column"], TypeResourceSchema) if f.get("column") else None + ensure_type(f["column"], TypeResourceSchema) + if f.get("column") + else None ), operator=f["operator"], value=f["value"], @@ -350,7 +352,9 @@ def chart(self, info: Info) -> Optional[ChartConfig]: return None # Convert chart to JSON-serializable format - chart_options = chart_instance.dump_options_with_quotes() if chart_instance else None + chart_options = ( + chart_instance.dump_options_with_quotes() if chart_instance else None + ) if not chart_options: return None diff --git a/api/types/type_resource_chart_image.py b/api/types/type_resource_chart_image.py index b0d87623..d0cf6acb 100644 --- a/api/types/type_resource_chart_image.py +++ b/api/types/type_resource_chart_image.py @@ -1,5 +1,3 @@ -# mypy: disable-error-code="valid-type" - from typing import Optional import strawberry diff --git a/api/types/type_usecase.py b/api/types/type_usecase.py index 96fd0d92..821379ea 100644 --- a/api/types/type_usecase.py +++ b/api/types/type_usecase.py @@ -1,5 +1,3 @@ -# mypy: disable-error-code="valid-type" - from typing import List, Optional import strawberry @@ -66,7 +64,9 @@ class TypeUseCase(BaseType): description="URL of the platform where this use case is published" ) - @strawberry.field(description="Check if this use case is created by an individual user.") + @strawberry.field( + description="Check if this use case is created by an individual user." + ) def is_individual_usecase(self) -> bool: """Check if this use case is created by an individual user.""" return self.organization is None @@ -83,7 +83,9 @@ def datasets(self) -> Optional[List["TypeDataset"]]: except Exception: return [] - @strawberry.field(description="Get the count of datasets associated with this use case.") + @strawberry.field( + description="Get the count of datasets associated with this use case." + ) def dataset_count(self: "TypeUseCase", info: Info) -> int: """Get the count of datasets associated with this use case.""" try: @@ -168,7 +170,9 @@ def contributors(self) -> Optional[List["TypeUser"]]: except Exception: return [] - @strawberry.field(description="Get organization relationships associated with this use case.") + @strawberry.field( + description="Get organization relationships associated with this use case." + ) def organization_relationships( self, ) -> Optional[List["TypeUseCaseOrganizationRelationship"]]: @@ -215,7 +219,9 @@ def partner_organizations(self) -> Optional[List["TypeOrganization"]]: except Exception: return [] - @strawberry.field(description="Get Usecase dashboard associated with this use case.") + @strawberry.field( + description="Get Usecase dashboard associated with this use case." + ) def usecase_dashboard(self) -> Optional[List["TypeUseCaseDashboard"]]: """Get Usecase dashboard associated with this use case.""" try: diff --git a/api/types/type_usecase_organization.py b/api/types/type_usecase_organization.py index 2f7fd2f2..2430dc9e 100644 --- a/api/types/type_usecase_organization.py +++ b/api/types/type_usecase_organization.py @@ -1,5 +1,3 @@ -# mypy: disable-error-code="valid-type" - """GraphQL type for UseCase-Organization relationship.""" from typing import Optional diff --git a/api/urls.py b/api/urls.py index c0bb1d65..6d67c407 100644 --- a/api/urls.py +++ b/api/urls.py @@ -11,7 +11,6 @@ aimodel_execution, auditor, auth, - dataset_data, download, generate_dynamic_chart, search_aimodel, @@ -78,21 +77,6 @@ trending_datasets.TrendingDatasets.as_view(), name="trending_datasets", ), - path( - "resources//data/", - dataset_data.ResourceDataView.as_view(), - name="resource_data", - ), - path( - "datasets//data/", - dataset_data.DatasetDataView.as_view(), - name="dataset_data", - ), - path( - "datasets//prompts/", - dataset_data.PromptDatasetDataView.as_view(), - name="prompt_dataset_data", - ), # Single, simple GraphQL endpoint with no redirects path( "graphql", diff --git a/api/utils/data_indexing.py b/api/utils/data_indexing.py index 8e817b88..ce30f0ca 100644 --- a/api/utils/data_indexing.py +++ b/api/utils/data_indexing.py @@ -1,10 +1,9 @@ -from typing import Any, Dict, Generator, List, Optional, Tuple +from typing import Any, Dict, Generator, Optional import pandas as pd import structlog from django.db import connections, transaction from django.db.utils import ProgrammingError -from psycopg2 import sql as pg_sql # type: ignore[import-untyped] from api.models.Resource import Resource, ResourceDataTable from api.models.ResourceSchema import ResourceSchema @@ -16,27 +15,6 @@ # Use a separate database for data tables DATA_DB = "data_db" # This should match the connection name in settings.py -# Allowed comparison operators for column-based filtering on indexed data. -# Maps operator suffix -> (sql_template_with_{ph}_placeholder, value_transformer) -_FILTER_OPERATORS: Dict[str, Tuple[str, Any]] = { - "eq": ("= %s", lambda v: v), - "ne": ("<> %s", lambda v: v), - "gt": ("> %s", lambda v: v), - "gte": (">= %s", lambda v: v), - "lt": ("< %s", lambda v: v), - "lte": ("<= %s", lambda v: v), - "in": ("= ANY(%s)", lambda v: list(v) if not isinstance(v, list) else v), - "nin": ("<> ALL(%s)", lambda v: list(v) if not isinstance(v, list) else v), - "contains": ("LIKE %s", lambda v: f"%{v}%"), - "icontains": ("ILIKE %s", lambda v: f"%{v}%"), - "startswith": ("LIKE %s", lambda v: f"{v}%"), - "istartswith": ("ILIKE %s", lambda v: f"{v}%"), - "endswith": ("LIKE %s", lambda v: f"%{v}"), - "iendswith": ("ILIKE %s", lambda v: f"%{v}"), - "isnull": ("IS NULL", None), # value ignored - "notnull": ("IS NOT NULL", None), -} - def get_sql_type(pandas_dtype: str) -> str: """Convert pandas dtype to SQL type.""" @@ -454,203 +432,3 @@ def get_preview_data(resource: Resource) -> Optional[PreviewData]: f"Error getting preview data for resource {resource.id}: {str(e)}, traceback: {traceback.format_exc()}" ) return None - - -# Maximum rows that can be returned in a single fetch_resource_data call -MAX_FETCH_LIMIT = 10000 -DEFAULT_FETCH_LIMIT = 100 - - -class DataFetchError(Exception): - """Raised when fetch_resource_data receives invalid input.""" - - -def get_resource_columns(resource: Resource) -> List[str]: - """Return the list of indexed column names for a resource. - - Falls back to inspecting the data_db table if no ResourceSchema rows exist. - """ - cols = list( - ResourceSchema.objects.filter(resource=resource).values_list("field_name", flat=True) - ) - if cols: - return cols - # Fallback: introspect the table directly - try: - data_table = ResourceDataTable.objects.get(resource=resource) - with connections[DATA_DB].cursor() as cursor: - cursor.execute( - "SELECT column_name FROM information_schema.columns " - "WHERE table_name = %s ORDER BY ordinal_position", - [data_table.table_name], - ) - return [row[0] for row in cursor.fetchall()] - except ResourceDataTable.DoesNotExist: - return [] - - -def _parse_filter_key(key: str) -> Tuple[str, str]: - """Split 'col__op' style filter key into (column, op). Defaults op to 'eq'.""" - if "__" in key: - col, op = key.rsplit("__", 1) - if op not in _FILTER_OPERATORS: - # No valid operator suffix — treat full key as column with eq - return key, "eq" - return col, op - return key, "eq" - - -def _build_where_clause( - filters: Dict[str, Any], allowed_columns: List[str] -) -> Tuple[pg_sql.Composable, List[Any]]: - """Build a parameterized WHERE clause from a filters dict. - - Filters are of the form ``{"column": value}`` for equality, or - ``{"column__op": value}`` for other operators. Unknown columns are rejected. - """ - if not filters: - return pg_sql.SQL(""), [] - - allowed_set = set(allowed_columns) - clauses: List[pg_sql.Composable] = [] - params: List[Any] = [] - - for raw_key, value in filters.items(): - col, op = _parse_filter_key(raw_key) - if col not in allowed_set: - raise DataFetchError(f"Unknown filter column: {col}") - op_template, transformer = _FILTER_OPERATORS[op] - - col_ident = pg_sql.Identifier(col) - if op in ("isnull", "notnull"): - # Boolean toggle: isnull=true means IS NULL, isnull=false means IS NOT NULL - truthy = value not in (False, "false", "False", 0, "0", None) - sql_op = "IS NULL" if (op == "isnull") == truthy else "IS NOT NULL" - clauses.append(pg_sql.SQL("{col} {op}").format(col=col_ident, op=pg_sql.SQL(sql_op))) - continue - - # Compose: (where op_template contains %s placeholders) - clauses.append(pg_sql.SQL("{col} ").format(col=col_ident) + pg_sql.SQL(op_template)) - params.append(transformer(value) if transformer else value) - - where_sql = pg_sql.SQL(" WHERE ") + pg_sql.SQL(" AND ").join(clauses) - return where_sql, params - - -def _build_order_by(order_by: Optional[List[str]], allowed_columns: List[str]) -> pg_sql.Composable: - """Build a parameterised ORDER BY clause. Each entry may be 'col' or '-col'.""" - if not order_by: - return pg_sql.SQL("") - allowed_set = set(allowed_columns) - parts: List[pg_sql.Composable] = [] - for item in order_by: - direction = "ASC" - col = item - if item.startswith("-"): - direction = "DESC" - col = item[1:] - elif item.startswith("+"): - col = item[1:] - if col not in allowed_set: - raise DataFetchError(f"Unknown order_by column: {col}") - parts.append( - pg_sql.SQL("{col} ").format(col=pg_sql.Identifier(col)) + pg_sql.SQL(direction) - ) - return pg_sql.SQL(" ORDER BY ") + pg_sql.SQL(", ").join(parts) - - -def fetch_resource_data( - resource: Resource, - filters: Optional[Dict[str, Any]] = None, - columns: Optional[List[str]] = None, - limit: int = DEFAULT_FETCH_LIMIT, - offset: int = 0, - order_by: Optional[List[str]] = None, - count: bool = True, -) -> Dict[str, Any]: - """Fetch indexed data for a Resource from data_db with column-level filtering. - - Returns a dict:: - - { - "columns": [...], # selected column names - "rows": [[...], ...], # list of rows (one list per row) - "total": , # total matching rows (None if count=False) - "limit": , - "offset": , - } - - Args: - resource: The Resource whose indexed data should be fetched. - filters: Optional dict of ``{"col": val}`` or ``{"col__op": val}`` filters. - columns: Optional list of columns to project. Defaults to all columns. - limit: Max rows to return (capped at MAX_FETCH_LIMIT). - offset: Number of rows to skip. - order_by: Optional list of columns; prefix with ``-`` for DESC. - count: When True (default) also returns the total matching row count. - - Raises: - DataFetchError: If the resource has no indexed data, or filters/columns - reference unknown columns. - """ - try: - data_table = ResourceDataTable.objects.get(resource=resource) - except ResourceDataTable.DoesNotExist: - raise DataFetchError(f"Resource {resource.id} has no indexed data table") - - allowed_columns = get_resource_columns(resource) - if not allowed_columns: - raise DataFetchError(f"Resource {resource.id} has no schema/columns available") - - # Validate and resolve projected columns - if columns: - unknown = [c for c in columns if c not in allowed_columns] - if unknown: - raise DataFetchError(f"Unknown columns: {unknown}") - select_columns = columns - else: - select_columns = allowed_columns - - # Clamp pagination - if limit is None or limit <= 0: - limit = DEFAULT_FETCH_LIMIT - limit = min(int(limit), MAX_FETCH_LIMIT) - offset = max(int(offset or 0), 0) - - table_ident = pg_sql.Identifier(data_table.table_name) - cols_sql = pg_sql.SQL(", ").join(pg_sql.Identifier(c) for c in select_columns) - where_sql, params = _build_where_clause(filters or {}, allowed_columns) - order_sql = _build_order_by(order_by, allowed_columns) - - select_query = ( - pg_sql.SQL("SELECT ") - + cols_sql - + pg_sql.SQL(" FROM ") - + table_ident - + where_sql - + order_sql - + pg_sql.SQL(" LIMIT %s OFFSET %s") - ) - - total: Optional[int] = None - with connections[DATA_DB].cursor() as cursor: - # Safety: cap query time - cursor.execute("SET statement_timeout = 10000") - - if count: - count_query = pg_sql.SQL("SELECT COUNT(*) FROM ") + table_ident + where_sql - cursor.execute(count_query, params) - row = cursor.fetchone() - total = int(row[0]) if row else 0 - - cursor.execute(select_query, params + [limit, offset]) - result_columns = [desc[0] for desc in cursor.description] - rows = [list(r) for r in cursor.fetchall()] - - return { - "columns": result_columns, - "rows": rows, - "total": total, - "limit": limit, - "offset": offset, - } diff --git a/api/utils/keycloak_utils.py b/api/utils/keycloak_utils.py index f6937d71..15cd6fda 100644 --- a/api/utils/keycloak_utils.py +++ b/api/utils/keycloak_utils.py @@ -117,11 +117,7 @@ def validate_token(self, token: str) -> Dict[str, Any]: # If that fails (403), fall back to token introspection data try: user_info = self.keycloak_openid.userinfo(token) - if isinstance(user_info, bytes): - import json - - user_info = json.loads(user_info.decode("utf-8")) - return user_info # type: ignore[return-value] + return user_info except KeycloakError as userinfo_error: # If userinfo fails (e.g., 403), extract user info from token introspection logger.warning( diff --git a/api/views/dataset_data.py b/api/views/dataset_data.py deleted file mode 100644 index f6f0b6a7..00000000 --- a/api/views/dataset_data.py +++ /dev/null @@ -1,359 +0,0 @@ -"""HTTP endpoints for fetching indexed dataset/resource data from data_db. - -Endpoints: - -- ``GET /api/resources//data/`` — fetch indexed data for a single - resource with column-based filtering. -- ``GET /api/datasets//data/`` — fetch indexed data for a dataset. - By default operates on the dataset's first indexed resource. Pass - ``?resource_id=`` to target a specific resource. -- ``GET /api/datasets//prompts/`` — fetch indexed data for a - PromptDataset, restricted to ``dataset_type=PROMPT`` and exposing extra - prompt-specific filter shorthands. - -All endpoints accept these query params: - -- ``columns`` — comma-separated list of columns to project. -- ``limit`` (default 100, max 10000), ``offset`` (default 0). -- ``order_by`` — comma-separated columns; prefix with ``-`` for DESC. -- ``count`` — ``true``/``false`` (default ``true``) to include total row count. -- Any other query param is interpreted as a data-column filter, optionally - with operator suffix, e.g. ``?price__gte=10&category=books``. Repeated keys - produce a list (used naturally for ``__in``/``__nin``). -""" - -import uuid -from typing import Any, Dict, List, Optional, Tuple - -import structlog -from django.http import HttpRequest -from rest_framework.permissions import AllowAny -from rest_framework.request import Request -from rest_framework.response import Response -from rest_framework.views import APIView - -from api.models import Dataset, Resource, ResourceDataTable -from api.models.PromptDataset import PromptDataset -from api.utils.data_indexing import ( - DEFAULT_FETCH_LIMIT, - MAX_FETCH_LIMIT, - DataFetchError, - fetch_resource_data, - get_resource_columns, -) -from api.utils.enums import DatasetStatus, DatasetType - -logger = structlog.get_logger(__name__) - -# Reserved query parameters that are NOT treated as column filters. -_RESERVED_PARAMS = { - "columns", - "limit", - "offset", - "order_by", - "count", - "resource_id", - "format", -} - - -def _parse_bool(value: Any, default: bool = False) -> bool: - if value is None: - return default - if isinstance(value, bool): - return value - return str(value).strip().lower() in {"1", "true", "yes", "y", "on"} - - -def _parse_int(value: Any, default: int) -> int: - try: - return int(value) - except (TypeError, ValueError): - return default - - -def _parse_csv(value: Optional[str]) -> Optional[List[str]]: - if not value: - return None - parts = [p.strip() for p in value.split(",") if p.strip()] - return parts or None - - -def _extract_filters(query_params: Any, reserved: Optional[set] = None) -> Dict[str, Any]: - """Pull non-reserved query params as filter dict. - - Repeated keys collapse into lists so callers can use - ``?col__in=a&col__in=b``. ``__in``/``__nin`` always produce a list, even - for a single value. - """ - reserved_set = reserved if reserved is not None else _RESERVED_PARAMS - filters: Dict[str, Any] = {} - # query_params is a QueryDict; use .lists() if available - if hasattr(query_params, "lists"): - items = query_params.lists() - else: - items = [(k, [v]) for k, v in query_params.items()] - - for key, values in items: - if key in reserved_set: - continue - if not values: - continue - op_suffix = key.rsplit("__", 1)[-1] if "__" in key else None - if op_suffix in ("in", "nin"): - # Allow comma-separated single value too - collected: List[Any] = [] - for v in values: - if isinstance(v, str) and "," in v: - collected.extend([p for p in (s.strip() for s in v.split(",")) if p]) - else: - collected.append(v) - filters[key] = collected - else: - # Last value wins for non-list operators - filters[key] = values[-1] - return filters - - -def _user_can_access_dataset(request: HttpRequest, dataset: Dataset) -> bool: - """Allow access to PUBLISHED datasets, otherwise require owner/org-member.""" - if dataset.status == DatasetStatus.PUBLISHED.value: - return True - user = getattr(request, "user", None) - if not user or not user.is_authenticated: - return False - if user.is_superuser: - return True - if dataset.user_id and dataset.user_id == user.id: - return True - if dataset.organization_id: - # Lazy import to avoid circular imports at module load - from authorization.models import OrganizationMembership - - return OrganizationMembership.objects.filter( - user=user, organization_id=dataset.organization_id - ).exists() - return False - - -def _resolve_dataset_resource( - dataset: Dataset, resource_id: Optional[str] -) -> Tuple[Optional[Resource], Optional[Response]]: - """Pick a Resource for a dataset-level data fetch. - - Returns ``(resource, error_response)`` — exactly one is non-None. - """ - if resource_id: - try: - resource = dataset.resources.get(id=resource_id) - except Resource.DoesNotExist: - return None, Response( - {"error": f"Resource {resource_id} not found in dataset {dataset.id}"}, - status=404, - ) - return resource, None - - # Default: first resource that has indexed data - indexed_table = ( - ResourceDataTable.objects.filter(resource__dataset=dataset).order_by("created").first() - ) - if indexed_table is None: - return None, Response( - { - "error": ( - "Dataset has no indexed (tabular) resources. " - "Pass ?resource_id= or upload a CSV/XLSX/Parquet/JSON file." - ) - }, - status=404, - ) - return indexed_table.resource, None - - -def _fetch_and_respond( - request: Request, - resource: Resource, - extra_filters: Optional[Dict[str, Any]] = None, - reserved: Optional[set] = None, - extra_response: Optional[Dict[str, Any]] = None, -) -> Response: - """Common path: parse query params, run fetch_resource_data, return JSON.""" - qp = request.query_params # type: ignore[attr-defined] - - columns = _parse_csv(qp.get("columns")) - order_by = _parse_csv(qp.get("order_by")) - limit = _parse_int(qp.get("limit"), DEFAULT_FETCH_LIMIT) - offset = _parse_int(qp.get("offset"), 0) - count = _parse_bool(qp.get("count"), default=True) - - filters = _extract_filters(qp, reserved=reserved) - if extra_filters: - filters.update(extra_filters) - - try: - result = fetch_resource_data( - resource=resource, - filters=filters, - columns=columns, - limit=limit, - offset=offset, - order_by=order_by, - count=count, - ) - except DataFetchError as e: - return Response({"error": str(e)}, status=400) - except Exception as e: # pragma: no cover — defensive - logger.exception( - "fetch_resource_data failed", - resource_id=str(resource.id), - error=str(e), - ) - return Response({"error": "Failed to fetch data"}, status=500) - - available = get_resource_columns(resource) - - payload: Dict[str, Any] = { - "resource_id": str(resource.id), - "dataset_id": str(resource.dataset_id), - "available_columns": available, - "max_limit": MAX_FETCH_LIMIT, - **result, - } - if extra_response: - payload.update(extra_response) - return Response(payload) - - -class ResourceDataView(APIView): - """Return indexed data for a specific resource.""" - - permission_classes = [AllowAny] - - def get(self, request: Request, resource_id: uuid.UUID) -> Response: - try: - resource = Resource.objects.select_related("dataset").get(id=resource_id) - except Resource.DoesNotExist: - return Response({"error": "Resource not found"}, status=404) - - if not _user_can_access_dataset(request, resource.dataset): # type: ignore[attr-defined] - return Response({"error": "Not authorized"}, status=403) - - return _fetch_and_respond(request, resource) - - -class DatasetDataView(APIView): - """Return indexed data for a dataset (one resource at a time).""" - - permission_classes = [AllowAny] - - def get(self, request: Request, dataset_id: uuid.UUID) -> Response: - try: - dataset = Dataset.objects.get(id=dataset_id) - except Dataset.DoesNotExist: - return Response({"error": "Dataset not found"}, status=404) - - if not _user_can_access_dataset(request, dataset): - return Response({"error": "Not authorized"}, status=403) - - resource_id = request.query_params.get("resource_id") # type: ignore[attr-defined] - resource, err = _resolve_dataset_resource(dataset, resource_id) - if err is not None: - return err - assert resource is not None - return _fetch_and_respond(request, resource) - - -class PromptDatasetDataView(APIView): - """Return indexed data for a PromptDataset. - - Same query semantics as :class:`DatasetDataView`, but the dataset must be - of type ``PROMPT``. Convenience query params (translated to column - filters when those columns exist on the data): - - - ``prompt_contains`` -> ``prompt__icontains`` - - ``response_contains`` -> ``response__icontains`` (or ``completion``) - - ``min_length``/``max_length`` -> ``length__gte``/``length__lte`` - """ - - permission_classes = [AllowAny] - - # Conventional column names we look for on prompt data tables. - _PROMPT_COL_CANDIDATES = ("prompt", "input", "instruction", "question") - _RESPONSE_COL_CANDIDATES = ("response", "completion", "answer", "output") - _LENGTH_COL_CANDIDATES = ("length", "prompt_length", "tokens", "token_count") - - def _first_present(self, available: List[str], candidates: Tuple[str, ...]) -> Optional[str]: - lower_map = {c.lower(): c for c in available} - for cand in candidates: - if cand in lower_map: - return lower_map[cand] - return None - - def get(self, request: Request, dataset_id: uuid.UUID) -> Response: - try: - prompt_dataset = PromptDataset.objects.get(dataset_ptr_id=dataset_id) - except PromptDataset.DoesNotExist: - return Response( - {"error": f"Dataset {dataset_id} is not a prompt dataset"}, - status=404, - ) - - if prompt_dataset.dataset_type != DatasetType.PROMPT.value: - return Response( - {"error": f"Dataset {dataset_id} is not a prompt dataset"}, - status=400, - ) - - if not _user_can_access_dataset(request, prompt_dataset): - return Response({"error": "Not authorized"}, status=403) - - resource_id = request.query_params.get("resource_id") # type: ignore[attr-defined] - resource, err = _resolve_dataset_resource(prompt_dataset, resource_id) - if err is not None: - return err - assert resource is not None - - # Map prompt-specific shorthands to underlying column filters - available = get_resource_columns(resource) - qp = request.query_params # type: ignore[attr-defined] - extra: Dict[str, Any] = {} - - prompt_col = self._first_present(available, self._PROMPT_COL_CANDIDATES) - response_col = self._first_present(available, self._RESPONSE_COL_CANDIDATES) - length_col = self._first_present(available, self._LENGTH_COL_CANDIDATES) - - prompt_q = qp.get("prompt_contains") - if prompt_q and prompt_col: - extra[f"{prompt_col}__icontains"] = prompt_q - - response_q = qp.get("response_contains") - if response_q and response_col: - extra[f"{response_col}__icontains"] = response_q - - min_len = qp.get("min_length") - if min_len and length_col: - extra[f"{length_col}__gte"] = min_len - - max_len = qp.get("max_length") - if max_len and length_col: - extra[f"{length_col}__lte"] = max_len - - local_reserved = _RESERVED_PARAMS | { - "prompt_contains", - "response_contains", - "min_length", - "max_length", - } - - return _fetch_and_respond( - request, - resource, - extra_filters=extra, - reserved=local_reserved, - extra_response={ - "dataset_type": prompt_dataset.dataset_type, - "prompt_column": prompt_col, - "response_column": response_col, - "length_column": length_col, - }, - ) diff --git a/dataspace_sdk/__version__.py b/dataspace_sdk/__version__.py index 8d063328..cff36390 100644 --- a/dataspace_sdk/__version__.py +++ b/dataspace_sdk/__version__.py @@ -1,3 +1,3 @@ """Version information for DataSpace SDK.""" -__version__ = "0.5.02" +__version__ = "0.4.19" diff --git a/dataspace_sdk/resources/datasets.py b/dataspace_sdk/resources/datasets.py index df6313e3..24dee57a 100644 --- a/dataspace_sdk/resources/datasets.py +++ b/dataspace_sdk/resources/datasets.py @@ -1,6 +1,6 @@ """Dataset resource client for DataSpace SDK.""" -from typing import Any, Dict, Iterator, List, Optional +from typing import Any, Dict, List, Optional from dataspace_sdk.base import BaseAPIClient @@ -169,7 +169,6 @@ def list_all( license created updated - datasetType organization { id name @@ -178,31 +177,6 @@ def list_all( id value } - sectors { - id - name - } - promptMetadata - resources { - id - name - noOfEntries - fileDetails { - format - size - } - schema { - format - description - fieldName - } - promptDetails { - promptFormat - hasSystemPrompt - hasExampleResponses - promptCount - } - } } } """ @@ -631,176 +605,3 @@ def update_prompt_metadata( result: Dict[str, Any] = response.get("data", {}).get("updatePromptMetadata", {}) return result - - # ------------------------------------------------------------------ - # Indexed data access (data_db) - # ------------------------------------------------------------------ - - @staticmethod - def _build_data_params( - filters: Optional[Dict[str, Any]], - columns: Optional[List[str]], - order_by: Optional[List[str]], - limit: int, - offset: int, - count: Optional[bool], - ) -> Dict[str, Any]: - """Translate Pythonic kwargs into the ``GET /data/`` query-string form. - - ``filters`` is a flat dict using the same ``col`` / ``col__op`` keys as - the server. List values are passed through (requests will emit one - ``key=v`` pair per entry, used by ``__in`` / ``__nin``). - """ - params: Dict[str, Any] = {"limit": int(limit), "offset": int(offset)} - if columns: - params["columns"] = ",".join(columns) - if order_by: - params["order_by"] = ",".join(order_by) - if count is not None: - params["count"] = "true" if count else "false" - if filters: - for k, v in filters.items(): - if isinstance(v, (list, tuple)): - params[k] = list(v) - elif isinstance(v, bool): - params[k] = "true" if v else "false" - else: - params[k] = v - return params - - def get_resource_data( - self, - resource_id: str, - filters: Optional[Dict[str, Any]] = None, - columns: Optional[List[str]] = None, - order_by: Optional[List[str]] = None, - limit: int = 100, - offset: int = 0, - count: bool = True, - ) -> Dict[str, Any]: - """Fetch indexed (saved in ``data_db``) data for a single resource. - - Args: - resource_id: UUID of the resource (must have an indexed table). - filters: Column-level filters. Keys are either ``"col"`` (equality) - or ``"col__op"`` where op is one of: ``eq, ne, gt, gte, lt, - lte, in, nin, contains, icontains, startswith, istartswith, - endswith, iendswith, isnull, notnull``. - columns: Subset of columns to project. ``None`` returns all. - order_by: Columns to sort by. Prefix with ``-`` for DESC. - limit: Max rows to return (server caps at 10000). - offset: Number of rows to skip. - count: If ``True``, the response includes total matching row count. - - Returns: - A dict with ``columns``, ``rows``, ``total``, ``limit``, - ``offset``, ``available_columns``, ``resource_id``, - ``dataset_id``, and ``max_limit``. - """ - params = self._build_data_params(filters, columns, order_by, limit, offset, count) - return self.get(f"/api/resources/{resource_id}/data/", params=params) - - def get_dataset_data( - self, - dataset_id: str, - resource_id: Optional[str] = None, - filters: Optional[Dict[str, Any]] = None, - columns: Optional[List[str]] = None, - order_by: Optional[List[str]] = None, - limit: int = 100, - offset: int = 0, - count: bool = True, - ) -> Dict[str, Any]: - """Fetch indexed data for a dataset. - - By default operates on the dataset's first indexed (tabular) resource. - Pass ``resource_id`` to target a specific resource within the dataset. - Filtering / column / ordering semantics are identical to - :meth:`get_resource_data`. - """ - params = self._build_data_params(filters, columns, order_by, limit, offset, count) - if resource_id: - params["resource_id"] = resource_id - return self.get(f"/api/datasets/{dataset_id}/data/", params=params) - - def get_prompt_data( - self, - dataset_id: str, - resource_id: Optional[str] = None, - filters: Optional[Dict[str, Any]] = None, - columns: Optional[List[str]] = None, - order_by: Optional[List[str]] = None, - limit: int = 100, - offset: int = 0, - count: bool = True, - prompt_contains: Optional[str] = None, - response_contains: Optional[str] = None, - min_length: Optional[int] = None, - max_length: Optional[int] = None, - ) -> Dict[str, Any]: - """Fetch indexed data for a PROMPT-typed dataset. - - Same generic semantics as :meth:`get_dataset_data`, plus prompt-aware - shorthands that automatically map to the underlying prompt/response/ - length columns when present: - - Args: - prompt_contains: Substring (case-insensitive) match on the prompt - column (auto-detects ``prompt``/``input``/``instruction``/ - ``question``). - response_contains: Substring match on the response column - (auto-detects ``response``/``completion``/``answer``/ - ``output``). - min_length / max_length: Bounds on the length column - (auto-detects ``length``/``prompt_length``/``tokens``/ - ``token_count``). - - The response includes ``prompt_column``, ``response_column``, and - ``length_column`` indicating what was auto-detected. - """ - params = self._build_data_params(filters, columns, order_by, limit, offset, count) - if resource_id: - params["resource_id"] = resource_id - if prompt_contains is not None: - params["prompt_contains"] = prompt_contains - if response_contains is not None: - params["response_contains"] = response_contains - if min_length is not None: - params["min_length"] = int(min_length) - if max_length is not None: - params["max_length"] = int(max_length) - return self.get(f"/api/datasets/{dataset_id}/prompts/", params=params) - - def iter_resource_data( - self, - resource_id: str, - filters: Optional[Dict[str, Any]] = None, - columns: Optional[List[str]] = None, - order_by: Optional[List[str]] = None, - batch_size: int = 1000, - ) -> Iterator[Dict[str, Any]]: - """Yield rows as dicts, paging through the entire filtered result set. - - Each yielded item is a ``{column: value}`` mapping. ``batch_size`` is - capped at 10000 by the server. - """ - offset = 0 - while True: - page = self.get_resource_data( - resource_id=resource_id, - filters=filters, - columns=columns, - order_by=order_by, - limit=batch_size, - offset=offset, - count=False, - ) - cols: List[str] = page.get("columns", []) or [] - rows: List[List[Any]] = page.get("rows", []) or [] - if not rows: - return - for row in rows: - yield dict(zip(cols, row)) - if len(rows) < batch_size: - return - offset += len(rows) diff --git a/docs/dataset_data_api.md b/docs/dataset_data_api.md deleted file mode 100644 index 5e970ea9..00000000 --- a/docs/dataset_data_api.md +++ /dev/null @@ -1,266 +0,0 @@ -# Indexed Dataset Data API - -This document describes the HTTP endpoints and SDK methods for fetching the -*indexed tabular data* that DataSpace stores in the `data_db` PostgreSQL -database. When a CSV / XLSX / Parquet / JSON resource is uploaded, its rows are -indexed into a per-resource table so they can be queried, filtered, and -streamed without re-downloading the source file. - -## Overview - -| Layer | Surface | -|-------|---------| -| Backend utility | `api.utils.data_indexing.fetch_resource_data(...)` | -| HTTP API | `GET /api/resources//data/`, `GET /api/datasets//data/`, `GET /api/datasets//prompts/` | -| Python SDK | `DatasetClient.get_resource_data(...)`, `get_dataset_data(...)`, `get_prompt_data(...)`, `iter_resource_data(...)` | - -All three endpoints share the same query-parameter contract. The prompt -endpoint adds prompt-specific shorthands. - -## Permissions - -- **PUBLISHED** datasets are publicly readable. -- **DRAFT / ARCHIVED** datasets require the requesting user to be the dataset - owner, a superuser, or a member of the dataset's organization. - -## HTTP API - -### `GET /api/resources//data/` - -Returns indexed data for a single resource. - -### `GET /api/datasets//data/` - -Returns indexed data for a dataset. Defaults to the dataset's first indexed -resource. Use `?resource_id=` to target a specific resource. - -### `GET /api/datasets//prompts/` - -Same semantics as `/data/`, but the dataset must be of `dataset_type=PROMPT` -and the response includes the auto-detected prompt / response / length column -names. Convenience filters: - -| Param | Maps to | -|-------|---------| -| `prompt_contains=` | `__icontains=` | -| `response_contains=` | `__icontains=` | -| `min_length=` | `__gte=` | -| `max_length=` | `__lte=` | - -Auto-detected columns (case-insensitive, first match wins): - -- prompt: `prompt`, `input`, `instruction`, `question` -- response: `response`, `completion`, `answer`, `output` -- length: `length`, `prompt_length`, `tokens`, `token_count` - -If a candidate column is not present in the resource schema, the corresponding -shorthand is silently ignored. You can always fall back to the explicit -`__` form. - -### Query parameters - -Reserved (not interpreted as filters): - -| Param | Default | Notes | -|-------|---------|-------| -| `columns` | all | Comma-separated list of columns to project. | -| `limit` | `100` | Capped at `10000`. | -| `offset` | `0` | | -| `order_by` | none | Comma-separated. Prefix with `-` for DESC. | -| `count` | `true` | Set `false` to skip the `SELECT COUNT(*)` round-trip. | -| `resource_id` | first indexed | Only on `/datasets//data/` and `/prompts/`. | - -Any other query param is treated as a column filter. - -### Filter operators - -Filters use Django-ORM-style suffixes: `?__=`. Without a -suffix, equality is assumed: `?=`. - -| Operator | SQL | Notes | -|----------|-----|-------| -| `eq` (default) | `=` | | -| `ne` | `<>` | | -| `gt`, `gte`, `lt`, `lte` | `>`, `>=`, `<`, `<=` | | -| `in` | `= ANY(...)` | Repeat the param: `?col__in=a&col__in=b` (or `?col__in=a,b`). | -| `nin` | `<> ALL(...)` | Same shape as `in`. | -| `contains` / `icontains` | `LIKE` / `ILIKE` `'%v%'` | | -| `startswith` / `istartswith` | `LIKE` / `ILIKE` `'v%'` | | -| `endswith` / `iendswith` | `LIKE` / `ILIKE` `'%v'` | | -| `isnull` | `IS NULL` (truthy) / `IS NOT NULL` (falsy) | Value is parsed as bool. | -| `notnull` | inverse of `isnull` | | - -Unknown columns or unknown operators return HTTP **400** with a -`{"error": "..."}` body. All identifiers are quoted via `psycopg2.sql`; values -are bound as parameters — there is no string concatenation into the SQL. - -### Response shape - -```json -{ - "resource_id": "f1e2...", - "dataset_id": "abcd...", - "available_columns": ["id", "name", "price", "category"], - "max_limit": 10000, - "columns": ["id", "name"], - "rows": [[1, "alpha"], [2, "beta"]], - "total": 87, - "limit": 100, - "offset": 0 -} -``` - -The prompt endpoint additionally returns: - -```json -{ - "dataset_type": "PROMPT", - "prompt_column": "prompt", - "response_column": "response", - "length_column": "tokens" -} -``` - -Set `?count=false` to avoid the count query for large tables; `total` will be -`null`. - -### Examples - -```bash -# Books over $10, sorted by descending price, page 2 -curl "https://api.example.com/api/resources//data/?\ -category=books&price__gte=10&order_by=-price&limit=50&offset=50" - -# Multiple categories -curl "https://api.example.com/api/resources//data/?\ -category__in=books&category__in=media" - -# Prompt dataset: long English translation prompts -curl "https://api.example.com/api/datasets//prompts/?\ -prompt_contains=translate&min_length=50&language=en" -``` - -## Python SDK - -```python -from dataspace_sdk import DataSpaceClient - -client = DataSpaceClient( - base_url="https://dataspace.civicdatalab.in", - keycloak_url="https://opub-kc.civicdatalab.in", - keycloak_realm="DataSpace", - keycloak_client_id="dataspace", -) -client.login(username="...", password="...") -``` - -### `get_resource_data` - -```python -page = client.datasets.get_resource_data( - resource_id="f1e2...", - filters={ - "price__gte": 10, - "category__in": ["books", "media"], - "is_active": True, - }, - columns=["id", "title", "price"], - order_by=["-price", "title"], - limit=200, - offset=0, - count=True, -) -print(page["total"], len(page["rows"])) -``` - -### `get_dataset_data` - -Same parameters as `get_resource_data`, plus an optional `resource_id`. -Without `resource_id`, the dataset's first indexed resource is used. - -```python -page = client.datasets.get_dataset_data( - dataset_id="abcd...", - resource_id="optional-uuid", - filters={"region": "south"}, -) -``` - -### `get_prompt_data` - -Adds prompt-aware shorthands on top of the generic interface: - -```python -page = client.datasets.get_prompt_data( - dataset_id="abcd...", - prompt_contains="translate", - response_contains="bonjour", - min_length=20, - max_length=400, - filters={"language": "fr"}, - columns=["prompt", "response", "tokens"], - order_by=["-tokens"], -) -print(page["prompt_column"], page["response_column"], page["length_column"]) -``` - -### `iter_resource_data` — streaming all rows - -Transparently pages through the entire filtered result set, yielding each row -as a `{column: value}` dict. The server caps `batch_size` at `10000`. - -```python -for row in client.datasets.iter_resource_data( - resource_id="f1e2...", - filters={"is_active": True}, - columns=["id", "title", "price"], - batch_size=2000, -): - process(row) -``` - -## Backend utility - -When you need to fetch indexed data from inside the Django process (e.g. a -GraphQL resolver or background task), call the underlying utility directly: - -```python -from api.models import Resource -from api.utils.data_indexing import fetch_resource_data, DataFetchError - -resource = Resource.objects.get(id=resource_id) -try: - result = fetch_resource_data( - resource=resource, - filters={"price__gte": 10}, - columns=["id", "title", "price"], - order_by=["-price"], - limit=100, - offset=0, - count=True, - ) -except DataFetchError as e: - # Unknown column / no indexed table / etc. - raise -``` - -The utility validates every column against `ResourceSchema` (or the live -`information_schema` if no schema rows exist) and uses parameterised queries -exclusively — passing a malicious column name returns `DataFetchError`, -never a SQL injection. - -## Safety notes - -- Identifiers are quoted via `psycopg2.sql.Identifier`; values are passed as - query parameters. There is no string interpolation of user input into SQL. -- `statement_timeout` is set to **10 seconds** on every fetch. -- `limit` is clamped to **10000** rows. Use `iter_resource_data` to stream - larger result sets. -- The `data_db` connection is read-only from this layer's perspective — the - utility never executes anything other than `SELECT` / `SET statement_timeout`. - -## Related - -- [SDK overview](sdk/OVERVIEW.md) -- [SDK quick start](sdk/QUICKSTART.md) -- [Unified search API](unified_search_api.md) diff --git a/docs/sdk/README.md b/docs/sdk/README.md index 37b72461..19a67473 100644 --- a/docs/sdk/README.md +++ b/docs/sdk/README.md @@ -206,50 +206,6 @@ org_datasets = client.datasets.get_organization_datasets( ) ``` -### Fetch Indexed Dataset Data (filterable) - -For datasets whose resources have been indexed into `data_db` (CSV/XLSX/etc.), -you can query the underlying rows with column-level filters, projection, and -ordering. See the dedicated guide: [dataset_data_api.md](../dataset_data_api.md). - -```python -# Per-resource fetch -page = client.datasets.get_resource_data( - resource_id="f1e2...", - filters={"price__gte": 10, "category__in": ["books", "media"]}, - columns=["id", "title", "price"], - order_by=["-price"], - limit=200, -) -print(page["total"], len(page["rows"])) - -# Per-dataset fetch (defaults to first indexed resource) -page = client.datasets.get_dataset_data( - dataset_id="abcd...", - filters={"region": "south"}, -) - -# Prompt datasets — extra prompt-aware shorthands -page = client.datasets.get_prompt_data( - dataset_id="abcd...", - prompt_contains="translate", - min_length=20, - filters={"language": "fr"}, -) - -# Stream all matching rows as dicts -for row in client.datasets.iter_resource_data( - resource_id="f1e2...", - filters={"is_active": True}, - batch_size=2000, -): - process(row) -``` - -Supported filter operators (Django-style suffixes): `eq, ne, gt, gte, lt, lte, -in, nin, contains, icontains, startswith, istartswith, endswith, iendswith, -isnull, notnull`. - ## Working with AI Models ### Search AI Models diff --git a/tests/test_data_indexing_filters.py b/tests/test_data_indexing_filters.py deleted file mode 100644 index e8db64ff..00000000 --- a/tests/test_data_indexing_filters.py +++ /dev/null @@ -1,174 +0,0 @@ -"""Unit tests for the SQL-builder / filter helpers in -``api.utils.data_indexing`` and the request-parsing helpers in -``api.views.dataset_data``. - -These tests deliberately avoid touching the actual ``data_db`` connection — -they validate the logic that turns user input into safe SQL fragments and -into normalised filter dicts. -""" - -import unittest - -from django.http import QueryDict -from psycopg2 import sql as pg_sql - -from api.utils.data_indexing import ( - DataFetchError, - _build_order_by, - _build_where_clause, - _parse_filter_key, -) -from api.views.dataset_data import _extract_filters, _parse_bool, _parse_int - - -def _render(composable: pg_sql.Composable) -> str: - """Stringify a Composable without needing a live DB connection. - - Walks the Composed tree and concatenates the literal strings of each - leaf (SQL/Identifier). Identifiers are rendered as ``"name"``. - """ - if isinstance(composable, pg_sql.SQL): - return composable.string - if isinstance(composable, pg_sql.Identifier): - # psycopg2 may store multiple components for schema-qualified idents - parts = ( - composable.strings - if hasattr(composable, "strings") - else (composable._wrapped if hasattr(composable, "_wrapped") else []) - ) - return ".".join(f'"{p}"' for p in parts) - if isinstance(composable, pg_sql.Composed): - return "".join(_render(c) for c in composable.seq) - if isinstance(composable, pg_sql.Placeholder): - return "%s" - return str(composable) - - -class TestParseFilterKey(unittest.TestCase): - def test_no_op_defaults_to_eq(self) -> None: - self.assertEqual(_parse_filter_key("price"), ("price", "eq")) - - def test_known_op_split(self) -> None: - self.assertEqual(_parse_filter_key("price__gte"), ("price", "gte")) - self.assertEqual(_parse_filter_key("name__icontains"), ("name", "icontains")) - - def test_unknown_op_treated_as_column(self) -> None: - # Column may legitimately contain "__" — if suffix isn't a known op, - # fall back to equality on the full key. - col, op = _parse_filter_key("weird__suffix") - self.assertEqual((col, op), ("weird__suffix", "eq")) - - -class TestBuildWhereClause(unittest.TestCase): - allowed = ["id", "price", "name", "active"] - - def test_empty_filters(self) -> None: - sql, params = _build_where_clause({}, self.allowed) - self.assertEqual(params, []) - self.assertEqual(_render(sql), "") - - def test_eq_and_gte(self) -> None: - sql, params = _build_where_clause({"price__gte": 10, "name": "abc"}, self.allowed) - rendered = _render(sql) - self.assertIn(" WHERE ", rendered) - self.assertIn('"price" >= %s', rendered) - self.assertIn('"name" = %s', rendered) - self.assertIn(10, params) - self.assertIn("abc", params) - - def test_in_operator_normalises_to_list(self) -> None: - sql, params = _build_where_clause({"id__in": ("a", "b")}, self.allowed) - rendered = _render(sql) - self.assertIn("= ANY(%s)", rendered) - self.assertEqual(params, [["a", "b"]]) - - def test_isnull_truthy(self) -> None: - sql, params = _build_where_clause({"name__isnull": True}, self.allowed) - rendered = _render(sql) - self.assertIn("IS NULL", rendered) - self.assertEqual(params, []) - - def test_isnull_false_means_not_null(self) -> None: - sql, _ = _build_where_clause({"name__isnull": "false"}, self.allowed) - self.assertIn("IS NOT NULL", _render(sql)) - - def test_unknown_column_rejected(self) -> None: - with self.assertRaises(DataFetchError): - _build_where_clause({"evil__gte": 1}, self.allowed) - - def test_icontains_wraps_value(self) -> None: - _, params = _build_where_clause({"name__icontains": "foo"}, self.allowed) - self.assertEqual(params, ["%foo%"]) - - def test_startswith_wraps_value(self) -> None: - _, params = _build_where_clause({"name__startswith": "foo"}, self.allowed) - self.assertEqual(params, ["foo%"]) - - -class TestBuildOrderBy(unittest.TestCase): - allowed = ["id", "price"] - - def test_none_returns_empty(self) -> None: - sql = _build_order_by(None, self.allowed) - self.assertEqual(_render(sql), "") - - def test_asc_and_desc(self) -> None: - sql = _build_order_by(["-price", "id"], self.allowed) - rendered = _render(sql) - self.assertIn(" ORDER BY ", rendered) - self.assertIn('"price" DESC', rendered) - self.assertIn('"id" ASC', rendered) - - def test_unknown_column_rejected(self) -> None: - with self.assertRaises(DataFetchError): - _build_order_by(["evil"], self.allowed) - - -class TestViewQueryParamHelpers(unittest.TestCase): - def test_parse_bool(self) -> None: - self.assertTrue(_parse_bool("true")) - self.assertTrue(_parse_bool("YES")) - self.assertTrue(_parse_bool(True)) - self.assertFalse(_parse_bool("0")) - self.assertFalse(_parse_bool(None, default=False)) - self.assertTrue(_parse_bool(None, default=True)) - - def test_parse_int(self) -> None: - self.assertEqual(_parse_int("42", 0), 42) - self.assertEqual(_parse_int(None, 7), 7) - self.assertEqual(_parse_int("not-a-number", 9), 9) - - def test_extract_filters_skips_reserved(self) -> None: - qd = QueryDict(mutable=True) - qd.update({"limit": "10", "offset": "0", "columns": "a,b"}) - qd["price__gte"] = "5" - qd["name"] = "abc" - result = _extract_filters(qd) - self.assertEqual(result, {"price__gte": "5", "name": "abc"}) - - def test_extract_filters_in_collapses_to_list(self) -> None: - qd = QueryDict("col__in=a&col__in=b&col__in=c,d") - result = _extract_filters(qd) - self.assertIn("col__in", result) - self.assertEqual(sorted(result["col__in"]), ["a", "b", "c", "d"]) - - def test_extract_filters_custom_reserved(self) -> None: - qd = QueryDict("limit=10&prompt_contains=x&col=y") - result = _extract_filters( - qd, - reserved={ - "limit", - "offset", - "columns", - "order_by", - "count", - "resource_id", - "format", - "prompt_contains", - }, - ) - self.assertEqual(result, {"col": "y"}) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 7155fa8e..74653975 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -140,117 +140,5 @@ def test_search_with_sorting(self, mock_request: MagicMock) -> None: mock_request.assert_called_once() -class TestDatasetClientDataFetch(unittest.TestCase): - """Tests for indexed-data fetch methods on DatasetClient.""" - - def setUp(self) -> None: - self.client = DatasetClient("https://api.test.com", MagicMock()) - - def test_build_data_params_basic(self) -> None: - params = DatasetClient._build_data_params( - filters=None, - columns=None, - order_by=None, - limit=50, - offset=10, - count=True, - ) - self.assertEqual(params["limit"], 50) - self.assertEqual(params["offset"], 10) - self.assertEqual(params["count"], "true") - self.assertNotIn("columns", params) - - def test_build_data_params_filters_and_lists(self) -> None: - params = DatasetClient._build_data_params( - filters={"price__gte": 10, "tag__in": ["a", "b"], "active": True}, - columns=["id", "name"], - order_by=["-price", "name"], - limit=100, - offset=0, - count=False, - ) - self.assertEqual(params["columns"], "id,name") - self.assertEqual(params["order_by"], "-price,name") - self.assertEqual(params["count"], "false") - self.assertEqual(params["price__gte"], 10) - self.assertEqual(params["tag__in"], ["a", "b"]) - self.assertEqual(params["active"], "true") - - @patch.object(DatasetClient, "get") - def test_get_resource_data(self, mock_get: MagicMock) -> None: - mock_get.return_value = { - "columns": ["id"], - "rows": [[1]], - "total": 1, - "limit": 100, - "offset": 0, - } - result = self.client.get_resource_data( - "res-1", - filters={"id__gte": 1}, - columns=["id"], - order_by=["id"], - limit=10, - ) - self.assertEqual(result["total"], 1) - endpoint, kwargs = mock_get.call_args[0][0], mock_get.call_args.kwargs - self.assertEqual(endpoint, "/api/resources/res-1/data/") - self.assertEqual(kwargs["params"]["columns"], "id") - self.assertEqual(kwargs["params"]["id__gte"], 1) - - @patch.object(DatasetClient, "get") - def test_get_dataset_data_with_resource_id(self, mock_get: MagicMock) -> None: - mock_get.return_value = {"rows": [], "columns": [], "total": 0} - self.client.get_dataset_data("ds-1", resource_id="res-9", limit=5) - endpoint = mock_get.call_args[0][0] - params = mock_get.call_args.kwargs["params"] - self.assertEqual(endpoint, "/api/datasets/ds-1/data/") - self.assertEqual(params["resource_id"], "res-9") - self.assertEqual(params["limit"], 5) - - @patch.object(DatasetClient, "get") - def test_get_prompt_data_shorthands(self, mock_get: MagicMock) -> None: - mock_get.return_value = {"rows": [], "columns": [], "total": 0} - self.client.get_prompt_data( - "ds-1", - prompt_contains="translate", - response_contains="hello", - min_length=5, - max_length=100, - ) - endpoint = mock_get.call_args[0][0] - params = mock_get.call_args.kwargs["params"] - self.assertEqual(endpoint, "/api/datasets/ds-1/prompts/") - self.assertEqual(params["prompt_contains"], "translate") - self.assertEqual(params["response_contains"], "hello") - self.assertEqual(params["min_length"], 5) - self.assertEqual(params["max_length"], 100) - - @patch.object(DatasetClient, "get_resource_data") - def test_iter_resource_data_paginates(self, mock_get_data: MagicMock) -> None: - # Two pages: full batch then partial page (terminator) - mock_get_data.side_effect = [ - {"columns": ["id", "name"], "rows": [[1, "a"], [2, "b"]]}, - {"columns": ["id", "name"], "rows": [[3, "c"]]}, - ] - rows = list(self.client.iter_resource_data("res-1", batch_size=2)) - self.assertEqual( - rows, - [ - {"id": 1, "name": "a"}, - {"id": 2, "name": "b"}, - {"id": 3, "name": "c"}, - ], - ) - self.assertEqual(mock_get_data.call_count, 2) - # Second call advances offset - self.assertEqual(mock_get_data.call_args_list[1].kwargs["offset"], 2) - - @patch.object(DatasetClient, "get_resource_data") - def test_iter_resource_data_empty(self, mock_get_data: MagicMock) -> None: - mock_get_data.return_value = {"columns": ["id"], "rows": []} - self.assertEqual(list(self.client.iter_resource_data("res-1")), []) - - if __name__ == "__main__": unittest.main() diff --git a/tests/test_settings.py b/tests/test_settings.py index 751a62a9..2fee6f51 100644 --- a/tests/test_settings.py +++ b/tests/test_settings.py @@ -6,16 +6,16 @@ import sys # Add the project root directory to Python path -project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) +project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) sys.path.insert(0, project_root) from DataSpace.settings import * # Use an in-memory SQLite database for testing DATABASES = { - "default": { - "ENGINE": "django.db.backends.sqlite3", - "NAME": ":memory:", + 'default': { + 'ENGINE': 'django.db.backends.sqlite3', + 'NAME': ':memory:', } } @@ -24,10 +24,9 @@ # Use a faster password hasher during tests PASSWORD_HASHERS = [ - "django.contrib.auth.hashers.MD5PasswordHasher", + 'django.contrib.auth.hashers.MD5PasswordHasher', ] - # Disable migrations during tests class DisableMigrations: def __contains__(self, item): @@ -36,43 +35,19 @@ def __contains__(self, item): def __getitem__(self, item): return None - MIGRATION_MODULES = DisableMigrations() # Disable celery tasks during tests CELERY_ALWAYS_EAGER = True CELERY_EAGER_PROPAGATES_EXCEPTIONS = True -# NOTE: We intentionally do NOT override INSTALLED_APPS — the real settings -# already define AUTH_USER_MODEL = "authorization.User", so the -# ``authorization`` app must be present for Django to bootstrap. Trimming the -# list to a "minimal" set previously broke every test with -# ``ImproperlyConfigured: AUTH_USER_MODEL refers to model 'authorization.User' -# that has not been installed``. Use ``MIGRATION_MODULES`` above to keep -# tests fast instead of stripping apps. - -# Drop middleware that requires live external services (Keycloak / rate -# limiter / activity stream) so unit tests can boot without network access. -MIDDLEWARE = [ - m - for m in MIDDLEWARE # noqa: F405 — imported via ``from DataSpace.settings import *`` - if m - not in { - "authorization.middleware.KeycloakAuthenticationMiddleware", - "authorization.middleware.activity_consent.ActivityConsentMiddleware", - "api.middleware.rate_limit.rate_limit_middleware", - "api.middleware.request_validator.RequestValidationMiddleware", - } +# Required apps for testing +INSTALLED_APPS = [ + 'django.contrib.admin', + 'django.contrib.auth', + 'django.contrib.contenttypes', + 'django.contrib.sessions', + 'django.contrib.messages', + 'django.contrib.staticfiles', + 'api', ] - -# Elasticsearch is optional during unit tests — point the DSL at a dummy host -# so module import doesn't try to connect. -ELASTICSEARCH_DSL = { - "default": {"hosts": "localhost:9200"}, -} - -# Disable real Keycloak calls in tests. -KEYCLOAK_SERVER_URL = "http://localhost:8080" -KEYCLOAK_REALM = "test" -KEYCLOAK_CLIENT_ID = "test-client" -KEYCLOAK_CLIENT_SECRET = "test-secret"