Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 25 additions & 7 deletions api/types/type_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,13 @@
from api.types.type_preview_data import PreviewData
from api.types.type_prompt_resource_details import TypePromptResourceDetails
from api.types.type_resource_metadata import TypeResourceMetadata
from api.utils.data_indexing import get_preview_data, get_row_count
from api.utils.data_indexing import (
FILE_COUNT_FORMATS,
INDEXED_FORMATS,
get_entry_count_from_file,
get_preview_data,
get_row_count,
)
from api.utils.graphql_telemetry import trace_resolver

logger = structlog.get_logger(__name__)
Expand Down Expand Up @@ -177,14 +183,26 @@ def no_of_entries(self) -> int:
if not file_details:
return 0

if not hasattr(file_details, "format") or file_details.format.lower() != "csv":
if not hasattr(file_details, "format"):
return 0

try:
return get_row_count(self) # type: ignore
except Exception as row_count_error:
logger.error(f"Error in get_row_count: {str(row_count_error)}")
return 0
fmt = file_details.format.lower()

if fmt in INDEXED_FORMATS:
try:
return get_row_count(self) # type: ignore
except Exception as row_count_error:
logger.error(f"Error in get_row_count: {str(row_count_error)}")
return 0

if fmt in FILE_COUNT_FORMATS:
try:
return get_entry_count_from_file(self) # type: ignore
except Exception as file_count_error:
logger.error(f"Error in get_entry_count_from_file: {str(file_count_error)}")
return 0

return 0
except Exception as e:
logger.error(f"Error getting number of entries: {str(e)}")
return 0
Expand Down
41 changes: 41 additions & 0 deletions api/utils/data_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
# Use a separate database for data tables
DATA_DB = "data_db" # This should match the connection name in settings.py

# Formats indexed into ResourceDataTable (queryable via get_row_count).
# Entries are lower-case; the stored format is lower-cased before membership tests.
INDEXED_FORMATS = {"csv", "xls", "xlsx", "ods", "parquet", "feather", "json", "tsv"}
# Formats counted by parsing the file directly (not indexed into DB);
# get_entry_count_from_file handles these.
FILE_COUNT_FORMATS = {"yml", "yaml", "xml"}

# Allowed comparison operators for column-based filtering on indexed data.
# Maps operator suffix -> (sql_template_with_{ph}_placeholder, value_transformer)
_FILTER_OPERATORS: Dict[str, Tuple[str, Any]] = {
Expand Down Expand Up @@ -397,6 +402,42 @@ def get_row_count(resource: Resource) -> int:
return 0


def get_entry_count_from_file(resource: Resource) -> int:
    """Count entries in yml/yaml/xml files by parsing the file directly.

    Counting rules:
      * YAML whose top-level value is a list: the number of list items.
      * YAML whose top-level value is a mapping: the length of the first
        value that is a list, or the number of keys if no value is a list.
      * Any other YAML document (scalar, empty file): 0.
      * XML: the number of direct children of the root element.

    Args:
        resource: Resource whose ``resourcefiledetails`` (if present) supplies
            the ``format`` string and the ``file.path`` to parse.

    Returns:
        The entry count, or 0 when the resource has no file details, the
        format is not yml/yaml/xml, or any error occurs (errors are logged,
        never raised).
    """
    try:
        file_details = getattr(resource, "resourcefiledetails", None)
        if not file_details:
            return 0

        fmt = file_details.format.lower()
        filepath = file_details.file.path

        if fmt in ("yml", "yaml"):
            import yaml

            # Open in binary mode so PyYAML detects the encoding itself
            # (BOM-based, UTF-8 otherwise) instead of depending on the
            # platform's locale default used by text-mode open().
            with open(filepath, "rb") as f:
                data = yaml.safe_load(f)

            if isinstance(data, list):
                return len(data)
            if isinstance(data, dict):
                # Prefer the first list-valued key (e.g. {"items": [...]});
                # fall back to the number of top-level keys.
                first_list = next(
                    (value for value in data.values() if isinstance(value, list)),
                    None,
                )
                return len(data) if first_list is None else len(first_list)
            return 0

        if fmt == "xml":
            import xml.etree.ElementTree as ET

            # NOTE(review): xml.etree.ElementTree is not hardened against
            # maliciously constructed XML; if these files can come from
            # untrusted uploads, confirm whether a hardened parser is needed.
            root = ET.parse(filepath).getroot()
            # len() on an Element is its number of direct children.
            return len(root)

        return 0
    except Exception as e:
        # Best-effort count: any failure is logged and reported as 0.
        logger.error(f"Error counting entries from file for resource {resource.id}: {str(e)}")
        return 0


def get_preview_data(resource: Resource) -> Optional[PreviewData]:
try:
if not resource.preview_enabled:
Expand Down
Loading