class _UnicodeTokenizer:
  """Tokenizer that handles Unicode text with word-boundary awareness.

  The default RougeScorer tokenizer strips characters outside ``[a-z0-9]``, so
  text in scripts without Latin word boundaries (Chinese, Japanese, Thai, etc.)
  produces zero tokens and scores 0.0 even on an exact match.

  Routing rules, applied to the lower-cased text:

  * Pure-ASCII text is handed to rouge-score's ``DefaultTokenizer`` unchanged,
    so the pre-existing behavior (including Porter stemming) is preserved.
  * Otherwise the text is split into ASCII letter/digit runs and single
    non-ASCII word characters. If ASCII word characters outnumber non-ASCII
    ones, the whole text is still delegated to ``DefaultTokenizer``.
  * Otherwise each ASCII run is passed through ``DefaultTokenizer`` (so the
    ``use_stemmer`` setting applies to it as well) and each non-ASCII word
    character (e.g. a CJK ideograph) becomes its own token, so partial overlap
    is scored instead of collapsing into a single opaque token.

  Only word characters take part in the majority decision: whitespace and
  punctuation are ignored, so ASCII separators around CJK text cannot push it
  onto the ASCII-only path.
  """

  # One alternative per token kind: a run of ASCII letters/digits, or a single
  # word character that is not an underscore. After lower-casing, the second
  # alternative can only match non-ASCII characters, because every ASCII word
  # character other than "_" is already consumed by the first alternative.
  _TOKEN_RE = re.compile(r"[a-z0-9]+|[^\W_]")

  def __init__(self, use_stemmer: bool = True):
    """Creates the tokenizer.

    Args:
      use_stemmer: Forwarded to the wrapped ``DefaultTokenizer``.
    """
    self._default = rouge_scorer.tokenizers.DefaultTokenizer(use_stemmer)

  def tokenize(self, text: str) -> list[str]:
    """Tokenizes text using Unicode-aware word boundaries.

    Args:
      text: The text to tokenize.

    Returns:
      The list of tokens; empty when the text has no word characters.
    """
    text = text.lower()
    if not text:
      return []
    if text.isascii():
      # Nothing Unicode-specific to do; keep the original tokenization.
      return self._default.tokenize(text)

    pieces = self._TOKEN_RE.findall(text)
    # ASCII pieces are multi-character runs; non-ASCII pieces are single chars.
    ascii_word_chars = sum(len(piece) for piece in pieces if piece.isascii())
    non_ascii_word_chars = sum(1 for piece in pieces if not piece.isascii())
    if ascii_word_chars > non_ascii_word_chars:
      return self._default.tokenize(text)

    tokens: list[str] = []
    for piece in pieces:
      if piece.isascii():
        # Route Latin/digit runs through the default tokenizer so a word is
        # tokenized the same way whichever branch the surrounding text took.
        tokens.extend(self._default.tokenize(piece))
      else:
        tokens.append(piece)
    return tokens
@@ -110,7 +139,11 @@ def _calculate_rouge_1_scores(candidate: str, reference: str): Returns: A dictionary containing the ROUGE-1 precision, recall, and f-measure. """ - scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True) + scorer = rouge_scorer.RougeScorer( + ["rouge1"], + use_stemmer=True, + tokenizer=_UnicodeTokenizer(), + ) # The score method returns a dictionary where keys are the ROUGE types # and values are Score objects (tuples) with precision, recall, and fmeasure.