diff --git a/atlasbot/engine/answerer.py b/atlasbot/engine/answerer.py
index 5a3b29d..323b36d 100644
--- a/atlasbot/engine/answerer.py
+++ b/atlasbot/engine/answerer.py
@@ -199,6 +199,7 @@ class AnswerEngine:
         keywords = normalize.get("keywords") or []
         _debug_log("normalize_parsed", {"normalized": normalized, "keywords": keywords})
         keyword_tokens = _extract_keywords(question, normalized, sub_questions=[], keywords=keywords)
+        question_tokens = _extract_question_tokens(normalized)
         if observer:
             observer("route", "routing")
 
@@ -328,7 +329,7 @@ class AnswerEngine:
         )
         if isinstance(signals, list):
             signal_tokens = [str(item) for item in signals if item]
-        all_tokens = _merge_tokens(signal_tokens, keyword_tokens)
+        all_tokens = _merge_tokens(signal_tokens, keyword_tokens, question_tokens)
         if observer:
             observer("retrieve", "scanning chunks")
         candidate_lines: list[str] = []
@@ -1502,9 +1503,9 @@ def _has_keyword_overlap(lines: list[str], keywords: list[str]) -> bool:
     return False
 
 
-def _merge_tokens(primary: list[str], secondary: list[str]) -> list[str]:
+def _merge_tokens(primary: list[str], secondary: list[str], third: list[str] | None = None) -> list[str]:
     merged: list[str] = []
-    for token in primary + secondary:
+    for token in primary + secondary + (third or []):
         if not token:
             continue
         if token not in merged:
@@ -1512,6 +1513,18 @@ def _merge_tokens(primary: list[str], secondary: list[str]) -> list[str]:
     return merged
 
 
+def _extract_question_tokens(question: str) -> list[str]:
+    if not question:
+        return []
+    tokens: list[str] = []
+    for part in re.split(r"[^a-zA-Z0-9_-]+", question.lower()):
+        if len(part) < TOKEN_MIN_LEN:
+            continue
+        if part not in tokens:
+            tokens.append(part)
+    return tokens
+
+
 def _expand_tokens(tokens: list[str]) -> list[str]:
     if not tokens:
         return []