Skip to content

Commit 07c1659

Browse files
fix:standardize_lang (#12)
* fix:standardize_lang * fix:standardize_lang * Update ocp_pipeline/opm.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> * fix:standardize_lang * fix:standardize_lang * fix:standardize_lang --------- Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
1 parent 22abb08 commit 07c1659

File tree

2 files changed

+47
-13
lines changed

2 files changed

+47
-13
lines changed

ocp_pipeline/opm.py

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,14 @@
1212
from ovos_bus_client.session import SessionManager
1313
from ovos_plugin_manager.ocp import available_extractors
1414
from ovos_plugin_manager.templates.pipeline import IntentMatch, PipelinePlugin
15+
from ovos_utils.lang import standardize_lang_tag, get_language_dir
1516
from ovos_utils.log import LOG
1617
from ovos_utils.messagebus import FakeBus
1718
from ovos_utils.ocp import MediaType, PlaybackType, PlaybackMode, PlayerState, OCP_ID, \
1819
MediaEntry, Playlist, MediaState, TrackState, dict2entry, PluginStream
1920
from ovos_workshop.app import OVOSAbstractApplication
2021
from padacioso import IntentContainer
21-
22+
from langcodes import closest_match
2223
from ocp_pipeline.feats import OCPFeaturizer
2324
from ocp_pipeline.legacy import LegacyCommonPlay
2425

@@ -102,16 +103,18 @@ def load_classifiers(self):
102103
def load_resource_files(self):
    """Load the bundled intent sample files for every native language.

    Each entry of ``self.native_langs`` is normalized with
    ``standardize_lang_tag`` and its locale folder is resolved below this
    package's ``locale`` directory through ``get_language_dir``. Every
    file in that folder whose name is listed in ``self.intents`` is read
    and split into one sample per line, with doubled braces (``{{`` /
    ``}}``) collapsed to single braces.

    Returns:
        dict: ``{lang: {intent_file_name: [sample, ...]}}`` keyed by the
        standardized language tag. A language for which no locale folder
        is found maps to an empty dict.
    """
    intents = {}
    # loop-invariant: all languages live under the same locale root
    locale_root = join(dirname(__file__), "locale")
    for lang in self.native_langs:
        lang = standardize_lang_tag(lang)
        intents[lang] = {}
        locale_folder = get_language_dir(locale_root, lang)
        if locale_folder is None:
            # no resources shipped for this language
            continue
        for f in os.listdir(locale_folder):
            if f not in self.intents:
                continue
            # Locale files hold localized text; decode explicitly as UTF-8
            # instead of relying on the platform default encoding.
            with open(join(locale_folder, f), encoding="utf-8") as intent:
                samples = intent.read().split("\n")
            intents[lang][f] = [s.replace("{{", "{").replace("}}", "}")
                                for s in samples]
    return intents
116119

117120
def register_ocp_api_events(self):
@@ -138,6 +141,7 @@ def register_ocp_intents(self):
138141
intent_files = self.load_resource_files()
139142

140143
for lang, intent_data in intent_files.items():
144+
lang = standardize_lang_tag(lang)
141145
self.intent_matchers[lang] = IntentContainer()
142146
for intent_name in self.intents:
143147
samples = intent_data.get(intent_name)
@@ -286,7 +290,8 @@ def handle_player_state_update(self, message: Message):
286290
def match_high(self, utterances: List[str], lang: str, message: Message = None) -> Optional[IntentMatch]:
287291
""" exact matches only, handles playback control
288292
recommended after high confidence intents pipeline stage """
289-
if lang not in self.intent_matchers:
293+
lang = self._get_closest_lang(lang)
294+
if lang is None: # no intents registered for this lang
290295
return None
291296

292297
self.bus.emit(Message("ovos.common_play.status")) # sync
@@ -327,6 +332,8 @@ def match_high(self, utterances: List[str], lang: str, message: Message = None)
327332
def match_medium(self, utterances: List[str], lang: str, message: Message = None) -> Optional[IntentMatch]:
328333
""" match a utterance via classifiers,
329334
recommended before common_qa pipeline stage"""
335+
lang = standardize_lang_tag(lang)
336+
330337
utterance = utterances[0].lower()
331338
# is this an OCP query?
332339
is_ocp, bconf = self.is_ocp_query(utterance, lang)
@@ -368,6 +375,8 @@ def match_fallback(self, utterances: List[str], lang: str, message: Message = No
368375
if not ents:
369376
return None
370377

378+
lang = standardize_lang_tag(lang)
379+
371380
# classify the query media type
372381
media_type, confidence = self.classify_media(utterance, lang)
373382

@@ -388,7 +397,7 @@ def match_fallback(self, utterances: List[str], lang: str, message: Message = No
388397

389398
def _process_play_query(self, utterance: str, lang: str, match: dict = None,
390399
message: Optional[Message] = None) -> Optional[IntentMatch]:
391-
400+
lang = standardize_lang_tag(lang)
392401
match = match or {}
393402
player = self.get_player(message)
394403
# if media is currently paused, empty string means "resume playback"
@@ -455,6 +464,7 @@ def handle_search_query(self, message: Message):
455464
if num:
456465
phrase += " " + num
457466

467+
lang = standardize_lang_tag(lang)
458468
# classify the query media type
459469
media_type, prob = self.classify_media(utterance, lang)
460470
# search common play skills
@@ -503,6 +513,7 @@ def handle_play_intent(self, message: Message):
503513
skills = message.data.get("skills", [])
504514

505515
# search common play skills
516+
lang = standardize_lang_tag(lang)
506517
results = self._search(query, media_type, lang,
507518
skills=skills, message=message)
508519

@@ -613,6 +624,7 @@ def handle_search_error_intent(self, message: Message):
613624

614625
# NLP
615626
def voc_match_media(self, query: str, lang: str) -> Tuple[MediaType, float]:
627+
lang = standardize_lang_tag(lang)
616628
# simplistic approach via voc_match, works anywhere
617629
# and it's easy to localize, but isn't very accurate
618630
if self.voc_match(query, "MusicKeyword", lang=lang):
@@ -674,6 +686,7 @@ def voc_match_media(self, query: str, lang: str) -> Tuple[MediaType, float]:
674686

675687
def classify_media(self, query: str, lang: str) -> Tuple[MediaType, float]:
676688
""" determine what media type is being requested """
689+
lang = standardize_lang_tag(lang)
677690
# using a trained classifier (Experimental)
678691
if self.config.get("experimental_media_classifier", False):
679692
from ovos_classifiers.skovos.classifier import SklearnOVOSClassifier
@@ -701,6 +714,7 @@ def classify_media(self, query: str, lang: str) -> Tuple[MediaType, float]:
701714

702715
def is_ocp_query(self, query: str, lang: str) -> Tuple[bool, float]:
703716
""" determine if a playback question is being asked"""
717+
lang = standardize_lang_tag(lang)
704718
if self.config.get("experimental_binary_classifier", False):
705719
from ovos_classifiers.skovos.classifier import SklearnOVOSClassifier
706720
try:
@@ -731,6 +745,7 @@ def _should_resume(self, phrase: str, lang: str, message: Optional[Message] = No
731745
@param phrase: Extracted playback phrase
732746
@return: True if player should resume, False if this is a new request
733747
"""
748+
lang = standardize_lang_tag(lang)
734749
player = self.get_player(message)
735750
if player.player_state == PlayerState.PAUSED:
736751
if not phrase.strip() or \
@@ -782,6 +797,7 @@ def normalize_results(self, results: list) -> List[Union[MediaEntry, Playlist, P
782797
def filter_results(self, results: list, phrase: str, lang: str,
783798
media_type: MediaType = MediaType.GENERIC,
784799
message: Optional[Message] = None) -> list:
800+
lang = standardize_lang_tag(lang)
785801
# ignore very low score matches
786802
l1 = len(results)
787803
results = [r for r in results
@@ -1031,6 +1047,10 @@ def match_legacy(self, utterances: List[str], lang: str, message: Message = None
10311047

10321048
utterance = utterances[0].lower()
10331049

1050+
lang = self._get_closest_lang(lang)
1051+
if lang is None: # no intents registered for this lang
1052+
return None
1053+
10341054
match = self.intent_matchers[lang].calc_intent(utterance)
10351055

10361056
if match["name"] is None:
@@ -1045,6 +1065,18 @@ def match_legacy(self, utterances: List[str], lang: str, message: Message = None
10451065
skill_id=OCP_ID,
10461066
utterance=utterance)
10471067

1068+
def _get_closest_lang(self, lang: str) -> Optional[str]:
1069+
if self.intent_matchers:
1070+
lang = standardize_lang_tag(lang)
1071+
closest, score = closest_match(lang, list(self.intent_matchers.keys()))
1072+
# https://langcodes-hickford.readthedocs.io/en/sphinx/index.html#distance-values
1073+
# 0 -> These codes represent the same language, possibly after filling in values and normalizing.
1074+
# 1- 3 -> These codes indicate a minor regional difference.
1075+
# 4 - 10 -> These codes indicate a significant but unproblematic regional difference.
1076+
if score < 10:
1077+
return closest
1078+
return None
1079+
10481080
def handle_legacy_cps(self, message: Message):
10491081
"""intent handler for legacy CPS matches"""
10501082
utt = message.data["query"]

requirements.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
11
ovos-workshop>=0.1.7,<2.0.0
2-
ovos-classifiers
2+
ovos-classifiers
3+
ovos-utils>=0.3.5,<1.0.0
4+
langcodes

0 commit comments

Comments
 (0)