open-compass · MaiziXiao · May 29, 2025 · May 26, 2025 · May 27, 2025 · May 27, 2025
diff --git a/opencompass/configs/datasets/SmolInstruct/smolinstruct_0shot_instruct_gen.py b/opencompass/configs/datasets/SmolInstruct/smolinstruct_0shot_instruct_gen.py
@@ -0,0 +1,10 @@
+from mmengine.config import read_base
+
+with read_base():
+    from opencompass.configs.datasets.SmolInstruct.smolinstruct_nc_0shot_instruct import nc_0shot_instruct_datasets
+    from opencompass.configs.datasets.SmolInstruct.smolinstruct_pp_acc_0_shot_instruct import pp_acc_datasets_0shot_instruct
+    from opencompass.configs.datasets.SmolInstruct.smolinstruct_rmse_0shot_instruct import pp_rmse_0shot_instruct_datasets
+    from opencompass.configs.datasets.SmolInstruct.smolinstruct_fts_0shot_instruct import fts_0shot_instruct_datasets
+    from opencompass.configs.datasets.SmolInstruct.smolinstruct_meteor_0shot_instruct import meteor_0shot_instruct_datasets
+# Aggregate list: concatenation order below fixes the order of the entries.
+smolinstruct_datasets_0shot_instruct = nc_0shot_instruct_datasets + pp_rmse_0shot_instruct_datasets + pp_acc_datasets_0shot_instruct + meteor_0shot_instruct_datasets + fts_0shot_instruct_datasets
diff --git a/opencompass/configs/datasets/SmolInstruct/smolinstruct_fts_0shot_instruct.py b/opencompass/configs/datasets/SmolInstruct/smolinstruct_fts_0shot_instruct.py
@@ -0,0 +1,55 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets.smolinstruct import FTSEvaluator
+from opencompass.datasets import SmolInstructDataset
+
+fts_0shot_reader_cfg = dict(
+    input_columns=['input'],
+    output_column='output',
+    train_split='validation')
+
+fts_hint_dict = {  # task key -> instruction text that opens each prompt
+    'MG': """You are an expert chemist. Given the description of a molecule, your task is to generate the potential SMILES representation of the molecule.
+    The input contains the description of the molecule. Your reply should contain the potential SMILES representation of the molecule wrapped in <SMILES> and </SMILES> tags. Your reply must be valid and chemically reasonable.""",
+    'FS': """You are an expert chemist. Given the SMILES representation of reactants and reagents, your task is to predict the potential product using your chemical reaction knowledge.
+    The input contains both reactants and reagents, and different reactants and reagents are separated by ".". Your reply should contain the SMILES representation of the predicted product wrapped in <SMILES> and </SMILES> tags. Your reply must be valid and chemically reasonable.""",
+    'RS': """You are an expert chemist. Given the SMILES representation of the product, your task is to predict the potential reactants and reagents using your chemical reaction knowledge.
+    The input contains the SMILES representation of the product. Your reply should contain the SMILES representation of both reactants and reagents, and all reactants and reagents should be enclosed **together** within a single pair of <SMILES> and </SMILES> tags, separated by ".". Your reply must be valid and chemically reasonable.""",
+}  # NOTE: each hint's second line keeps its 4-space indent inside the string
+
+name_dict = {
+    'MG': 'molecule_generation',
+    'FS': 'forward_synthesis',
+    'RS': 'retrosynthesis'
+}
+
+fts_0shot_instruct_datasets = []
+for _name in name_dict:
+    _hint = fts_hint_dict[_name]
+    fts_0shot_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=f'{_hint}\nQuestion: {{input}}\nAnswer: ',
+            # template=f'<s>[INST] {{input}} [/INST]',
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    fts_0shot_eval_cfg = dict(
+        evaluator=dict(type=FTSEvaluator),
+    )
+
+    fts_0shot_instruct_datasets.append(
+        dict(
+            abbr=f'{_name}-0shot-instruct',
+            type=SmolInstructDataset,
+            path='osunlp/SMolInstruct',
+            name=name_dict[_name],
+            reader_cfg=fts_0shot_reader_cfg,
+            infer_cfg=fts_0shot_infer_cfg,
+            eval_cfg=fts_0shot_eval_cfg,
+        ))
+
+del _name
diff --git a/opencompass/configs/datasets/SmolInstruct/smolinstruct_meteor_0shot_instruct.py b/opencompass/configs/datasets/SmolInstruct/smolinstruct_meteor_0shot_instruct.py
@@ -0,0 +1,49 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets.smolinstruct import MeteorEvaluator
+from opencompass.datasets import SmolInstructDataset
+
+meteor_0shot_reader_cfg = dict(
+    input_columns=['input'],
+    output_column='output',
+    train_split='validation')
+
+meteor_hint_dict = {  # task key -> instruction text that opens each prompt
+    'MC': """You are an expert chemist. Given the SMILES representation of a molecule, your task is to describe the molecule in natural language.
+    The input contains the SMILES representation of the molecule. Your reply should contain a natural language description of the molecule. Your reply must be valid and chemically reasonable.""",
+}  # NOTE: the hint's second line keeps its 4-space indent inside the string
+
+name_dict = {
+    'MC': 'molecule_captioning',
+}
+
+meteor_0shot_instruct_datasets = []
+for _name in name_dict:
+    _hint = meteor_hint_dict[_name]
+    meteor_0shot_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=f'{_hint}\nQuestion: {{input}}\nAnswer: ',
+            # template=f'<s>[INST] {{input}} [/INST]',
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    meteor_0shot_eval_cfg = dict(
+        evaluator=dict(type=MeteorEvaluator),
+    )
+
+    meteor_0shot_instruct_datasets.append(
+        dict(
+            abbr=f'{_name}-0shot-instruct',
+            type=SmolInstructDataset,
+            path='osunlp/SMolInstruct',
+            name=name_dict[_name],
+            reader_cfg=meteor_0shot_reader_cfg,
+            infer_cfg=meteor_0shot_infer_cfg,
+            eval_cfg=meteor_0shot_eval_cfg,
+        ))
+
+del _name
diff --git a/opencompass/configs/datasets/SmolInstruct/smolinstruct_nc_0shot_instruct.py b/opencompass/configs/datasets/SmolInstruct/smolinstruct_nc_0shot_instruct.py
@@ -0,0 +1,63 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets.smolinstruct import NCExactMatchEvaluator, NCElementMatchEvaluator
+from opencompass.datasets import SmolInstructDataset
+from opencompass.utils.text_postprocessors import first_capital_postprocess
+
+nc_0shot_reader_cfg = dict(
+    input_columns=['input'],
+    output_column='output',
+    train_split='validation')
+
+nc_hint_dict = {  # task key -> instruction text that opens each prompt
+    'I2F': """You are an expert chemist. Given the IUPAC representation of compounds, your task is to predict the molecular formula of the compound.
+    The input contains the IUPAC representation of the compound. Your reply should contain only the molecular formula of the compound wrapped in <MOLFORMULA> and </MOLFORMULA> tags and no other text. Your reply must be valid and chemically reasonable.""",
+    'I2S': """You are an expert chemist. Given the IUPAC representation of compounds, your task is to predict the SMILES representation of the compound.
+    The input contains the IUPAC representation of the compound. Your reply should contain only the SMILES representation of the compound wrapped in <SMILES> and </SMILES> tags and no other text. Your reply must be valid and chemically reasonable.""",
+    'S2F': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the molecular formula of the compound.
+    The input contains the SMILES representation of the compound. Your reply should contain only the molecular formula of the compound wrapped in <MOLFORMULA> and </MOLFORMULA> tags and no other text. Your reply must be valid and chemically reasonable.""",
+    'S2I': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the IUPAC representation of the compound.
+    The input contains the SMILES representation of the compound. Your reply should contain only the IUPAC representation of the compound wrapped in <IUPAC> and </IUPAC> tags and no other text. Your reply must be valid and chemically reasonable.""",
+}  # NOTE: each hint's second line keeps its 4-space indent inside the string
+
+name_dict = {
+    'I2F': 'name_conversion-i2f',
+    'I2S': 'name_conversion-i2s',
+    'S2F': 'name_conversion-s2f',
+    'S2I': 'name_conversion-s2i',
+}
+
+nc_0shot_instruct_datasets = []
+for _name in name_dict:
+    _hint = nc_hint_dict[_name]
+    nc_0shot_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=f'{_hint}\nQuestion: {{input}}\nAnswer: ',
+            # template=f'<s>[INST] {{input}} [/INST]',
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer),
+    )
+    if _name in ['I2F', 'S2F']:
+        nc_0shot_eval_cfg = dict(
+            evaluator=dict(type=NCElementMatchEvaluator),
+        )
+    else:
+        nc_0shot_eval_cfg = dict(
+            evaluator=dict(type=NCExactMatchEvaluator),
+        )
+
+    nc_0shot_instruct_datasets.append(
+        dict(
+            abbr=f'NC-{_name}-0shot-instruct',
+            type=SmolInstructDataset,
+            path='osunlp/SMolInstruct',
+            name=name_dict[_name],
+            reader_cfg=nc_0shot_reader_cfg,
+            infer_cfg=nc_0shot_infer_cfg,
+            eval_cfg=nc_0shot_eval_cfg,
+        ))
+
+del _name
diff --git a/opencompass/configs/datasets/SmolInstruct/smolinstruct_pp_acc_0_shot_instruct.py b/opencompass/configs/datasets/SmolInstruct/smolinstruct_pp_acc_0_shot_instruct.py
@@ -0,0 +1,61 @@
+from opencompass.openicl import AccEvaluator
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import SmolInstructDataset
+from opencompass.datasets.smolinstruct import smolinstruct_acc_0shot_postprocess
+
+pp_acc_reader_cfg = dict(
+    input_columns=['input'],
+    output_column='output',
+    train_split='validation')
+
+pp_acc_hint_dict = {  # task key -> instruction text that opens each prompt
+    'BBBP': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether blood-brain barrier permeability (BBBP) is a property of the compound.
+    The input contains the compound. Your reply should only contain Yes or No. Your reply must be valid and chemically reasonable.""",
+    'ClinTox': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether the compound is toxic.
+    The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
+    'HIV': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether the compound serve as an inhibitor of HIV replication.
+    The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
+    'SIDER': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether the compound has any side effects.
+    The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
+}  # NOTE: each hint's second line keeps its 4-space indent inside the string
+
+name_dict = {
+    'BBBP': 'property_prediction-bbbp',
+    'ClinTox': 'property_prediction-clintox',
+    'HIV': 'property_prediction-hiv',
+    'SIDER': 'property_prediction-sider',
+}
+
+pp_acc_datasets_0shot_instruct = []
+for _name in pp_acc_hint_dict:
+    _hint = pp_acc_hint_dict[_name]
+
+    pp_acc_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=f'{_hint}\nQuestion: {{input}}\nAnswer: ',
+            # template=f'<s>[INST] {{input}} [/INST]',
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    pp_acc_eval_cfg = dict(
+        evaluator=dict(type=AccEvaluator),
+        pred_postprocessor=dict(type=smolinstruct_acc_0shot_postprocess)
+    )
+
+    pp_acc_datasets_0shot_instruct.append(
+        dict(
+            abbr=f'PP-{_name}-0shot-instruct',
+            type=SmolInstructDataset,
+            path='osunlp/SMolInstruct',
+            name=name_dict[_name],
+            reader_cfg=pp_acc_reader_cfg,
+            infer_cfg=pp_acc_infer_cfg,
+            eval_cfg=pp_acc_eval_cfg,
+        ))
+
+del _name, _hint
diff --git a/opencompass/configs/datasets/SmolInstruct/smolinstruct_rmse_0shot_instruct.py b/opencompass/configs/datasets/SmolInstruct/smolinstruct_rmse_0shot_instruct.py
@@ -0,0 +1,52 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets.smolinstruct import RMSEEvaluator
+from opencompass.datasets import SmolInstructDataset
+
+pp_rmse_0shot_reader_cfg = dict(
+    input_columns=['input'],
+    output_column='output',
+    train_split='validation')
+
+pp_rmse_hint_dict = {  # task key -> instruction text that opens each prompt
+    'ESOL': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the log solubility of the compound.
+    The input contains the SMILES representation of the compound. Your reply should contain the log solubility of the compound wrapped in \\boxed{}. Your reply must be valid and chemically reasonable.""",
+    'Lipo': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the octanol/water partition coefficient of the compound.
+    The input contains the SMILES representation of the compound. Your reply should contain the octanol/water partition coefficient of the compound wrapped in \\boxed{}. Your reply must be valid and chemically reasonable."""
+}  # NOTE: each hint's second line keeps its 4-space indent inside the string
+
+name_dict = {
+    'ESOL': 'property_prediction-esol',
+    'Lipo': 'property_prediction-lipo'
+}
+
+pp_rmse_0shot_instruct_datasets = []
+for _name in name_dict:
+    _hint = pp_rmse_hint_dict[_name]
+    pp_rmse_0shot_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=f'{_hint}\nQuestion: {{input}}\nAnswer: ',
+            # template=f'<s>[INST] {{input}} [/INST]',
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    pp_rmse_0shot_eval_cfg = dict(
+        evaluator=dict(type=RMSEEvaluator),
+    )
+
+    pp_rmse_0shot_instruct_datasets.append(
+        dict(
+            abbr=f'PP-{_name}-0shot-instruct',
+            type=SmolInstructDataset,
+            path='osunlp/SMolInstruct',
+            name=name_dict[_name],
+            reader_cfg=pp_rmse_0shot_reader_cfg,
+            infer_cfg=pp_rmse_0shot_infer_cfg,
+            eval_cfg=pp_rmse_0shot_eval_cfg,
+        ))
+
+del _name