Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.SmolInstruct.smolinstruct_nc_0shot_instruct import nc_0shot_instruct_datasets
    from opencompass.configs.datasets.SmolInstruct.smolinstruct_pp_acc_0_shot_instruct import pp_acc_datasets_0shot_instruct
    from opencompass.configs.datasets.SmolInstruct.smolinstruct_rmse_0shot_instruct import pp_rmse_0shot_instruct_datasets
    from opencompass.configs.datasets.SmolInstruct.smolinstruct_fts_0shot_instruct import fts_0shot_instruct_datasets
    from opencompass.configs.datasets.SmolInstruct.smolinstruct_meteor_0shot_instruct import meteor_0shot_instruct_datasets

# Single list holding every SmolInstruct 0-shot instruct dataset config.
# The concatenation order below is the order of the entries in the result.
smolinstruct_datasets_0shot_instruct = (
    nc_0shot_instruct_datasets
    + pp_rmse_0shot_instruct_datasets
    + pp_acc_datasets_0shot_instruct
    + meteor_0shot_instruct_datasets
    + fts_0shot_instruct_datasets
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.smolinstruct import FTSEvaluator
from opencompass.datasets import SmolInstructDataset

# Reader settings shared by all tasks in this file: the `input` column feeds
# the prompt, `output` is the output column, and the `validation` split is
# used as the train split.
fts_0shot_reader_cfg = dict(
    input_columns=['input'],
    output_column='output',
    train_split='validation',
)

# Instruction text placed in front of each question, keyed by task abbreviation.
fts_hint_dict = {
    'MG': """You are an expert chemist. Given the description of a molecule, your task is to generate the potential SMILES representation of the molecule.
The input contains the description of the molecule. Your reply should contain the potential SMILES representation of the molecule wrapped in <SMILES> and </SMILES> tags. Your reply must be valid and chemically reasonable.""",
    'FS': """You are an expert chemist. Given the SMILES representation of reactants and reagents, your task is to predict the potential product using your chemical reaction knowledge.
The input contains both reactants and reagents, and different reactants and reagents are separated by ".". Your reply should contain the SMILES representation of the predicted product wrapped in <SMILES> and </SMILES> tags. Your reply must be valid and chemically reasonable.""",
    'RS': """You are an expert chemist. Given the SMILES representation of the product, your task is to predict the potential reactants and reagents using your chemical reaction knowledge.
The input contains the SMILES representation of the product. Your reply should contain the SMILES representation of both reactants and reagents, and all reactants and reagents should be enclosed **together** within a single pair of <SMILES> and </SMILES> tags, separated by ".". Your reply must be valid and chemically reasonable.""",
}

# Task abbreviation -> value passed as `name` to SmolInstructDataset.
name_dict = {
    'MG': 'molecule_generation',
    'FS': 'forward_synthesis',
    'RS': 'retrosynthesis',
}

# One dataset config per task in `name_dict`, in dict order.
fts_0shot_instruct_datasets = []
for _name in name_dict:
    _hint = fts_hint_dict[_name]

    # Zero-shot generation: task hint, then the question, then an open answer.
    fts_0shot_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=f'{_hint}\nQuestion: {{input}}\nAnswer: ',
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )

    fts_0shot_eval_cfg = dict(
        evaluator=dict(type=FTSEvaluator),
    )

    fts_0shot_instruct_datasets.append(
        dict(
            abbr=f'{_name}-0shot-instruct',
            type=SmolInstructDataset,
            path='osunlp/SMolInstruct',
            name=name_dict[_name],
            reader_cfg=fts_0shot_reader_cfg,
            infer_cfg=fts_0shot_infer_cfg,
            eval_cfg=fts_0shot_eval_cfg,
        ))

# Remove both loop temporaries so neither is left behind as a module-level
# name (previously only `_name` was deleted and `_hint` remained).
del _name, _hint
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.smolinstruct import MeteorEvaluator
from opencompass.datasets import SmolInstructDataset

# Reader settings for the tasks in this file: the `input` column feeds the
# prompt, `output` is the output column, and the `validation` split is used
# as the train split.
meteor_0shot_reader_cfg = dict(
    input_columns=['input'],
    output_column='output',
    train_split='validation',
)

# Instruction text placed in front of each question, keyed by task abbreviation.
meteor_hint_dict = {
    'MC': """You are an expert chemist. Given the SMILES representation of a molecule, your task is to describe the molecule in natural language.
The input contains the SMILES representation of the molecule. Your reply should contain a natural language description of the molecule. Your reply must be valid and chemically reasonable.""",
}

# Task abbreviation -> value passed as `name` to SmolInstructDataset.
name_dict = {
    'MC': 'molecule_captioning',
}

# One dataset config per task in `name_dict`, in dict order.
meteor_0shot_instruct_datasets = []
for _name in name_dict:
    _hint = meteor_hint_dict[_name]

    # Zero-shot generation: task hint, then the question, then an open answer.
    meteor_0shot_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=f'{_hint}\nQuestion: {{input}}\nAnswer: ',
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )

    meteor_0shot_eval_cfg = dict(
        evaluator=dict(type=MeteorEvaluator),
    )

    meteor_0shot_instruct_datasets.append(
        dict(
            abbr=f'{_name}-0shot-instruct',
            type=SmolInstructDataset,
            path='osunlp/SMolInstruct',
            name=name_dict[_name],
            reader_cfg=meteor_0shot_reader_cfg,
            infer_cfg=meteor_0shot_infer_cfg,
            eval_cfg=meteor_0shot_eval_cfg,
        ))

# Remove both loop temporaries so neither is left behind as a module-level
# name (previously only `_name` was deleted and `_hint` remained).
del _name, _hint
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.smolinstruct import NCExactMatchEvaluator, NCElementMatchEvaluator
from opencompass.datasets import SmolInstructDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess

# Reader settings shared by all name-conversion tasks: the `input` column
# feeds the prompt, `output` is the output column, and the `validation` split
# is used as the train split.
nc_0shot_reader_cfg = dict(
    input_columns=['input'],
    output_column='output',
    train_split='validation',
)

# Instruction text placed in front of each question, keyed by task abbreviation.
nc_hint_dict = {
    'I2F': """You are an expert chemist. Given the IUPAC representation of compounds, your task is to predict the molecular formula of the compound.
The input contains the IUPAC representation of the compound. Your reply should contain only the molecular formula of the compound wrapped in <MOLFORMULA> and </MOLFORMULA> tags and no other text. Your reply must be valid and chemically reasonable.""",
    'I2S': """You are an expert chemist. Given the IUPAC representation of compounds, your task is to predict the SMILES representation of the compound.
The input contains the IUPAC representation of the compound. Your reply should contain only the SMILES representation of the compound wrapped in <SMILES> and </SMILES> tags and no other text. Your reply must be valid and chemically reasonable.""",
    'S2F': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the molecular formula of the compound.
The input contains the SMILES representation of the compound. Your reply should contain only the molecular formula of the compound wrapped in <MOLFORMULA> and </MOLFORMULA> tags and no other text. Your reply must be valid and chemically reasonable.""",
    'S2I': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the IUPAC representation of the compound.
The input contains the SMILES representation of the compound. Your reply should contain only the IUPAC representation of the compound wrapped in <IUPAC> and </IUPAC> tags and no other text. Your reply must be valid and chemically reasonable.""",
}

# Task abbreviation -> value passed as `name` to SmolInstructDataset.
name_dict = {
    'I2F': 'name_conversion-i2f',
    'I2S': 'name_conversion-i2s',
    'S2F': 'name_conversion-s2f',
    'S2I': 'name_conversion-s2i',
}

# One dataset config per task in `name_dict`, in dict order.
nc_0shot_instruct_datasets = []
for _name in name_dict:
    _hint = nc_hint_dict[_name]

    # Zero-shot generation: task hint, then the question, then an open answer.
    nc_0shot_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=f'{_hint}\nQuestion: {{input}}\nAnswer: ',
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )

    # The two tasks whose prompt asks for a molecular formula (I2F, S2F) use
    # NCElementMatchEvaluator; the remaining tasks use NCExactMatchEvaluator.
    nc_0shot_eval_cfg = dict(
        evaluator=dict(
            type=(NCElementMatchEvaluator
                  if _name in ('I2F', 'S2F') else NCExactMatchEvaluator),
        ),
    )

    nc_0shot_instruct_datasets.append(
        dict(
            abbr=f'NC-{_name}-0shot-instruct',
            type=SmolInstructDataset,
            path='osunlp/SMolInstruct',
            name=name_dict[_name],
            reader_cfg=nc_0shot_reader_cfg,
            infer_cfg=nc_0shot_infer_cfg,
            eval_cfg=nc_0shot_eval_cfg,
        ))

# Remove both loop temporaries so neither is left behind as a module-level
# name (previously only `_name` was deleted and `_hint` remained).
del _name, _hint
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from opencompass.openicl import AccEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import SmolInstructDataset
from opencompass.datasets.smolinstruct import smolinstruct_acc_0shot_postprocess

# Reader settings shared by all Yes/No property-prediction tasks: the `input`
# column feeds the prompt, `output` is the output column, and the
# `validation` split is used as the train split.
pp_acc_reader_cfg = dict(
    input_columns=['input'],
    output_column='output',
    train_split='validation',
)

# Instruction text placed in front of each question, keyed by task abbreviation.
pp_acc_hint_dict = {
    'BBBP': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether blood-brain barrier permeability (BBBP) is a property of the compound.
The input contains the compound. Your reply should only contain Yes or No. Your reply must be valid and chemically reasonable.""",
    'ClinTox': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether the compound is toxic.
The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
    'HIV': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether the compound serve as an inhibitor of HIV replication.
The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
    'SIDER': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether the compound has any side effects.
The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
}

# Task abbreviation -> value passed as `name` to SmolInstructDataset.
name_dict = {
    'BBBP': 'property_prediction-bbbp',
    'ClinTox': 'property_prediction-clintox',
    'HIV': 'property_prediction-hiv',
    'SIDER': 'property_prediction-sider',
}

# One dataset config per entry of `pp_acc_hint_dict`, in dict order.
pp_acc_datasets_0shot_instruct = []
for _name, _hint in pp_acc_hint_dict.items():
    # Zero-shot generation: task hint, then the question, then an open answer.
    pp_acc_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=f'{_hint}\nQuestion: {{input}}\nAnswer: ',
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )

    # Predictions go through smolinstruct_acc_0shot_postprocess before being
    # handed to AccEvaluator.
    pp_acc_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(type=smolinstruct_acc_0shot_postprocess),
    )

    pp_acc_datasets_0shot_instruct.append(
        dict(
            abbr=f'PP-{_name}-0shot-instruct',
            type=SmolInstructDataset,
            path='osunlp/SMolInstruct',
            name=name_dict[_name],
            reader_cfg=pp_acc_reader_cfg,
            infer_cfg=pp_acc_infer_cfg,
            eval_cfg=pp_acc_eval_cfg,
        ))

# Remove the loop temporaries so they are not left as module-level names.
del _name, _hint
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.smolinstruct import RMSEEvaluator
from opencompass.datasets import SmolInstructDataset

# Reader settings shared by the numeric property-prediction tasks: the
# `input` column feeds the prompt, `output` is the output column, and the
# `validation` split is used as the train split.
pp_rmse_0shot_reader_cfg = dict(
    input_columns=['input'],
    output_column='output',
    train_split='validation',
)

# Instruction text placed in front of each question, keyed by task abbreviation.
pp_rmse_hint_dict = {
    'ESOL': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the log solubility of the compound.
The input contains the SMILES representation of the compound. Your reply should contain the log solubility of the compound wrapped in \\boxed{}. Your reply must be valid and chemically reasonable.""",
    'Lipo': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the octanol/water partition coefficient of the compound.
The input contains the SMILES representation of the compound. Your reply should contain the octanol/water partition coefficient of the compound wrapped in \\boxed{}. Your reply must be valid and chemically reasonable.""",
}

# Task abbreviation -> value passed as `name` to SmolInstructDataset.
name_dict = {
    'ESOL': 'property_prediction-esol',
    'Lipo': 'property_prediction-lipo',
}

# One dataset config per task in `name_dict`, in dict order.
pp_rmse_0shot_instruct_datasets = []
for _name in name_dict:
    _hint = pp_rmse_hint_dict[_name]

    # Zero-shot generation: task hint, then the question, then an open answer.
    pp_rmse_0shot_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=f'{_hint}\nQuestion: {{input}}\nAnswer: ',
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )

    pp_rmse_0shot_eval_cfg = dict(
        evaluator=dict(type=RMSEEvaluator),
    )

    pp_rmse_0shot_instruct_datasets.append(
        dict(
            abbr=f'PP-{_name}-0shot-instruct',
            type=SmolInstructDataset,
            path='osunlp/SMolInstruct',
            name=name_dict[_name],
            reader_cfg=pp_rmse_0shot_reader_cfg,
            infer_cfg=pp_rmse_0shot_infer_cfg,
            eval_cfg=pp_rmse_0shot_eval_cfg,
        ))

# Remove both loop temporaries so neither is left behind as a module-level
# name (previously only `_name` was deleted and `_hint` remained).
del _name, _hint
Loading