Skip to content

Commit dfaa500

Browse files
authored
Add multitrial scripts and instructions (#1043)
* Add multitrial scripts * Add multitrial scripts and README * Linter fixes * Nitpick * Add leftover JSON
1 parent a95888a commit dfaa500

File tree

5 files changed

+446
-0
lines changed

5 files changed

+446
-0
lines changed
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Multitrial systematics with MLHEP
2+
3+
## Generate configurations (MLHEP yml databases) for each trial
4+
5+
File: `run-mlhep-fitter-multitrial.py`<br>
6+
Usage: `python run-mlhep-fitter-multitrial.py database_file in_db_dir out_db_dir mlhep_results_dir_pattern`
7+
8+
Arguments:
9+
- `database_file`: filename of the template database without the .yml extension, e.g., `database_ml_parameters_LcToPKPi`
10+
- `in_db_dir`: path to the directory containing the database, e.g., `data/data_run3`
11+
- `out_db_dir`: path to the directory for output multitrial databases, e.g., `multitrial_db`
12+
- `mlhep_results_dir_pattern`: prefix of output directory name for fit results; for each trial, the trial name is appended to the directory name, and the resulting directory name is written under `Run3analysis/{data,mc}/prefix_dir_res` in the database file
13+
14+
Adjust `DIR_PATH` in the script. It is the path to the base directory where you store directories with MLHEP results.
15+
16+
This script needs to be ran only once to generate databases.
17+
18+
Currently, the trials are hardcoded in the Python script. To add or modify a trial, you need to adjust `BASE_TRIALS` variable and the `process_trial` function.
19+
20+
## Get mass fits for each trial
21+
22+
File: `run-mlhep-fitter-multitrial.sh`<br>
23+
Usage: `./run-mlhep-fitter-multitrial.sh`
24+
25+
The `submission/analyzer.yml` config is used.
26+
The script automates running MLHEP for each trial. Mass histograms are copied before each MLHEP invocation, so as only the quick fit steps needs to be activated in `submission/analyzer.yml`
27+
28+
Adjust the variables before the `for` loop.<br>
29+
The script includes also a call to `run-mlhep-fitter-multitrial.py`, which can be commented out. In this case, make sure to pass the same `OUT_DB_DIR`, `DB_PATTERN`, `DIR_PATTERN` values to the two scripts.
30+
31+
Before running, you need to create the directory structure for each MLHEP output. You can, for example, run the `.sh` script with the `cp` lines commented out. Then, MLHEP creates directories for each trial and fails quietly. Next, run the script with `cp` lines uncommented, and you will get the final output.
32+
33+
## Plot multitrial results
34+
35+
Files: `multitrial.py`, `config_multitrial.json`<br>
36+
Usage: `python3 multitrial.py config_multitrial.json`
37+
38+
Adjust the sample `config_multitrial.json` to your needs.
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
{
2+
"file_pattern": "/data8/majak/MLHEP/results-24022025-newtrain-multitrial-prompt*/LHC23pp_pass4/Results/resultsdatatot/yields_LcpKpi_Run3analysis.root",
3+
"_file_pattern": "regex pattern for all multitrial fit files; note the asterisk to match all trial suffixes",
4+
"dir_pattern": "results-24022025-newtrain-multitrial-prompt",
5+
"_dir_pattern": "the base directory prefix from the file pattern above",
6+
"histoname": "hyields0",
7+
"_histoname": "histogram with mass fit",
8+
"sel_histoname": "hchi0",
9+
"_sel_histoname": "histogram for filtering the results",
10+
"selection": "lambda x : x < 5.0",
11+
"_selection": "filter to apply with sel_histoname, e.g., chi < 5",
12+
"pt_bins_min": [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 16],
13+
"pt_bins_max": [2, 3, 4, 5, 6, 7, 8, 10, 12, 16, 24],
14+
"central_trial": "",
15+
"_central_trial": "suffix of the directory with the central trial",
16+
"outdir": "/data8/majak/multitrial",
17+
"_outdir": "output directory",
18+
"outfile": "result-prompt-chi5",
19+
"_outfile": "output file pattern"
20+
}
Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
# pylint: disable=missing-function-docstring, invalid-name
2+
"""
3+
file: multitrial.py
4+
brief: Plot multitrial systematics based on multiple fit trials, one file per trial.
5+
usage: python3 multitrial.py config_multitrial.json
6+
author: Maja Karwowska <[email protected]>, Warsaw University of Technology
7+
"""
8+
import argparse
9+
import glob
10+
import json
11+
import re
12+
import numpy as np
13+
import matplotlib.pyplot as plt
14+
from matplotlib.ticker import MultipleLocator, AutoMinorLocator
15+
16+
from ROOT import ( # pylint: disable=import-error,no-name-in-module
17+
TFile,
18+
gROOT,
19+
)
20+
21+
22+
def plot_text_box(ax, text):
23+
ax.text(0.98, 0.97, text,
24+
horizontalalignment="right", verticalalignment="top",
25+
fontsize=40, va="top", transform=ax.transAxes,
26+
bbox={"edgecolor": "black", "fill": False})
27+
28+
29+
def get_yields(cfg):
30+
filenames = sorted(glob.glob(cfg["file_pattern"]),
31+
key=lambda filename: re.split("/", filename)[-2])
32+
yields = {}
33+
yields_err = {}
34+
trials = {}
35+
chis = {}
36+
for pt_bin_min, pt_bin_max in zip(cfg["pt_bins_min"], cfg["pt_bins_max"]):
37+
yields[f"{pt_bin_min}_{pt_bin_max}"] = []
38+
yields_err[f"{pt_bin_min}_{pt_bin_max}"] = []
39+
trials[f"{pt_bin_min}_{pt_bin_max}"] = []
40+
chis[f"{pt_bin_min}_{pt_bin_max}"] = []
41+
for filename in filenames:
42+
print(f"Reading {filename}")
43+
with TFile.Open(filename) as fin:
44+
hist = fin.Get(cfg["histoname"])
45+
hist_sel = fin.Get(cfg["sel_histoname"])
46+
if hist.ClassName() != "TH1F":
47+
print(f"No hist in {filename}")
48+
if hist_sel.ClassName() != "TH1F":
49+
print(f"No hist sel in {filename}")
50+
dirname = re.split("/", filename)[4] # [-2] for D2H fitter
51+
trial_name = dirname.replace(cfg["dir_pattern"], "")
52+
for ind, (pt_bin_min, pt_bin_max) in enumerate(zip(cfg["pt_bins_min"],
53+
cfg["pt_bins_max"])):
54+
if eval(cfg["selection"])(hist_sel.GetBinContent(ind + 1)) \
55+
and hist.GetBinContent(ind + 1) > 1.0 : # pylint: disable=eval-used
56+
yields[f"{pt_bin_min}_{pt_bin_max}"].append(hist.GetBinContent(ind + 1))
57+
yields_err[f"{pt_bin_min}_{pt_bin_max}"].append(hist.GetBinError(ind + 1))
58+
trials[f"{pt_bin_min}_{pt_bin_max}"].append(trial_name)
59+
chis[f"{pt_bin_min}_{pt_bin_max}"].append(hist_sel.GetBinContent(ind + 1))
60+
else:
61+
print(f"Rejected: {hist_sel.GetBinContent(ind + 1)} {trial_name} "\
62+
f"pt: {pt_bin_min}, {pt_bin_max}")
63+
if hist.GetBinContent(ind + 1) < 1.0:
64+
print("Yield 0")
65+
return yields, yields_err, trials, chis
66+
67+
68+
def prepare_figure(y_label, ticks):
69+
fig = plt.figure(figsize=(20, 15))
70+
ax = plt.subplot(1, 1, 1)
71+
ax.set_xlabel("Trial #", fontsize=20)
72+
ax.set_ylabel(y_label, fontsize=20)
73+
ax.tick_params(which="both", width=2.5, direction="in")
74+
ax.tick_params(which="major", labelsize=20, length=15)
75+
ax.tick_params(which="minor", length=7)
76+
ax.xaxis.set_major_locator(MultipleLocator(ticks))
77+
ax.xaxis.set_minor_locator(AutoMinorLocator(5))
78+
ax.yaxis.set_minor_locator(AutoMinorLocator(5))
79+
return fig, ax
80+
81+
82+
def set_ax_limits(ax, pt_string, values):
83+
ax.margins(0.01, 0.2)
84+
np_values = np.array(values, dtype="float32")
85+
if ax.get_ylim()[1] - ax.get_ylim()[0] > 30.0 * np.std(np_values):
86+
ax.set_ylim(np.mean(np_values) - 10.0 * np.std(np_values),
87+
np.mean(np_values) + 10.0 * np.std(np_values))
88+
print(f"{pt_string} narrowing down the axis to {ax.get_ylim()}")
89+
90+
91+
def plot_trial_line(ax, central_trial_ind):
92+
axis_lim = ax.get_ylim()
93+
y_axis = np.linspace(*axis_lim, 100)
94+
ax.plot([central_trial_ind] * len(y_axis), y_axis, c="m", ls="--", linewidth=4.0)
95+
ax.set_ylim(*axis_lim)
96+
97+
98+
def plot_yields_trials(yields, yields_err, trials, cfg, pt_string, plot_pt_string,
99+
central_trial_ind, central_yield):
100+
fig, ax = prepare_figure("Raw yield", 100)
101+
x_axis = range(len(trials))
102+
ax.errorbar(x_axis, yields, yerr=yields_err,
103+
fmt="o", c="b", elinewidth=2.5, linewidth=4.0)
104+
set_ax_limits(ax, pt_string, yields)
105+
central_line = np.array([central_yield] * len(x_axis), dtype="float32")
106+
ax.plot(x_axis, central_line, c="orange", ls="--", linewidth=4.0)
107+
central_err = np.array([yields_err[central_trial_ind]] * len(x_axis), dtype="float32")
108+
ax.fill_between(x_axis, central_line - central_err, central_line + central_err,
109+
facecolor="orange", edgecolor="none", alpha=0.3)
110+
plot_trial_line(ax, central_trial_ind)
111+
plot_text_box(ax, plot_pt_string)
112+
fig.savefig(f'{cfg["outdir"]}/{cfg["outfile"]}_yields_trials_{pt_string}.png',
113+
bbox_inches='tight')
114+
plt.close()
115+
116+
117+
def plot_chis(chis, cfg, pt_string, plot_pt_string):
118+
fig, ax = prepare_figure("Chi2/ndf", 100)
119+
x_axis = range(len(chis))
120+
ax.scatter(x_axis, chis, c="b", marker="o")
121+
set_ax_limits(ax, pt_string, chis)
122+
plot_text_box(ax, plot_pt_string)
123+
fig.savefig(f'{cfg["outdir"]}/{cfg["outfile"]}_chis_{pt_string}.png',
124+
bbox_inches='tight')
125+
plt.close()
126+
127+
128+
def plot_yields_distr(yields, cfg, pt_string, plot_pt_string, central_trial_ind, central_yield):
129+
plt.figure(figsize=(20, 15))
130+
ax = plt.subplot(1, 1, 1)
131+
ax.set_xlabel("Ratio", fontsize=20)
132+
ax.tick_params(labelsize=20, length=7, width=2.5)
133+
ratios = [yield_ / central_yield for ind, yield_ in enumerate(yields) \
134+
if ind != central_trial_ind]
135+
ax.hist(ratios, color="b", linewidth=4.0)
136+
mean = np.mean(yields)
137+
std_dev = np.std(yields)
138+
diffs = [(yield_ - central_yield) / central_yield \
139+
for yield_ in yields[:central_trial_ind]]
140+
diffs.extend([(yield_ - central_yield) / central_yield \
141+
for yield_ in yields[central_trial_ind+1:]])
142+
rmse = np.sqrt(np.mean(np.array(diffs, dtype="float32")**2))
143+
plot_text_box(ax, f"{plot_pt_string}\n"\
144+
f"mean: {mean:.0f}\n"\
145+
f"std dev: {std_dev:.2f}\n"\
146+
f"RMSE: {rmse:.2f}\n"\
147+
f"#trials: {len(yields)}")
148+
plt.savefig(f'{cfg["outdir"]}/{cfg["outfile"]}_distr_{pt_string}.png', bbox_inches='tight')
149+
plt.close()
150+
151+
152+
def main():
153+
gROOT.SetBatch(True)
154+
155+
parser = argparse.ArgumentParser(description="Arguments to pass")
156+
parser.add_argument("config", help="JSON config file")
157+
args = parser.parse_args()
158+
159+
with open(args.config, encoding="utf8") as fil:
160+
cfg = json.load(fil)
161+
162+
yields, yields_err, trials, chis = get_yields(cfg)
163+
164+
for pt_bin_min, pt_bin_max in zip(cfg["pt_bins_min"], cfg["pt_bins_max"]):
165+
plot_pt_string = f"${pt_bin_min} < p_\\mathrm{{T}}/(\\mathrm{{GeV}}/c) < {pt_bin_max}$"
166+
pt_string = f"{pt_bin_min}_{pt_bin_max}"
167+
168+
try:
169+
central_trial_ind = trials[pt_string].index(cfg["central_trial"])
170+
central_yield = yields[pt_string][central_trial_ind]
171+
172+
plot_yields_trials(yields[pt_string], yields_err[pt_string], trials[pt_string], cfg,
173+
pt_string, plot_pt_string, central_trial_ind, central_yield)
174+
plot_yields_distr(yields[pt_string], cfg, pt_string, plot_pt_string,
175+
central_trial_ind, central_yield)
176+
plot_chis(chis[pt_string], cfg, pt_string, plot_pt_string)
177+
except: # pylint: disable=bare-except
178+
pass
179+
180+
with open(f'{cfg["outdir"]}/{cfg["outfile"]}_trials_{pt_string}.txt',
181+
"w", encoding="utf-8") as ftext:
182+
for trial in trials[pt_string]:
183+
ftext.write(f"{trial}\n")
184+
185+
186+
if __name__ == "__main__":
187+
main()

0 commit comments

Comments
 (0)