Skip to content

Commit a651cf0

Browse files
committed
Adding guided regex decoder
1 parent 748e2ae commit a651cf0

File tree

1 file changed

+5 -0 lines changed

1 file changed

+5
-0
lines changed

olmocr/pipeline.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -236,6 +236,10 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path:
236236
# Change temperature as number of attempts increases to overcome repetition issues at expense of quality
237237
query["temperature"] = TEMPERATURE_BY_ATTEMPT[lookup_attempt]
238238

239+
# Enable guided decoding regex if needed
240+
if args.guided_decoding:
241+
query["guided_regex"] = r"---\nprimary_language: .*\nis_rotation_valid: .*\nrotation_correction: .*\nis_table: .*\nis_diagram: .*\n---\n[\s\S]*"
242+
239243
logger.info(f"Built page query for {pdf_orig_path}-{page_num}")
240244

241245
try:
@@ -1022,6 +1026,7 @@ async def main():
10221026
parser.add_argument("--model_max_context", type=int, default="8192", help="Maximum context length that the model was fine tuned under")
10231027
parser.add_argument("--target_longest_image_dim", type=int, help="Dimension on longest side to use for rendering the pdf pages", default=1024)
10241028
parser.add_argument("--target_anchor_text_len", type=int, help="Maximum amount of anchor text to use (characters)", default=3000)
1029+
parser.add_argument("--guided_decoding", action="store_true", help="Enable guided decoding for model YAML type outputs")
10251030

10261031
# Beaker/job running stuff
10271032
parser.add_argument("--beaker", action="store_true", help="Submit this job to beaker instead of running locally")

0 commit comments

Comments
 (0)