6
6
from typing import Any , Collection , Dict , List , Optional , Set , Type , Union
7
7
8
8
from sqlalchemy import inspect
9
- from sqlalchemy .orm import Session
9
+ from sqlalchemy .orm import Session , scoped_session , sessionmaker
10
10
11
11
from fonduer .meta import Meta , new_sessionmaker
12
12
from fonduer .parser .models .document import Document
@@ -94,7 +94,7 @@ def _after_apply(self, **kwargs: Any) -> None:
94
94
"""Execute this method by a single process after apply."""
95
95
pass
96
96
97
- def _add (self , instance : Any ) -> None :
97
+ def _add (self , session : Session , instance : Any ) -> None :
98
98
pass
99
99
100
100
def _apply (
@@ -114,9 +114,13 @@ def _apply(
114
114
# Clear the last documents parsed by the last run
115
115
self .last_docs = set ()
116
116
117
+ # Create a DB session factory that each UDF uses to insert data (#545)
118
+ session_factory = new_sessionmaker ()
117
119
# Create UDF Processes
118
120
for i in range (parallelism ):
119
121
udf = self .udf_class (
122
+ session_factory = session_factory ,
123
+ runner = self ,
120
124
in_queue = in_queue ,
121
125
out_queue = out_queue ,
122
126
worker_id = i ,
@@ -164,8 +168,6 @@ def in_thread_func() -> None:
164
168
# Flush the processes
165
169
self .udfs = []
166
170
167
- self .session .commit ()
168
-
169
171
170
172
class UDF (Process ):
171
173
"""UDF class."""
@@ -174,6 +176,8 @@ class UDF(Process):
174
176
175
177
def __init__ (
176
178
self ,
179
+ session_factory : sessionmaker = None ,
180
+ runner : UDFRunner = None ,
177
181
in_queue : Optional [Queue ] = None ,
178
182
out_queue : Optional [Queue ] = None ,
179
183
worker_id : int = 0 ,
@@ -187,6 +191,8 @@ def __init__(
187
191
"""
188
192
super ().__init__ ()
189
193
self .daemon = True
194
+ self .session_factory = session_factory
195
+ self .runner = runner
190
196
self .in_queue = in_queue
191
197
self .out_queue = out_queue
192
198
self .worker_id = worker_id
@@ -201,9 +207,9 @@ def run(self) -> None:
201
207
multiprocess setting The basic routine is: get from JoinableQueue,
202
208
apply, put / add outputs, loop
203
209
"""
204
- # Each UDF starts its own Engine
205
- # See SQLalchemy, using connection pools with multiprocessing.
206
- Session = new_sessionmaker ( )
210
+ # Each UDF gets a thread-local (scoped) session from the connection pool
211
+ # See SQLAlchemy, using scoped session with multiprocessing.
212
+ Session = scoped_session ( self . session_factory )
207
213
session = Session ()
208
214
while True :
209
215
doc = self .in_queue .get () # block until an item is available
@@ -214,12 +220,11 @@ def run(self) -> None:
214
220
if not inspect (doc ).transient :
215
221
doc = session .merge (doc , load = False )
216
222
y = self .apply (doc , ** self .apply_kwargs )
217
- if y :
218
- session .add (y )
219
- session .commit ()
223
+ self .runner ._add (session , y )
220
224
self .out_queue .put (doc .name )
221
225
session .commit ()
222
226
session .close ()
227
+ Session .remove ()
223
228
224
229
def apply (
225
230
self , doc : Document , ** kwargs : Any
0 commit comments