@@ -182,6 +182,10 @@ class BrokenCrawlerQueue(Exception):
182
182
# because its #.get method can block, but will raise Empty when drained.
183
183
# This is fine with `quenouille` if we don't forget to call #.unblock
184
184
# on panic.
185
+ # NOTE: sqlite needs to serialize writes, which means it is not particularly
186
+ # useful in our case to allow for concurrent access to the queue because most
187
+ # of its operations need to update the database somehow. This also means we
188
+ # can rely on a single lock to make multithreaded transactions safe.
185
189
class CrawlerQueue :
186
190
# Params
187
191
persistent : bool
@@ -192,15 +196,13 @@ class CrawlerQueue:
192
196
193
197
# State
194
198
tasks : Dict [str , int ]
195
- put_connection : sqlite3 .Connection
196
- task_connection : sqlite3 .Connection
199
+ connection : sqlite3 .Connection
197
200
counter : int
198
201
cleanup_interval : int
199
202
vacuum_interval : int
200
203
waiter : Condition
201
204
currently_waiting_count : int
202
- put_lock : Lock
203
- task_lock : Lock
205
+ transaction_lock : Lock
204
206
broken : bool
205
207
206
208
def __init__ (
@@ -257,29 +259,22 @@ def __init__(
257
259
258
260
self .waiter = Condition ()
259
261
self .currently_waiting_count = 0
260
- self .put_lock = Lock ()
261
- self .task_lock = Lock ()
262
+ self .transaction_lock = Lock ()
262
263
self .broken = False
263
264
264
- # NOTE: we need two connection if we are to allow concurrent
265
- # put and task acquisition.
266
- self .put_connection = sqlite3 .connect (full_path , check_same_thread = False )
267
- self .task_connection = sqlite3 .connect (full_path , check_same_thread = False )
265
+ self .connection = sqlite3 .connect (full_path , check_same_thread = False )
268
266
269
267
# NOTE: it's seems it is safer and common practice to
270
268
# reexecute pragmas each time because they might not
271
269
# be stored persistently in some instances.
272
- self .put_connection .executescript (SQL_PRAGMAS )
273
- self .put_connection .commit ()
274
-
275
- self .task_connection .executescript (SQL_PRAGMAS )
276
- self .task_connection .commit ()
270
+ self .connection .executescript (SQL_PRAGMAS )
271
+ self .connection .commit ()
277
272
278
273
if inspect :
279
274
return
280
275
281
276
# Setup
282
- with self .global_transaction () as cursor :
277
+ with self .transaction () as cursor :
283
278
if not self .resuming :
284
279
cursor .executescript (SQL_CREATE )
285
280
else :
@@ -302,33 +297,11 @@ def __init__(
302
297
cursor .execute ("VACUUM;" )
303
298
304
299
@contextmanager
305
- def put_transaction (self ):
306
- cursor = None
307
- try :
308
- with self .put_lock , self .put_connection :
309
- cursor = self .put_connection .cursor ()
310
- yield cursor
311
- finally :
312
- if cursor is not None :
313
- cursor .close ()
314
-
315
- @contextmanager
316
- def task_transaction (self ):
317
- cursor = None
318
- try :
319
- with self .task_lock , self .task_connection :
320
- cursor = self .task_connection .cursor ()
321
- yield cursor
322
- finally :
323
- if cursor is not None :
324
- cursor .close ()
325
-
326
- @contextmanager
327
- def global_transaction (self ):
300
+ def transaction (self ):
328
301
cursor = None
329
302
try :
330
- with self .put_lock , self .task_lock , self . task_connection :
331
- cursor = self .task_connection .cursor ()
303
+ with self .transaction_lock , self .connection :
304
+ cursor = self .connection .cursor ()
332
305
yield cursor
333
306
finally :
334
307
if cursor is not None :
@@ -340,7 +313,7 @@ def explain_query_plan(self, sql: str) -> str:
340
313
341
314
sql = sql .replace ("?" , "1" )
342
315
343
- with self .global_transaction () as cursor :
316
+ with self .transaction () as cursor :
344
317
cursor .execute ("EXPLAIN QUERY PLAN %s" % sql )
345
318
346
319
return "\n " .join (row [3 ] for row in iterate_over_sqlite_cursor (cursor ))
@@ -350,14 +323,14 @@ def __count(self, cursor: sqlite3.Cursor) -> int:
350
323
return cursor .fetchone ()[0 ]
351
324
352
325
def qsize (self ) -> int :
353
- with self .global_transaction () as cursor :
326
+ with self .transaction () as cursor :
354
327
return self .__count (cursor )
355
328
356
329
def __len__ (self ) -> int :
357
330
return self .qsize ()
358
331
359
332
def put_many (self , jobs : Iterable [CrawlJob ]) -> int :
360
- with self .put_transaction () as cursor :
333
+ with self .transaction () as cursor :
361
334
rows = []
362
335
363
336
for job in jobs :
@@ -422,7 +395,7 @@ def get(self, block=True) -> CrawlJob:
422
395
if self .broken :
423
396
raise BrokenCrawlerQueue
424
397
425
- with self .task_transaction () as cursor :
398
+ with self .transaction () as cursor :
426
399
cursor .execute (
427
400
SQL_GET_JOB % ("ASC" if not self .is_lifo else "DESC" ),
428
401
(now (),),
@@ -497,7 +470,7 @@ def unblock(self) -> None:
497
470
def worked_groups (self ) -> Dict [str , Tuple [int , int ]]:
498
471
g = {}
499
472
500
- with self .global_transaction () as cursor :
473
+ with self .transaction () as cursor :
501
474
cursor .execute (
502
475
'SELECT "group", "count", "allowed" FROM "parallelism" WHERE "count" > 0;'
503
476
)
@@ -508,7 +481,7 @@ def worked_groups(self) -> Dict[str, Tuple[int, int]]:
508
481
return g
509
482
510
483
def dump (self ) -> Iterator [CrawlerQueueRecord ]:
511
- with self .global_transaction () as cursor :
484
+ with self .transaction () as cursor :
512
485
cursor .execute (SQL_DUMP )
513
486
514
487
for row in iterate_over_sqlite_cursor (cursor ):
@@ -554,11 +527,11 @@ def __vacuum_and_analyze(self, cursor: sqlite3.Cursor) -> None:
554
527
cursor .connection .commit ()
555
528
556
529
def cleanup (self ) -> None :
557
- with self .global_transaction () as cursor :
530
+ with self .transaction () as cursor :
558
531
self .__cleanup (cursor )
559
532
560
533
def clear (self ) -> None :
561
- with self .global_transaction () as cursor :
534
+ with self .transaction () as cursor :
562
535
self .__clear (cursor )
563
536
564
537
# NOTE: there is a subtle difference between #.release_group
@@ -570,7 +543,7 @@ def clear(self) -> None:
570
543
# task. This is important to ensure atomicity as well as possible.
571
544
# Without creating backpressure on the queue's constraints.
572
545
def release_group (self , job : CrawlJob ) -> None :
573
- with self .task_transaction () as cursor :
546
+ with self .transaction () as cursor :
574
547
if job .id not in self .tasks :
575
548
raise RuntimeError ("job is not being worked" )
576
549
@@ -601,7 +574,7 @@ def group_releaser(self, job: CrawlJob):
601
574
self .release_group (job )
602
575
603
576
def task_done (self , job : CrawlJob ) -> None :
604
- with self .task_transaction () as cursor :
577
+ with self .transaction () as cursor :
605
578
index = self .tasks .pop (job .id , None )
606
579
607
580
if index is None :
@@ -620,8 +593,7 @@ def task_done(self, job: CrawlJob) -> None:
620
593
self .__vacuum_and_analyze (cursor )
621
594
622
595
def close (self ) -> None :
623
- self .put_connection .close ()
624
- self .task_connection .close ()
596
+ self .connection .close ()
625
597
626
598
def __del__ (self ) -> None :
627
599
self .close ()
0 commit comments