Skip to content

Commit 67508e9

Browse files
pnfsmanager: introduce limit on number of concurrent listings of the same directory
Motivation ---------- When many jobs run listings of large directories in parallel, thread starvation occurs: the list queue gets filled, the message queue gets filled and PnfsManager becomes unresponsive. Modification: ------------- Before dispatching a list request to a queue, count how many list requests for the same directory are already in the list queue. Add property: pnfsmanager.limits.number-of-concurrent-dir-listings = infinity that controls the number of list requests for the same directory in the PnfsManager list queue. If this number is exceeded, PnfsManager refuses to serve the request. Result: ------ PnfsManager is more resilient against a flood of concurrent list requests. Patch: https://rb.dcache.org/r/14098/ Acked-by: Albert Rossi Target: trunk Request: 9.2 Require-notes: yes
1 parent c1e3728 commit 67508e9

File tree

4 files changed

+48
-12
lines changed

4 files changed

+48
-12
lines changed

modules/dcache-chimera/src/main/resources/diskCacheV111/namespace/pnfsmanager-chimera.xml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@
3636
<property name="logSlowThreshold" value="${pnfsmanager.limits.log-slow-threshold}"/>
3737
<property name="folding" value="${pnfsmanager.enable.folding}"/>
3838
<property name="useParallelListing" value="${pnfsmanager.enable.parallel-listing}"/>
39+
<property name="maxListRequestsInQueue"
40+
value="#{T(org.dcache.util.Strings).parseInt('${pnfsmanager.limits.number-of-concurrent-dir-listings}')}"/>
3941
<property name="directoryListLimit" value="${pnfsmanager.limits.list-chunk-size}"/>
4042
<property name="permissionHandler" ref="permission-handler"/>
4143
<property name="queueMaxSize" value="${pnfsmanager.limits.queue-length}"/>

modules/dcache/src/main/java/diskCacheV111/namespace/PnfsManagerV3.java

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,7 @@ public class PnfsManagerV3
227227

228228
private boolean useParentHashOnCreate;
229229
private boolean useParallelListing;
230+
private int maxListRequestsInQueue;
230231

231232
/**
232233
* Whether to use folding.
@@ -343,6 +344,11 @@ public void setUseParallelListing(boolean useParallelListing) {
343344
this.useParallelListing = useParallelListing;
344345
}
345346

347+
@Required
348+
public void setMaxListRequestsInQueue(int maxListRequestsInQueue) {
349+
this.maxListRequestsInQueue = maxListRequestsInQueue;
350+
}
351+
346352
@Required
347353
public void setScheduledExecutor(ScheduledExecutorService executor) {
348354
scheduledExecutor = executor;
@@ -2810,21 +2816,40 @@ public void messageArrived(CellMessage envelope, PnfsListDirectoryMessage messag
28102816
throws CacheException {
28112817

28122818
String path = message.getPnfsPath();
2819+
28132820
if (path == null) {
28142821
throw new InvalidMessageCacheException("Missing PNFS id and path");
28152822
}
28162823

2824+
/**
2825+
* when useParallelListing is true, we only have 1 queue in the
2826+
* list of queues below
2827+
*/
28172828
int index = 0;
28182829

28192830
if (!useParallelListing) {
28202831
index = (int)(Math.abs((long)Objects.hashCode(path.toString())) % _listThreads);
28212832
}
2833+
BlockingQueue<CellMessage> queue = _listQueues[index];
28222834

2823-
/**
2824-
* when useParallelListing is true, we only have 1 queue in the
2825-
* list of queues below
2826-
*/
2827-
if (!_listQueues[index].offer(envelope)) {
2835+
/**
2836+
* Do counts only if maxListRequestsInQueue is enabled
2837+
*/
2838+
if (maxListRequestsInQueue < Integer.MAX_VALUE) {
2839+
int counter = 0;
2840+
for (CellMessage i : queue) {
2841+
PnfsListDirectoryMessage msg = (PnfsListDirectoryMessage)i.getMessageObject();
2842+
if (msg.getPnfsPath().equals(path)) {
2843+
if (counter > maxListRequestsInQueue) {
2844+
LOGGER.warn("Too many list requests for the same directory {} in PnfsManager queue", path);
2845+
throw new MissingResourceCacheException("Too many list requests for the same directory in PndsManager queue");
2846+
}
2847+
counter += 1;
2848+
}
2849+
}
2850+
}
2851+
2852+
if (!queue.offer(envelope)) {
28282853
throw new MissingResourceCacheException("PnfsManager queue limit exceeded");
28292854
}
28302855
}

skel/share/defaults/pnfsmanager.properties

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -75,14 +75,14 @@ pnfsmanager.limits.threads = ${pnfsmanager.limits.threads-per-group}
7575
# ---- Thread dispatch mechanisms
7676
#
7777
# Experimental feature. Normally message processing is dispatched
78-
# to the same thread in the thread pool associated with pnfsid (or path)
79-
# of the namespace entry contained in the messagee. On massive
80-
# uploads (create entries) to a single directory we observed
78+
# to the same thread in the thread pool associated with pnfsid (or path)
79+
# of the namespace entry contained in the message. On massive
80+
# uploads (create entries) to a single directory we observed
8181
# performance degradation caused by underlying db back-end
8282
# synchronization when updating mtime and link count of the target
83-
# directory. This leads to all available threads being busy/hanging
84-
# processing create entry messages denying other users from
85-
# accessing the namespace. The switch below, if enabled, would cause
83+
# directory. This leads to all available threads being busy/hanging
84+
# processing create entry messages denying other users from
85+
# accessing the namespace. The switch below, if enabled, would cause
8686
# the create messages to be dispatched to a thread associated
8787
# with that entry's parent (that is the target directory).
8888
#
@@ -126,7 +126,15 @@ pnfsmanager.limits.list-chunk-size = 100
126126

127127
(one-of?true|false)pnfsmanager.enable.parallel-listing = false
128128

129-
129+
# ---- Determines how many simultaneous same-directory listings to allow
130+
#
131+
# If the number of simultaneous listings of the same directory
132+
# reaches the limit defined below, the PnfsManager will reject
133+
# new listing requests for that directory until the number of already
134+
# queued and active listing requests for that directory drops
135+
# below this number.
136+
#
137+
pnfsmanager.limits.number-of-concurrent-dir-listings = infinity
130138

131139
# ---- Threshold for when to log slow requests
132140
#

skel/share/services/pnfsmanager.batch

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ check -strong pnfsmanager.enable.acl
1010
check -strong pnfsmanager.default-retention-policy
1111
check -strong pnfsmanager.default-access-latency
1212
check -strong pnfsmanager.enable.parallel-listing
13+
check -strong pnfsmanager.limits.number-of-concurrent-dir-listings
1314
check pnfsmanager.destination.flush-notification
1415
check pnfsmanager.destination.cache-notification
1516
check pnfsmanager.destination.cancel-upload-notification

0 commit comments

Comments
 (0)