Skip to content

Commit 1b6ce5e

Browse files
authored
Merge pull request cyrusimap#5123 from elliefm/v311/master-cleanup-after-crashed-service
clean up proc entries for crashed services
2 parents e101315 + c722772 commit 1b6ce5e

File tree

4 files changed

+79
-1
lines changed

4 files changed

+79
-1
lines changed

cassandane/Cassandane/Cyrus/Info.pm

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ use warnings;
4343
use Cwd qw(realpath);
4444
use Data::Dumper;
4545
use Date::Format qw(time2str);
46+
use Time::HiRes qw(usleep);
4647

4748
use lib '.';
4849
use base qw(Cassandane::Cyrus::TestCase);
@@ -296,6 +297,63 @@ sub test_proc_services
296297
}
297298
}
298299

300+
sub test_proc_crashed_services
301+
{
302+
my ($self) = @_;
303+
304+
# no clients => no service daemons => no processes
305+
my @output = $self->{instance}->run_cyr_info('proc');
306+
$self->assert_num_equals(0, scalar @output);
307+
308+
# master spawns service processes when clients connect to them
309+
my $imap_svc = $self->{instance}->get_service('imap');
310+
my @clients;
311+
foreach (1..5) {
312+
# five concurrent connections for a single user is normal,
313+
# e.g. thunderbird does this
314+
my $store = $imap_svc->create_store(username => 'cassandane');
315+
my $imaptalk = $store->get_client();
316+
push @clients, $imaptalk if $imaptalk;
317+
}
318+
319+
# better have got some clients from that!
320+
$self->assert_num_gte(1, scalar @clients);
321+
322+
# five clients => five service daemons => five processes
323+
@output = $self->{instance}->run_cyr_info('proc');
324+
$self->assert_num_equals(scalar @clients, scalar @output);
325+
326+
my @pids = sort map { (split /\s+/, $_, 2)[0] } @output;
327+
$self->assert_num_equals(scalar @clients, scalar @pids);
328+
329+
# crash service processes one at a time, expect proc count to decrease
330+
while (scalar @pids) {
331+
my $pid = shift @pids;
332+
kill 'SEGV', $pid;
333+
usleep 250_000;
334+
335+
my @cores = $self->{instance}->find_cores();
336+
if (@cores) {
337+
# if we dumped core, there'd better only be one core file
338+
$self->assert_num_equals(1, scalar @cores);
339+
340+
# don't barf on it existing during shutdown
341+
unlink $cores[0];
342+
}
343+
344+
@output = $self->{instance}->run_cyr_info('proc');
345+
$self->assert_num_equals(scalar @pids, scalar @output);
346+
}
347+
348+
# prevent a lot of "Connection closed by other end" noise by claiming
349+
# and discarding the client's socket before its DESTROY is called
350+
while (scalar @clients) {
351+
my $old = shift @clients;
352+
353+
$old->release_socket(1);
354+
}
355+
}
356+
299357
sub test_proc_starts
300358
:NoStartInstances :needs_component_idled
301359
{

lib/proc.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,16 @@ EXPORTED void proc_cleanup(struct proc_handle **handlep)
225225
}
226226
}
227227

228+
/* used by master to remove proc files after service processes crash */
229+
EXPORTED void proc_force_cleanup(pid_t pid)
230+
{
231+
char *fname = proc_getpath(pid, /*isnew*/0);
232+
233+
if (fname)
234+
xunlink(fname);
235+
free(fname);
236+
}
237+
228238
static int proc_foreach_helper(pid_t pid, procdata_t *func, void *rock)
229239
{
230240
int r = 0;

lib/proc.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ extern int proc_register(struct proc_handle **handlep,
5252
const char *mailbox,
5353
const char *cmd);
5454
extern void proc_cleanup(struct proc_handle **handlep);
55+
extern void proc_force_cleanup(pid_t pid);
5556

5657
typedef int procdata_t(pid_t pid,
5758
const char *servicename, const char *clienthost,

master/master.c

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1525,7 +1525,16 @@ static void reap_child(void)
15251525
}
15261526
}
15271527

1528-
proc_cleanup(&c->proc_handle);
1528+
if (s && !in_shutdown && failed) {
1529+
/* we don't have a proc_handle for service processes because
1530+
* they manage it themselves, but if one crashed it won't have
1531+
* cleaned it up
1532+
*/
1533+
proc_force_cleanup(pid);
1534+
}
1535+
else {
1536+
proc_cleanup(&c->proc_handle);
1537+
}
15291538
centry_set_state(c, SERVICE_STATE_DEAD);
15301539
} else {
15311540
/* Are we multithreaded now? we don't know this child */

0 commit comments

Comments
 (0)