Skip to content

Commit 8b4edf5

Browse files
refactor: reap zombie process and opt-out with no-reap flag (#172)
* fix: random waitid error fix #171 * fix(reap): forward signal * refactor: modify reaper to get supercronic exitStatus * fix(reaper): unify signal list & fix signal forward * chore: replace ioutil to io * fix(test): ci timeout * opt-out with no-reap flag Co-authored-by: Josh Raker <[email protected]> * fix: typo on signal * fix: args pass to supercronic * fix(test): remove removed flag * chore: remove misleading comment --------- Co-authored-by: Josh Raker <[email protected]>
1 parent bfb0d97 commit 8b4edf5

File tree

9 files changed

+176
-22
lines changed

9 files changed

+176
-22
lines changed

cron/cron_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ package cron
33
import (
44
"context"
55
"fmt"
6-
"io/ioutil"
6+
"io"
77
"regexp"
88
"strings"
99
"sync"
@@ -41,7 +41,7 @@ func (hook *testHook) Levels() []logrus.Level {
4141

4242
func newTestLogger() (*logrus.Entry, chan *logrus.Entry) {
4343
logger := logrus.New()
44-
logger.Out = ioutil.Discard
44+
logger.Out = io.Discard
4545
logger.Level = logrus.DebugLevel
4646

4747
channel := make(chan *logrus.Entry, TEST_CHANNEL_BUFFER_SIZE)

go.mod

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ require (
66
github.com/evalphobia/logrus_sentry v0.8.2
77
github.com/fsnotify/fsnotify v1.7.0
88
github.com/prometheus/client_golang v1.20.2
9-
github.com/ramr/go-reaper v0.2.1
109
github.com/sirupsen/logrus v1.9.3
1110
github.com/stretchr/testify v1.9.0
1211
)

go.sum

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,6 @@ github.com/prometheus/common v0.57.0 h1:Ro/rKjwdq9mZn1K5QPctzh+MA4Lp0BuYk5ZZEVho
3838
github.com/prometheus/common v0.57.0/go.mod h1:7uRPFSUTbfZWsJ7MHY56sqt7hLQu3bxXHDnNhl8E9qI=
3939
github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
4040
github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
41-
github.com/ramr/go-reaper v0.2.1 h1:zww+wlQOvTjBZuk1920R/e0GFEb6O7+B0WQLV6dM924=
42-
github.com/ramr/go-reaper v0.2.1/go.mod h1:AVypdzrcCXjSc/JYnlXl8TsB+z84WyFzxWE8Jh0MOJc=
4341
github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8=
4442
github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
4543
github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=

integration/normal.crontab

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
* * * * * * * echo 1

integration/test.bats

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -102,14 +102,34 @@ wait_for() {
102102
! run_supercronic -test "${BATS_TEST_DIRNAME}/invalid.crontab"
103103
}
104104

105-
@test "reap zombie process" {
105+
@test "it run as pid 1 and reap zombie process" {
106+
out="${WORK_DIR}/zombie-crontab-out"
107+
106108
# run in new process namespace
107109
sudo timeout 10s unshare --fork --pid --mount-proc \
108-
${BATS_TEST_DIRNAME}/../supercronic "${BATS_TEST_DIRNAME}/zombie.crontab" &
110+
${BATS_TEST_DIRNAME}/../supercronic "${BATS_TEST_DIRNAME}/zombie.crontab" >"$out" 2>&1 &
109111
local pid=$!
110-
sleep 1.5
111-
run bash -c "ps axo pid=,stat=|grep Z"
112+
sleep 3
113+
112114
kill -TERM ${pid}
113-
114-
[[ "$status" -eq 1 ]]
115+
# todo: use other method to detect zombie cleanup
116+
wait_for grep "reaper cleanup: pid=" "$out"
117+
}
118+
119+
120+
@test "it run as pid 1 and normal crontab no error" {
121+
out="${WORK_DIR}/normal-crontab-out"
122+
123+
# sleep 30 seconds occur found bug
124+
# FIXME: other way to detect
125+
sudo timeout 30s unshare --fork --pid --mount-proc \
126+
"${BATS_TEST_DIRNAME}/../supercronic" "${BATS_TEST_DIRNAME}/normal.crontab" >"$out" 2>&1 &
127+
# https://github.com/aptible/supercronic/issues/171
128+
local pid=$!
129+
local foundErr
130+
131+
sleep 29.5
132+
kill -TERM ${pid}
133+
grep "waitid: no child processes" "$out" && foundErr=1
134+
[[ $foundErr != 1 ]]
115135
}

integration/zombie.crontab

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
* * * * * * * /bin/sleep 1 & exec /bin/sleep 0
1+
* * * * * * * /bin/sleep 0.1 & exec /bin/sleep 0

log/hook/splitstream.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
package hook
22

33
import (
4-
"github.com/sirupsen/logrus"
54
"io"
6-
"io/ioutil"
5+
6+
"github.com/sirupsen/logrus"
77
)
88

99
type writerHook struct {
@@ -25,7 +25,7 @@ func (h *writerHook) Fire(entry *logrus.Entry) error {
2525
}
2626

2727
func RegisterSplitLogger(logger *logrus.Logger, outWriter io.Writer, errWriter io.Writer) {
28-
logger.SetOutput(ioutil.Discard)
28+
logger.SetOutput(io.Discard)
2929

3030
logger.AddHook(&writerHook{
3131
writer: outWriter,

main.go

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ import (
1616
"github.com/aptible/supercronic/prometheus_metrics"
1717
"github.com/evalphobia/logrus_sentry"
1818
"github.com/fsnotify/fsnotify"
19-
reaper "github.com/ramr/go-reaper"
2019
"github.com/sirupsen/logrus"
2120
)
2221

@@ -31,6 +30,8 @@ func main() {
3130
json := flag.Bool("json", false, "enable JSON logging")
3231
test := flag.Bool("test", false, "test crontab (does not run jobs)")
3332
inotify := flag.Bool("inotify", false, "use inotify to detect crontab file changes")
33+
// If this flag changes, update forkExec to disable reaping in the child process
34+
disableReap := flag.Bool("no-reap", false, "disable reaping of dead processes, note: reaping requires pid 1")
3435
prometheusListen := flag.String(
3536
"prometheus-listen-address",
3637
"",
@@ -101,7 +102,19 @@ func main() {
101102
os.Exit(2)
102103
return
103104
}
104-
105+
if !*disableReap {
106+
if os.Getpid() == 1 {
107+
// Clean up zombie processes caused by incorrect crontab commands
108+
// Use forkExec to avoid random waitid errors
109+
// https://github.com/aptible/supercronic/issues/88
110+
// https://github.com/aptible/supercronic/issues/171
111+
logrus.Info("reaping dead processes")
112+
forkExec()
113+
return
114+
}
115+
116+
logrus.Warn("process reaping disabled, not pid 1")
117+
}
105118
crontabFileName := flag.Args()[0]
106119

107120
var watcher *fsnotify.Watcher
@@ -165,12 +178,8 @@ func main() {
165178
}()
166179
}
167180

168-
// Start background reaping of orphaned child processes.
169-
go reaper.Reap()
170-
// _ = reaper.Reap
171-
172181
termChan := make(chan os.Signal, 1)
173-
signal.Notify(termChan, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT, syscall.SIGUSR2)
182+
signal.Notify(termChan, signalList...)
174183

175184
if *inotify {
176185
go func() {
@@ -266,3 +275,7 @@ func readCrontabAtPath(path string) (*crontab.Crontab, error) {
266275

267276
return crontab.ParseCrontab(file)
268277
}
278+
279+
var signalList = []os.Signal{
280+
syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT, syscall.SIGUSR2,
281+
}

reaper.go

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
package main
2+
3+
import (
4+
"os"
5+
"os/signal"
6+
"syscall"
7+
8+
"github.com/sirupsen/logrus"
9+
)
10+
11+
func forkExec() {
12+
13+
// run supercronic in other process
14+
pwd, err := os.Getwd()
15+
if err != nil {
16+
logrus.Fatalf("Failed to get current working directory: %s", err.Error())
17+
return
18+
}
19+
20+
pattrs := &syscall.ProcAttr{
21+
Dir: pwd,
22+
Env: os.Environ(),
23+
Files: []uintptr{
24+
uintptr(syscall.Stdin),
25+
uintptr(syscall.Stdout),
26+
uintptr(syscall.Stderr),
27+
},
28+
}
29+
args := make([]string, 0, len(os.Args)+1)
30+
// disable reaping for supercronic, avoid no sense warning
31+
args = append(args, os.Args[0], "-no-reap")
32+
args = append(args, os.Args[1:]...)
33+
34+
pid, err := syscall.ForkExec(args[0], args, pattrs)
35+
if err != nil {
36+
logrus.Fatalf("Failed to fork exec: %s", err.Error())
37+
return
38+
}
39+
40+
// forward signal to supercronic
41+
signalToFork(pid)
42+
// got supercronic exit status
43+
wstatus := reapChildren(pid)
44+
os.Exit(wstatus.ExitStatus())
45+
}
46+
47+
func signalToFork(pid int) {
48+
p, err := os.FindProcess(pid)
49+
if err != nil {
50+
logrus.Fatalf("Failed findProcess supercronic pid:%d,%s", pid, err.Error())
51+
}
52+
termChan := make(chan os.Signal, 1)
53+
signal.Notify(termChan, signalList...)
54+
go func() {
55+
for {
56+
s := <-termChan
57+
if err := p.Signal(s); err != nil {
58+
logrus.Errorf("Failed to send signal to supercronic: %s", err.Error())
59+
}
60+
}
61+
}()
62+
}
63+
64+
// copy from https://github.com/ramr/go-reaper
65+
// modify for wait exit status of supercronic
66+
// without modify, supercronic exit status may not be obtained
67+
68+
// Be a good parent - clean up behind the children.
69+
func reapChildren(superCrondPid int) syscall.WaitStatus {
70+
var notifications = make(chan os.Signal, 1)
71+
72+
go sigChildHandler(notifications)
73+
74+
// all child
75+
const rpid = -1
76+
var wstatus syscall.WaitStatus
77+
78+
for {
79+
var sig = <-notifications
80+
logrus.Debugf("reaper received signal %v\n", sig)
81+
for {
82+
pid, err := syscall.Wait4(rpid, &wstatus, 0, nil)
83+
for syscall.EINTR == err {
84+
pid, err = syscall.Wait4(pid, &wstatus, 0, nil)
85+
}
86+
87+
if syscall.ECHILD == err {
88+
break
89+
}
90+
91+
if superCrondPid == pid {
92+
logrus.Debugf("supercronic exit, pid=%d, wstatus=%+v, err=%+v\n", pid, wstatus, err)
93+
return wstatus
94+
}
95+
// note: change output need change test
96+
logrus.Warnf("reaper cleanup: pid=%d, wstatus=%+v\n",
97+
pid, wstatus)
98+
}
99+
}
100+
101+
}
102+
103+
// Handle death of child (SIGCHLD) messages. Pushes the signal onto the
104+
// notifications channel if there is a waiter.
105+
func sigChildHandler(notifications chan os.Signal) {
106+
var sigs = make(chan os.Signal, 3)
107+
signal.Notify(sigs, syscall.SIGCHLD)
108+
109+
for {
110+
var sig = <-sigs
111+
select {
112+
case notifications <- sig: /* published it. */
113+
default:
114+
/*
115+
* Notifications channel full - drop it to the
116+
* floor. This ensures we don't fill up the SIGCHLD
117+
* queue. The reaper just waits for any child
118+
* process (pid=-1), so we ain't loosing it!! ;^)
119+
*/
120+
}
121+
}
122+
123+
} /* End of function sigChildHandler. */

0 commit comments

Comments
 (0)