feat: allow graceful shutdowns #546

Merged
lunny merged 3 commits from rowan-allspice/act_runner:graceful-shutdown into main 2024-05-27 07:38:56 +00:00
5 changed files with 87 additions and 18 deletions

@ -122,8 +122,18 @@ func runDaemon(ctx context.Context, configFile *string) func(cmd *cobra.Command,
poller := poll.New(cfg, cli, runner)
poller.Poll(ctx)
go poller.Poll()
<-ctx.Done()
log.Infof("runner: %s shutdown initiated, waiting %s for running jobs to complete before shutting down", resp.Msg.Runner.Name, cfg.Runner.ShutdownTimeout)
ctx, cancel := context.WithTimeout(context.Background(), cfg.Runner.ShutdownTimeout)

Not sure about this, since it's context.Background(), how to make it works like "If the context is cancelled, it will then force all jobs to end, and then exit"?

I think it could be like: the first signal to tell the runner to stop taking new tasks and wait for the current task to finish, the second signal to tell it to force stop current tasks but let them report errors.
(Not idea, see the new comment.)

Not sure about this, since it's `context.Background()`, how to make it works like "If the context is cancelled, it will then force all jobs to end, and then exit"? ~~I think it could be like: the first signal to tell the runner to stop taking new tasks and wait for the current task to finish, the second signal to tell it to force stop current tasks but let them report errors.~~ (Not idea, see the new comment.)

The specific context here can be changed depending on the end design. The shutdown logic here was meant to mirror the same API provided by http.Server.Shutdown. Providing a context.Background() here essentially just makes the Poller wait indefinitely to for running jobs to finish.

The specific context here can be changed depending on the end design. The shutdown logic here was meant to mirror the same API provided by [`http.Server.Shutdown`](https://pkg.go.dev/net/http#Server.Shutdown). Providing a `context.Background()` here essentially just makes the `Poller` wait indefinitely to for running jobs to finish.
defer cancel()
err = poller.Shutdown(ctx)
if err != nil {
log.Warnf("runner: %s cancelled in progress jobs during shutdown", resp.Msg.Runner.Name)
}
return nil
}
}

@ -25,40 +25,95 @@ type Poller struct {
runner *run.Runner
cfg *config.Config
tasksVersion atomic.Int64 // tasksVersion used to store the version of the last task fetched from the Gitea.
pollingCtx context.Context
shutdownPolling context.CancelFunc
jobsCtx context.Context
shutdownJobs context.CancelFunc
done chan struct{}
}
func New(cfg *config.Config, client client.Client, runner *run.Runner) *Poller {
pollingCtx, shutdownPolling := context.WithCancel(context.Background())
jobsCtx, shutdownJobs := context.WithCancel(context.Background())
done := make(chan struct{})
return &Poller{
client: client,
runner: runner,
cfg: cfg,
pollingCtx: pollingCtx,
shutdownPolling: shutdownPolling,
jobsCtx: jobsCtx,
shutdownJobs: shutdownJobs,
done: done,
}
}
func (p *Poller) Poll(ctx context.Context) {
func (p *Poller) Poll() {
limiter := rate.NewLimiter(rate.Every(p.cfg.Runner.FetchInterval), 1)
wg := &sync.WaitGroup{}
for i := 0; i < p.cfg.Runner.Capacity; i++ {
wg.Add(1)
go p.poll(ctx, wg, limiter)
go p.poll(wg, limiter)
}
wg.Wait()
// signal that we shutdown
close(p.done)
}
func (p *Poller) poll(ctx context.Context, wg *sync.WaitGroup, limiter *rate.Limiter) {
func (p *Poller) Shutdown(ctx context.Context) error {
p.shutdownPolling()
select {
// graceful shutdown completed succesfully
case <-p.done:
return nil
// our timeout for shutting down ran out
case <-ctx.Done():
// when both the timeout fires and the graceful shutdown
// completed succsfully, this branch of the select may
// fire. Do a non-blocking check here against the graceful
// shutdown status to avoid sending an error if we don't need to.
_, ok := <-p.done
if !ok {
return nil
}
// force a shutdown of all running jobs
p.shutdownJobs()
// wait for running jobs to report their status to Gitea
_, _ = <-p.done
return ctx.Err()
}
}
func (p *Poller) poll(wg *sync.WaitGroup, limiter *rate.Limiter) {
defer wg.Done()
for {
if err := limiter.Wait(ctx); err != nil {
if ctx.Err() != nil {
if err := limiter.Wait(p.pollingCtx); err != nil {
if p.pollingCtx.Err() != nil {
log.WithError(err).Debug("limiter wait failed")
}
return
}
task, ok := p.fetchTask(ctx)
task, ok := p.fetchTask(p.pollingCtx)
if !ok {
continue
}
p.runTaskWithRecover(ctx, task)
p.runTaskWithRecover(p.jobsCtx, task)
}
}

@ -23,6 +23,9 @@ runner:
# Please note that the Gitea instance also has a timeout (3h by default) for the job.
# So the job could be stopped by the Gitea instance if it's timeout is shorter than this.
timeout: 3h
# The timeout for the runner to wait for running jobs to finish when shutting down.
# Any running jobs that haven't finished after this timeout will be cancelled.
shutdown_timeout: 0s
# Whether skip verifying the TLS certificate of the Gitea instance.
insecure: false
# The timeout for fetching the job from the Gitea instance.

@ -21,15 +21,16 @@ type Log struct {
// Runner represents the configuration for the runner.
type Runner struct {
File string `yaml:"file"` // File specifies the file path for the runner.
Capacity int `yaml:"capacity"` // Capacity specifies the capacity of the runner.
Envs map[string]string `yaml:"envs"` // Envs stores environment variables for the runner.
EnvFile string `yaml:"env_file"` // EnvFile specifies the path to the file containing environment variables for the runner.
Timeout time.Duration `yaml:"timeout"` // Timeout specifies the duration for runner timeout.
Insecure bool `yaml:"insecure"` // Insecure indicates whether the runner operates in an insecure mode.
FetchTimeout time.Duration `yaml:"fetch_timeout"` // FetchTimeout specifies the timeout duration for fetching resources.
FetchInterval time.Duration `yaml:"fetch_interval"` // FetchInterval specifies the interval duration for fetching resources.
Labels []string `yaml:"labels"` // Labels specify the labels of the runner. Labels are declared on each startup
File string `yaml:"file"` // File specifies the file path for the runner.
Capacity int `yaml:"capacity"` // Capacity specifies the capacity of the runner.
Envs map[string]string `yaml:"envs"` // Envs stores environment variables for the runner.
EnvFile string `yaml:"env_file"` // EnvFile specifies the path to the file containing environment variables for the runner.
Timeout time.Duration `yaml:"timeout"` // Timeout specifies the duration for runner timeout.
ShutdownTimeout time.Duration `yaml:"shutdown_timeout"` // ShutdownTimeout specifies the duration to wait for running jobs to complete during a shutdown of the runner.
Insecure bool `yaml:"insecure"` // Insecure indicates whether the runner operates in an insecure mode.
FetchTimeout time.Duration `yaml:"fetch_timeout"` // FetchTimeout specifies the timeout duration for fetching resources.
FetchInterval time.Duration `yaml:"fetch_interval"` // FetchInterval specifies the interval duration for fetching resources.
Labels []string `yaml:"labels"` // Labels specify the labels of the runner. Labels are declared on each startup
}
// Cache represents the configuration for caching.

@ -54,4 +54,4 @@ fi
unset GITEA_RUNNER_REGISTRATION_TOKEN
unset GITEA_RUNNER_REGISTRATION_TOKEN_FILE
act_runner daemon ${CONFIG_ARG}
exec act_runner daemon ${CONFIG_ARG}