Pause queues (#15928)

* Start adding mechanism to return unhandled data Signed-off-by: Andrew Thornton <art27@cantab.net> * Create pushback interface Signed-off-by: Andrew Thornton <art27@cantab.net> * Add Pausable interface to WorkerPool and Manager Signed-off-by: Andrew Thornton <art27@cantab.net> * Implement Pausable and PushBack for the bytefifos Signed-off-by: Andrew Thornton <art27@cantab.net> * Implement Pausable and Pushback for ChannelQueues and ChannelUniqueQueues Signed-off-by: Andrew Thornton <art27@cantab.net> * Wire in UI for pausing Signed-off-by: Andrew Thornton <art27@cantab.net> * add testcases and fix a few issues Signed-off-by: Andrew Thornton <art27@cantab.net> * fix build Signed-off-by: Andrew Thornton <art27@cantab.net> * prevent "race" in the test Signed-off-by: Andrew Thornton <art27@cantab.net> * fix jsoniter mismerge Signed-off-by: Andrew Thornton <art27@cantab.net> * fix conflicts Signed-off-by: Andrew Thornton <art27@cantab.net> * fix format Signed-off-by: Andrew Thornton <art27@cantab.net> * Add warnings for no worker configurations and prevent data-loss with redis/levelqueue Signed-off-by: Andrew Thornton <art27@cantab.net> * Use StopTimer Signed-off-by: Andrew Thornton <art27@cantab.net> Co-authored-by: Lauris BH <lauris@nix.lv> Co-authored-by: 6543 <6543@obermui.de> Co-authored-by: techknowlogick <techknowlogick@gitea.io> Co-authored-by: wxiaoguang <wxiaoguang@gmail.com>
2022-01-22 21:22:14 +00:00 · 2022-01-22 21:22:14 +00:00 · a82fd98d53
commit a82fd98d53
parent 27ee01e1e8
34 changed files with 1389 additions and 122 deletions
--- a/modules/queue/workerpool.go
+++ b/modules/queue/workerpool.go
@ -22,6 +22,8 @@ type WorkerPool struct {
 	lock               sync.Mutex
 	baseCtx            context.Context
 	baseCtxCancel      context.CancelFunc
+	paused             chan struct{}
+	resumed            chan struct{}
 	cond               *sync.Cond
 	qid                int64
 	maxNumberOfWorkers int
@ -35,6 +37,11 @@ type WorkerPool struct {
 	numInQueue         int64
 }

+var (
+	_ Flushable   = &WorkerPool{}
+	_ ManagedPool = &WorkerPool{}
+)
+
 // WorkerPoolConfiguration is the basic configuration for a WorkerPool
 type WorkerPoolConfiguration struct {
 	QueueLength  int
@ -50,11 +57,15 @@ func NewWorkerPool(handle HandlerFunc, config WorkerPoolConfiguration) *WorkerPo
 	ctx, cancel := context.WithCancel(context.Background())

 	dataChan := make(chan Data, config.QueueLength)
+	resumed := make(chan struct{})
+	close(resumed)
 	pool := &WorkerPool{
 		baseCtx:            ctx,
 		baseCtxCancel:      cancel,
 		batchLength:        config.BatchLength,
 		dataChan:           dataChan,
+		resumed:            resumed,
+		paused:             make(chan struct{}),
 		handle:             handle,
 		blockTimeout:       config.BlockTimeout,
 		boostTimeout:       config.BoostTimeout,
@ -69,6 +80,14 @@ func NewWorkerPool(handle HandlerFunc, config WorkerPoolConfiguration) *WorkerPo
 func (p *WorkerPool) Push(data Data) {
 	atomic.AddInt64(&p.numInQueue, 1)
 	p.lock.Lock()
+	select {
+	case <-p.paused:
+		p.lock.Unlock()
+		p.dataChan <- data
+		return
+	default:
+	}
+
 	if p.blockTimeout > 0 && p.boostTimeout > 0 && (p.numberOfWorkers <= p.maxNumberOfWorkers || p.maxNumberOfWorkers < 0) {
 		if p.numberOfWorkers == 0 {
 			p.zeroBoost()
@ -82,6 +101,17 @@ func (p *WorkerPool) Push(data Data) {
 	}
 }

+// HasNoWorkerScaling will return true if the queue has no workers, and has no worker boosting
+func (p *WorkerPool) HasNoWorkerScaling() bool {
+	p.lock.Lock()
+	defer p.lock.Unlock()
+	return p.hasNoWorkerScaling()
+}
+
+func (p *WorkerPool) hasNoWorkerScaling() bool {
+	return p.numberOfWorkers == 0 && (p.boostTimeout == 0 || p.boostWorkers == 0 || p.maxNumberOfWorkers == 0)
+}
+
 func (p *WorkerPool) zeroBoost() {
 	ctx, cancel := context.WithTimeout(p.baseCtx, p.boostTimeout)
 	mq := GetManager().GetManagedQueue(p.qid)
@ -272,6 +302,12 @@ func (p *WorkerPool) addWorkers(ctx context.Context, cancel context.CancelFunc,
 				p.cond.Broadcast()
 				cancel()
 			}
+			if p.hasNoWorkerScaling() {
+				log.Warn(
+					"Queue: %d is configured to be non-scaling and has no workers - this configuration is likely incorrect.\n"+
+						"The queue will be paused to prevent data-loss with the assumption that you will add workers and unpause as required.", p.qid)
+				p.pause()
+			}
 			p.lock.Unlock()
 		}()
 	}
@ -290,13 +326,65 @@ func (p *WorkerPool) Wait() {
 	p.cond.Wait()
 }

+// IsPaused returns if the pool is paused
+func (p *WorkerPool) IsPaused() bool {
+	p.lock.Lock()
+	defer p.lock.Unlock()
+	select {
+	case <-p.paused:
+		return true
+	default:
+		return false
+	}
+}
+
+// IsPausedIsResumed returns if the pool is paused and a channel that is closed when it is resumed
+func (p *WorkerPool) IsPausedIsResumed() (<-chan struct{}, <-chan struct{}) {
+	p.lock.Lock()
+	defer p.lock.Unlock()
+	return p.paused, p.resumed
+}
+
+// Pause pauses the WorkerPool
+func (p *WorkerPool) Pause() {
+	p.lock.Lock()
+	defer p.lock.Unlock()
+	p.pause()
+}
+
+func (p *WorkerPool) pause() {
+	select {
+	case <-p.paused:
+	default:
+		p.resumed = make(chan struct{})
+		close(p.paused)
+	}
+}
+
+// Resume resumes the WorkerPool
+func (p *WorkerPool) Resume() {
+	p.lock.Lock()
+	defer p.lock.Unlock()
+	select {
+	case <-p.resumed:
+	default:
+		p.paused = make(chan struct{})
+		close(p.resumed)
+	}
+}
+
 // CleanUp will drain the remaining contents of the channel
 // This should be called after AddWorkers context is closed
 func (p *WorkerPool) CleanUp(ctx context.Context) {
 	log.Trace("WorkerPool: %d CleanUp", p.qid)
 	close(p.dataChan)
 	for data := range p.dataChan {
-		p.handle(data)
+		if unhandled := p.handle(data); unhandled != nil {
+			if unhandled != nil {
+				log.Error("Unhandled Data in clean-up of queue %d", p.qid)
+			}
+		}
+
 		atomic.AddInt64(&p.numInQueue, -1)
 		select {
 		case <-ctx.Done():
@ -327,7 +415,9 @@ func (p *WorkerPool) FlushWithContext(ctx context.Context) error {
 	for {
 		select {
 		case data := <-p.dataChan:
-			p.handle(data)
+			if unhandled := p.handle(data); unhandled != nil {
+				log.Error("Unhandled Data whilst flushing queue %d", p.qid)
+			}
 			atomic.AddInt64(&p.numInQueue, -1)
 		case <-p.baseCtx.Done():
 			return p.baseCtx.Err()
@ -341,13 +431,45 @@ func (p *WorkerPool) FlushWithContext(ctx context.Context) error {

 func (p *WorkerPool) doWork(ctx context.Context) {
 	delay := time.Millisecond * 300
+
+	// Create a common timer - we will use this elsewhere
+	timer := time.NewTimer(0)
+	util.StopTimer(timer)
+
+	paused, _ := p.IsPausedIsResumed()
 	data := make([]Data, 0, p.batchLength)
 	for {
 		select {
+		case <-paused:
+			log.Trace("Worker for Queue %d Pausing", p.qid)
+			if len(data) > 0 {
+				log.Trace("Handling: %d data, %v", len(data), data)
+				if unhandled := p.handle(data...); unhandled != nil {
+					log.Error("Unhandled Data in queue %d", p.qid)
+				}
+				atomic.AddInt64(&p.numInQueue, -1*int64(len(data)))
+			}
+			_, resumed := p.IsPausedIsResumed()
+			select {
+			case <-resumed:
+				paused, _ = p.IsPausedIsResumed()
+				log.Trace("Worker for Queue %d Resuming", p.qid)
+				util.StopTimer(timer)
+			case <-ctx.Done():
+				log.Trace("Worker shutting down")
+				return
+			}
+		default:
+		}
+		select {
+		case <-paused:
+			// go back around
 		case <-ctx.Done():
 			if len(data) > 0 {
 				log.Trace("Handling: %d data, %v", len(data), data)
-				p.handle(data...)
+				if unhandled := p.handle(data...); unhandled != nil {
+					log.Error("Unhandled Data in queue %d", p.qid)
+				}
 				atomic.AddInt64(&p.numInQueue, -1*int64(len(data)))
 			}
 			log.Trace("Worker shutting down")
@ -357,59 +479,36 @@ func (p *WorkerPool) doWork(ctx context.Context) {
 				// the dataChan has been closed - we should finish up:
 				if len(data) > 0 {
 					log.Trace("Handling: %d data, %v", len(data), data)
-					p.handle(data...)
+					if unhandled := p.handle(data...); unhandled != nil {
+						log.Error("Unhandled Data in queue %d", p.qid)
+					}
 					atomic.AddInt64(&p.numInQueue, -1*int64(len(data)))
 				}
 				log.Trace("Worker shutting down")
 				return
 			}
 			data = append(data, datum)
+			util.StopTimer(timer)
+
 			if len(data) >= p.batchLength {
 				log.Trace("Handling: %d data, %v", len(data), data)
-				p.handle(data...)
+				if unhandled := p.handle(data...); unhandled != nil {
+					log.Error("Unhandled Data in queue %d", p.qid)
+				}
 				atomic.AddInt64(&p.numInQueue, -1*int64(len(data)))
 				data = make([]Data, 0, p.batchLength)
+			} else {
+				timer.Reset(delay)
 			}
-		default:
-			timer := time.NewTimer(delay)
-			select {
-			case <-ctx.Done():
-				util.StopTimer(timer)
-				if len(data) > 0 {
-					log.Trace("Handling: %d data, %v", len(data), data)
-					p.handle(data...)
-					atomic.AddInt64(&p.numInQueue, -1*int64(len(data)))
+		case <-timer.C:
+			delay = time.Millisecond * 100
+			if len(data) > 0 {
+				log.Trace("Handling: %d data, %v", len(data), data)
+				if unhandled := p.handle(data...); unhandled != nil {
+					log.Error("Unhandled Data in queue %d", p.qid)
 				}
-				log.Trace("Worker shutting down")
-				return
-			case datum, ok := <-p.dataChan:
-				util.StopTimer(timer)
-				if !ok {
-					// the dataChan has been closed - we should finish up:
-					if len(data) > 0 {
-						log.Trace("Handling: %d data, %v", len(data), data)
-						p.handle(data...)
-						atomic.AddInt64(&p.numInQueue, -1*int64(len(data)))
-					}
-					log.Trace("Worker shutting down")
-					return
-				}
-				data = append(data, datum)
-				if len(data) >= p.batchLength {
-					log.Trace("Handling: %d data, %v", len(data), data)
-					p.handle(data...)
-					atomic.AddInt64(&p.numInQueue, -1*int64(len(data)))
-					data = make([]Data, 0, p.batchLength)
-				}
-			case <-timer.C:
-				delay = time.Millisecond * 100
-				if len(data) > 0 {
-					log.Trace("Handling: %d data, %v", len(data), data)
-					p.handle(data...)
-					atomic.AddInt64(&p.numInQueue, -1*int64(len(data)))
-					data = make([]Data, 0, p.batchLength)
-				}
-
+				atomic.AddInt64(&p.numInQueue, -1*int64(len(data)))
+				data = make([]Data, 0, p.batchLength)
 			}
 		}
 	}