Deadlock detector hack for Kafka driver instability #1087

Merged · 11 commits · Oct 9, 2018
58 changes: 42 additions & 16 deletions cmd/ingester/app/consumer/consumer.go
@@ -16,6 +16,7 @@ package consumer

import (
"sync"
"time"

"github.com/Shopify/sarama"
sc "github.com/bsm/sarama-cluster"
@@ -42,6 +43,9 @@ type Consumer struct {
internalConsumer consumer.Consumer
processorFactory ProcessorFactory

deadlockDetectorFactory deadlockDetectorFactory
Member: I think it would be cleaner and easier to understand if you had a top-level deadlockDetector, which can create partitionDeadlockDetector as needed (implementation-wise, the former may contain the latter for pId=-1). So the factory is only used once to create the top-level detector, and the factory does not need to be stored in the Consumer.

Contributor Author: Doesn't this mean that the top-level deadlockDetector also has the responsibilities of the factory? (That being said, I think it might be a cleaner design.)

Member: It does, in a way, but there's nothing wrong with that, especially considering that it happens at runtime and many times, whereas the top-level factory will be used only once on startup and not needed afterwards.

Separating the top-level detector from the individual detectors will also allow a clean separation in some implementation details, e.g. where some features are not used.

And best of all, you'll be able to move a lot of methods into the detectors, away from Consumer.

Contributor Author: 👍
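A rough structural sketch of the suggestion above, using the reviewer's naming (top-level deadlockDetector, per-partition partitionDeadlockDetector). Everything here is hypothetical and not part of this PR's diff; it only illustrates how the factory's role could be absorbed by a detector created once at startup:

```go
package consumer

import (
	"time"

	"github.com/uber/jaeger-lib/metrics"
	"go.uber.org/zap"
)

// Top-level detector, created once at startup; it owns the all-partitions
// case (pId = -1) and hands out per-partition detectors on demand.
type deadlockDetector struct {
	metricsFactory metrics.Factory
	logger         *zap.Logger
	interval       time.Duration
}

// Per-partition detector; the monitoring goroutine and the related methods
// currently on Consumer could move here.
type partitionDeadlockDetector struct {
	msgConsumed    *uint64
	closePartition chan struct{}
	done           chan struct{}
	logger         *zap.Logger
}

func newDeadlockDetector(f metrics.Factory, logger *zap.Logger, interval time.Duration) *deadlockDetector {
	return &deadlockDetector{metricsFactory: f, logger: logger, interval: interval}
}

// monitorAllPartitions covers the "no partitions assigned" case internally.
func (d *deadlockDetector) monitorAllPartitions() *partitionDeadlockDetector {
	return d.startMonitoringForPartition(-1)
}

func (d *deadlockDetector) startMonitoringForPartition(partition int32) *partitionDeadlockDetector {
	var msgConsumed uint64
	w := &partitionDeadlockDetector{
		msgConsumed:    &msgConsumed,
		closePartition: make(chan struct{}, 1),
		done:           make(chan struct{}),
		logger:         d.logger,
	}
	// the ticker goroutine from deadlock_detector.go below would live here
	return w
}
```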

allPartitionDeadlockDetector *deadlockDetector
Member: nit: allPartitions...

Contributor Author: 👍


partitionIDToState map[int32]*consumerState
}

@@ -52,12 +56,15 @@ type consumerState struct {

// New is a constructor for a Consumer
func New(params Params) (*Consumer, error) {
deadlockDetectorFactory := newDeadlockDetectorFactory(params.Factory, params.Logger, time.Minute)
Member: separate issue: s/params.Factory/params.MetricsFactory/

Contributor Author: I'll address this separately - it'll only add noise to this PR.

return &Consumer{
metricsFactory: params.Factory,
logger: params.Logger,
internalConsumer: params.InternalConsumer,
processorFactory: params.ProcessorFactory,
partitionIDToState: make(map[int32]*consumerState),
metricsFactory: params.Factory,
logger: params.Logger,
internalConsumer: params.InternalConsumer,
processorFactory: params.ProcessorFactory,
deadlockDetectorFactory: deadlockDetectorFactory,
allPartitionDeadlockDetector: deadlockDetectorFactory.startMonitoring(),
partitionIDToState: make(map[int32]*consumerState),
}, nil
}

@@ -73,6 +80,7 @@ func (c *Consumer) Start() {
// to the cleanup process not completing
p.wg.Wait()
}
c.newPartitionMetrics(pc.Partition()).startCounter.Inc(1)
c.partitionIDToState[pc.Partition()] = &consumerState{partitionConsumer: pc}
go c.handleMessages(pc)
go c.handleErrors(pc.Partition(), pc.Errors())
@@ -86,6 +94,7 @@ func (c *Consumer) Close() error {
c.closePartition(p.partitionConsumer)
p.wg.Wait()
}
c.allPartitionDeadlockDetector.close()
c.logger.Info("Closing parent consumer")
return c.internalConsumer.Close()
}
@@ -97,27 +106,44 @@ func (c *Consumer) handleMessages(pc sc.PartitionConsumer) {
defer c.closePartition(pc)

msgMetrics := c.newMsgMetrics(pc.Partition())

var msgProcessor processor.SpanProcessor

for msg := range pc.Messages() {
c.logger.Debug("Got msg", zap.Any("msg", msg))
msgMetrics.counter.Inc(1)
msgMetrics.offsetGauge.Update(msg.Offset)
msgMetrics.lagGauge.Update(pc.HighWaterMarkOffset() - msg.Offset - 1)
deadlockDetector := c.deadlockDetectorFactory.startMonitoringForPartition(pc.Partition())
defer deadlockDetector.close()

if msgProcessor == nil {
msgProcessor = c.processorFactory.new(pc.Partition(), msg.Offset-1)
defer msgProcessor.Close()
}
for {
select {
case msg, ok := <-pc.Messages():
if !ok {
c.logger.Info("Message channel closed. ", zap.Int32("partition", pc.Partition()))
return
}
c.logger.Debug("Got msg", zap.Any("msg", msg))
msgMetrics.counter.Inc(1)
msgMetrics.offsetGauge.Update(msg.Offset)
msgMetrics.lagGauge.Update(pc.HighWaterMarkOffset() - msg.Offset - 1)
deadlockDetector.incrementMsgCount()
c.allPartitionDeadlockDetector.incrementMsgCount()
Member: I don't follow what the purpose of this allPartitionDeadlockDetector is. It looks like it's only used to increment this counter - why do we even need it? We can always sum the time series to get a total counter.


if msgProcessor == nil {
msgProcessor = c.processorFactory.new(pc.Partition(), msg.Offset-1)
defer msgProcessor.Close()
}

msgProcessor.Process(&saramaMessageWrapper{msg})

msgProcessor.Process(&saramaMessageWrapper{msg})
case <-deadlockDetector.getClosePartition():
Member: s/getClosePartition/closePartitionChannel/

c.logger.Info("Closing partition due to inactivity", zap.Int32("partition", pc.Partition()))
return
}
}
c.logger.Info("Finished handling messages", zap.Int32("partition", pc.Partition()))
}

func (c *Consumer) closePartition(partitionConsumer sc.PartitionConsumer) {
c.logger.Info("Closing partition consumer", zap.Int32("partition", partitionConsumer.Partition()))
partitionConsumer.Close() // blocks until messages channel is drained
c.newPartitionMetrics(partitionConsumer.Partition()).closeCounter.Inc(1)
Member: why do we call new here? If it internally caches the metrics, then s/newPartitionMetrics/newPartitionMetrics

c.logger.Info("Closed partition consumer", zap.Int32("partition", partitionConsumer.Partition()))
}

20 changes: 17 additions & 3 deletions cmd/ingester/app/consumer/consumer_metrics.go
@@ -30,8 +30,17 @@ type errMetrics struct {
errCounter metrics.Counter
}

type partitionMetrics struct {
Member: nit: we should clean up these metrics. All 3 structs are per-partition, could we not combine them into one? Then we won't have all these small functions on the Consumer.

Contributor Author: I agree - I'll do it as a separate commit.

startCounter metrics.Counter
closeCounter metrics.Counter
}
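For reference, a sketch of the consolidation suggested in the review comment above: one per-partition struct holding everything currently split across msgMetrics, errMetrics, and partitionMetrics. The combined type and method names, and the "offset-lag" metric name, are assumptions not visible in this diff:

```go
// Hypothetical consolidation of msgMetrics, errMetrics, and partitionMetrics.
type perPartitionMetrics struct {
	messagesCounter metrics.Counter
	offsetGauge     metrics.Gauge
	lagGauge        metrics.Gauge
	errCounter      metrics.Counter
	startCounter    metrics.Counter
	closeCounter    metrics.Counter
}

func (c *Consumer) metricsForPartition(partition int32) perPartitionMetrics {
	f := c.getNamespace(partition)
	return perPartitionMetrics{
		messagesCounter: f.Counter("messages", nil),
		offsetGauge:     f.Gauge("current-offset", nil),
		lagGauge:        f.Gauge("offset-lag", nil), // metric name assumed; not shown in this diff
		errCounter:      f.Counter("errors", nil),
		startCounter:    f.Counter("partition-start", nil),
		closeCounter:    f.Counter("partition-close", nil),
	}
}
```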

func (c *Consumer) getNamespace(partition int32) metrics.Factory {
Member: s/getNamespace/metricsFactoryForPartition/

Member: NB: we don't use "get" in Go.

return c.metricsFactory.Namespace("sarama-consumer", map[string]string{"partition": strconv.Itoa(int(partition))})
}

func (c *Consumer) newMsgMetrics(partition int32) msgMetrics {
f := c.metricsFactory.Namespace("sarama-consumer", map[string]string{"partition": strconv.Itoa(int(partition))})
f := c.getNamespace(partition)
return msgMetrics{
counter: f.Counter("messages", nil),
offsetGauge: f.Gauge("current-offset", nil),
@@ -40,7 +49,12 @@ func (c *Consumer) newMsgMetrics(partition int32) msgMetrics {
}

func (c *Consumer) newErrMetrics(partition int32) errMetrics {
f := c.metricsFactory.Namespace("sarama-consumer", map[string]string{"partition": strconv.Itoa(int(partition))})
return errMetrics{errCounter: f.Counter("errors", nil)}
return errMetrics{errCounter: c.getNamespace(partition).Counter("errors", nil)}
}

func (c *Consumer) newPartitionMetrics(partition int32) partitionMetrics {
f := c.getNamespace(partition)
return partitionMetrics{
closeCounter: f.Counter("partition-close", nil),
startCounter: f.Counter("partition-start", nil)}
Member: This still bothers me. Some metrics factories are not happy if you try to create a metric with the same name twice. So if we re-acquire the same partition, this could cause a panic, e.g. if someone is using expvar-based metrics (unless we implemented protection in the factory, which I had to do for Prometheus).

Contributor Author: I added a test to jaeger-lib/metrics which shows that calling Counter multiple times with the same tags does not panic for expvar, prometheus and tally.

}
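If a metrics backend did reject duplicate registration, one defensive option (purely a sketch, not something this PR does) would be to cache per-partition metrics so each counter is created at most once per partition. The cache type below is hypothetical and assumes "sync" is imported; newPartitionMetrics is the constructor above:

```go
// Hypothetical cache keyed by partition; a mutex keeps the sketch safe even if
// it were ever consulted from more than one goroutine.
type partitionMetricsCache struct {
	mu       sync.Mutex
	consumer *Consumer
	metrics  map[int32]partitionMetrics
}

func (p *partitionMetricsCache) get(partition int32) partitionMetrics {
	p.mu.Lock()
	defer p.mu.Unlock()
	if m, ok := p.metrics[partition]; ok {
		return m
	}
	m := p.consumer.newPartitionMetrics(partition) // constructor from this file
	p.metrics[partition] = m
	return m
}
```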
41 changes: 37 additions & 4 deletions cmd/ingester/app/consumer/consumer_test.go
@@ -89,11 +89,14 @@ func newConsumer(
consumer consumer.Consumer) *Consumer {

logger, _ := zap.NewDevelopment()
deadlockDetectorFactory := newDeadlockDetectorFactory(factory, logger, time.Second)
return &Consumer{
metricsFactory: factory,
logger: logger,
internalConsumer: consumer,
partitionIDToState: make(map[int32]*consumerState),
metricsFactory: factory,
logger: logger,
internalConsumer: consumer,
partitionIDToState: make(map[int32]*consumerState),
deadlockDetectorFactory: deadlockDetectorFactory,
allPartitionDeadlockDetector: deadlockDetectorFactory.startMonitoringForPartition(-1),

processorFactory: ProcessorFactory{
topic: topic,
@@ -173,6 +176,11 @@ func TestSaramaConsumerWrapper_start_Messages(t *testing.T) {
Tags: partitionTag,
Value: 0,
})
testutils.AssertCounterMetrics(t, localFactory, testutils.ExpectedMetric{
Name: "sarama-consumer.partition-start",
Tags: partitionTag,
Value: 1,
})
}

func TestSaramaConsumerWrapper_start_Errors(t *testing.T) {
@@ -210,3 +218,28 @@

t.Fail()
}

func TestHandleClosePartition(t *testing.T) {
localFactory := metrics.NewLocalFactory(0)
Member: s/localFactory/metricsFactory/


mp := &pmocks.SpanProcessor{}
saramaConsumer := smocks.NewConsumer(t, &sarama.Config{})
mc := saramaConsumer.ExpectConsumePartition(topic, partition, msgOffset)
mc.ExpectErrorsDrainedOnClose()
saramaPartitionConsumer, e := saramaConsumer.ConsumePartition(topic, partition, msgOffset)
require.NoError(t, e)

undertest := newConsumer(localFactory, topic, mp, newSaramaClusterConsumer(saramaPartitionConsumer))
undertest.deadlockDetectorFactory = newDeadlockDetectorFactory(localFactory, zap.NewNop(), 10*time.Millisecond)
undertest.Start()
defer undertest.Close()

for i := 0; i < 10; i++ {
time.Sleep(20 * time.Millisecond)
c, _ := localFactory.Snapshot()
if c["sarama-consumer.partition-close|partition=316"] == 1 {
return
}
}
assert.Fail(t, "Did not close partition")
}
127 changes: 127 additions & 0 deletions cmd/ingester/app/consumer/deadlock_detector.go
@@ -0,0 +1,127 @@
// Copyright (c) 2018 The Jaeger Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package consumer

import (
"runtime"
"strconv"
"sync/atomic"
"time"

"github.com/uber/jaeger-lib/metrics"
"go.uber.org/zap"
)

// deadlockDetectorFactory is a factory for deadlockDetectors
type deadlockDetectorFactory struct {
metricsFactory metrics.Factory
logger *zap.Logger
interval time.Duration
panicFunc func(int32)
}

type deadlockDetector struct {
msgConsumed *uint64
ticker *time.Ticker
logger *zap.Logger
closePartition chan struct{}
Member: I assume this does not apply to all-partitions detector?

Contributor Author: It does not.

done chan struct{}
}

func newDeadlockDetectorFactory(factory metrics.Factory, logger *zap.Logger, interval time.Duration) deadlockDetectorFactory {
return deadlockDetectorFactory{
metricsFactory: factory,
logger: logger,
interval: interval,
panicFunc: func(partition int32) {
factory.Counter("deadlockdetector.panic-issued", map[string]string{"partition": strconv.Itoa(int(partition))}).Inc(1)
time.Sleep(time.Second) // Allow time to flush metric

buf := make([]byte, 1<<20)
logger.Panic("No messages processed in the last check interval",
zap.Int32("partition", partition),
zap.String("stack", string(buf[:runtime.Stack(buf, true)])))
},
}
}

// startMonitoringForPartition monitors the messages consumed by the partition and signals for the partition to be
// closed by sending a message on the closePartition channel.
//
// Closing the partition should result in a rebalance, which alleviates the condition. This means that rebalances can
// happen frequently if there is no traffic on the Kafka topic. This shouldn't affect normal operations.
//
// If the message send isn't processed within the next check interval, a panic is issued. This hack relies on a
// container management system (k8s, aurora, marathon, etc) to reschedule
// the dead instance.
//
// This hack protects jaeger-ingester from issues described in https://github.com/jaegertracing/jaeger/issues/1052
//
func (s *deadlockDetectorFactory) startMonitoringForPartition(partition int32) *deadlockDetector {
var msgConsumed uint64
w := &deadlockDetector{
msgConsumed: &msgConsumed,
ticker: time.NewTicker(s.interval),
Member: nit: you do not need to leak the ticker. Create it inside the goroutine as a local var with defer close.

Contributor Author: 👍

closePartition: make(chan struct{}, 1),
done: make(chan struct{}),
logger: s.logger,
}

go func() {
for range w.ticker.C {
Member: this is an odd pattern. If you close(done), nothing will happen till the ticker fires. You probably want an unconditional for{} and then select across all channels, including the ticker.

select {
case <-w.done:
s.logger.Info("Closing ticker routine", zap.Int32("partition", partition))
return
default:
if atomic.LoadUint64(w.msgConsumed) == 0 {
select {
case w.closePartition <- struct{}{}:
s.logger.Warn("Signalling partition close due to inactivity", zap.Int32("partition", partition))
vprithvi marked this conversation as resolved.
Show resolved Hide resolved
default:
// If closePartition is blocked, the consumer might have deadlocked - kill the process
s.panicFunc(partition)
}
} else {
atomic.StoreUint64(w.msgConsumed, 0)
}
}
}
}()

return w
}
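Combining the two suggestions above (ticker as a local inside the goroutine, and an unconditional for{} selecting across all channels so close() takes effect immediately rather than on the next tick), the monitoring loop might look roughly like this. It reuses the names from this file and is only a sketch of the suggested shape, not part of the PR:

```go
go func() {
	ticker := time.NewTicker(s.interval)
	defer ticker.Stop()
	for {
		select {
		case <-w.done:
			s.logger.Info("Closing ticker routine", zap.Int32("partition", partition))
			return
		case <-ticker.C:
			if atomic.LoadUint64(w.msgConsumed) == 0 {
				select {
				case w.closePartition <- struct{}{}:
					s.logger.Warn("Signalling partition close due to inactivity", zap.Int32("partition", partition))
				default:
					// closePartition still holds an unread signal from the last tick;
					// the consumer may be deadlocked, so escalate.
					s.panicFunc(partition)
				}
			} else {
				atomic.StoreUint64(w.msgConsumed, 0)
			}
		}
	}
}()
```

With that shape the deadlockDetector would no longer need to carry the ticker field at all, and close() would only need to signal done.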

// startMonitoring monitors that the sum of messages consumed across all partitions is non-zero for the given interval.
// If it is zero when there are producers producing messages on the topic, it means that sarama-cluster hasn't
// retrieved partition assignments. (This case will not be caught by startMonitoringForPartition because no partitions
// were retrieved).
func (s *deadlockDetectorFactory) startMonitoring() *deadlockDetector {
return s.startMonitoringForPartition(-1)
}

func (w *deadlockDetector) getClosePartition() chan struct{} {
return w.closePartition
}

func (w *deadlockDetector) close() {
w.logger.Info("Closing deadlock detector")
Member: nit: better to capture partition # in the detector and include it in the log stmt

w.done <- struct{}{}
w.ticker.Stop()
}
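A minimal sketch of the nit above, assuming the detector gained a partition field (hypothetical, recorded when the detector is created in startMonitoringForPartition):

```go
func (w *deadlockDetector) close() {
	// w.partition is a hypothetical field set at creation time.
	w.logger.Info("Closing deadlock detector", zap.Int32("partition", w.partition))
	w.done <- struct{}{}
	w.ticker.Stop()
}
```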

func (w *deadlockDetector) incrementMsgCount() {
atomic.AddUint64(w.msgConsumed, 1)
}