Skip to content

Commit 2760116

Browse files
[pkg/stanza] Introduce batching logs in File consumer (#36663)
#### Description Modifies the File consumer to emit logs in batches as opposed to sending each log individually through the Stanza pipeline and on to the Log Emitter. Here are the changes introduced: - 6b4c9fe Changed the `Reader::ReadToEnd` method in File consumer to collect the tokens scanned from the file into batches. At this point, the Reader still emits each token individually, as the `emit.Callback` function only accepts a single token. - c206995 Changed `emit.Callback` function signature to accept a slice of tokens as opposed to a single token, and changed the Reader to emit a batch of tokens in one request. At this point, the batches are still split into individual tokens inside the `emit` function, because the Stanza operators can only process one entry at a time. - aedda3a Added `ProcessBatch` method to Stanza operators and used it in the `emit` function. At this point, the batch of tokens is translated to a batch of entries and passed to Log Emitter as a whole batch. The batch is still split in the Log Emitter, which calls `consumeFunc` for each entry in a loop. - 13d6054 Changed the LogEmitter to add the whole batch to its buffer, as opposed to adding entries one by one. **Slice of entries `[]entry.Entry` vs. slice of pointers `[]*entry.Entry`** I considered whether the `ProcessBatch` method in the `Operator` interface should accept a slice of structs `[]entry.Entry` or a slice of pointers `[]*entry.Entry`. I ran some tests (similar to #35454) and they showed a 7-10% performance loss when using a slice of structs vs. a slice of pointers. That's why I decided to use the slice of pointers `[]*entry.Entry`. #### Link to tracking issue - Fixes #35455 #### Testing No changes in tests. The goal is for the functionality to not change and for performance to not decrease. I have added a new benchmark in a separate PR #38054 that should be helpful in assessing the performance impact of this change. 
#### Documentation These are internal changes, no user documentation needs changing.
1 parent 7a7d61b commit 2760116

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

53 files changed

+454
-186
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Use this changelog template to create an entry for release notes.
2+
3+
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
4+
change_type: breaking
5+
6+
# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
7+
component: pkg/stanza
8+
9+
# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
10+
note: Add method `ProcessBatch` to `Operator` interface in `pkg/stanza/operator` package to support batch processing.
11+
12+
# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
13+
issues: [35455]
14+
15+
# (Optional) One or more lines of additional information to render under the primary note.
16+
# These lines will be padded with 2 spaces and then inserted directly into the document.
17+
# Use pipe (|) for multiline entries.
18+
subtext:
19+
20+
# If your change doesn't affect end users or the exported elements of any package,
21+
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
22+
# Optional: The change log or logs in which this entry should be included.
23+
# e.g. '[user]' or '[user, api]'
24+
# Include 'user' if the change is relevant to end users.
25+
# Include 'api' if there is a change to a library API.
26+
# Default: '[user]'
27+
change_logs: [api]
+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Use this changelog template to create an entry for release notes.
2+
3+
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
4+
change_type: breaking
5+
6+
# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
7+
component: pkg/stanza
8+
9+
# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
10+
note: Change signature of `emit.Callback` function in `pkg/stanza/fileconsumer/emit` package to emit multiple tokens.
11+
12+
# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
13+
issues: [35455]
14+
15+
# (Optional) One or more lines of additional information to render under the primary note.
16+
# These lines will be padded with 2 spaces and then inserted directly into the document.
17+
# Use pipe (|) for multiline entries.
18+
subtext:
19+
20+
# If your change doesn't affect end users or the exported elements of any package,
21+
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
22+
# Optional: The change log or logs in which this entry should be included.
23+
# e.g. '[user]' or '[user, api]'
24+
# Include 'user' if the change is relevant to end users.
25+
# Include 'api' if there is a change to a library API.
26+
# Default: '[user]'
27+
change_logs: [api]

pkg/stanza/adapter/mocks_test.go

+4
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,10 @@ func (o *UnstartableOperator) Start(_ operator.Persister) error {
5050
return errors.New("something very unusual happened")
5151
}
5252

53+
func (o *UnstartableOperator) ProcessBatch(_ context.Context, _ []*entry.Entry) error {
54+
return nil
55+
}
56+
5357
// Process will return nil
5458
func (o *UnstartableOperator) Process(_ context.Context, _ *entry.Entry) error {
5559
return nil

pkg/stanza/fileconsumer/benchmark_test.go

+2-3
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ import (
1616
"github.com/stretchr/testify/require"
1717
"go.opentelemetry.io/collector/component/componenttest"
1818

19-
"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/emit"
2019
"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/fingerprint"
2120
"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/internal/filetest"
2221
"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/testutil"
@@ -188,8 +187,8 @@ func BenchmarkFileInput(b *testing.B) {
188187
cfg.PollInterval = time.Microsecond
189188

190189
doneChan := make(chan bool, len(files))
191-
callback := func(_ context.Context, token emit.Token) error {
192-
if len(token.Body) == 0 {
190+
callback := func(_ context.Context, tokens [][]byte, _ map[string]any, _ int64) error {
191+
if len(tokens) > 0 && len(tokens[len(tokens)-1]) == 0 {
193192
doneChan <- true
194193
}
195194
return nil

pkg/stanza/fileconsumer/emit/emit.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ import (
77
"context"
88
)
99

10-
type Callback func(ctx context.Context, token Token) error
10+
type Callback func(ctx context.Context, tokens [][]byte, attributes map[string]any, lastRecordNumber int64) error
1111

1212
type Token struct {
1313
Body []byte

pkg/stanza/fileconsumer/file_test.go

-75
Original file line numberDiff line numberDiff line change
@@ -1580,78 +1580,3 @@ func TestReadGzipCompressedLogsFromEnd(t *testing.T) {
15801580
operator.poll(context.TODO())
15811581
sink.ExpectToken(t, []byte("testlog4"))
15821582
}
1583-
1584-
func TestIncludeFileRecordNumber(t *testing.T) {
1585-
t.Parallel()
1586-
1587-
tempDir := t.TempDir()
1588-
cfg := NewConfig().includeDir(tempDir)
1589-
cfg.StartAt = "beginning"
1590-
cfg.IncludeFileRecordNumber = true
1591-
operator, sink := testManager(t, cfg)
1592-
1593-
// Create a file, then start
1594-
temp := filetest.OpenTemp(t, tempDir)
1595-
filetest.WriteString(t, temp, "testlog1\n")
1596-
1597-
require.NoError(t, operator.Start(testutil.NewUnscopedMockPersister()))
1598-
defer func() {
1599-
require.NoError(t, operator.Stop())
1600-
}()
1601-
1602-
sink.ExpectCall(t, []byte("testlog1"), map[string]any{
1603-
attrs.LogFileName: filepath.Base(temp.Name()),
1604-
attrs.LogFileRecordNumber: int64(1),
1605-
})
1606-
}
1607-
1608-
func TestIncludeFileRecordNumberWithHeaderConfigured(t *testing.T) {
1609-
t.Parallel()
1610-
1611-
tempDir := t.TempDir()
1612-
cfg := NewConfig().includeDir(tempDir)
1613-
cfg.StartAt = "beginning"
1614-
cfg.IncludeFileRecordNumber = true
1615-
cfg = cfg.withHeader("^#", "(?P<header_attr>[A-z]+)")
1616-
operator, sink := testManager(t, cfg)
1617-
1618-
// Create a file, then start
1619-
temp := filetest.OpenTemp(t, tempDir)
1620-
filetest.WriteString(t, temp, "#abc\n#xyz: headerValue2\ntestlog1\n")
1621-
1622-
require.NoError(t, operator.Start(testutil.NewUnscopedMockPersister()))
1623-
defer func() {
1624-
require.NoError(t, operator.Stop())
1625-
}()
1626-
1627-
sink.ExpectCall(t, []byte("testlog1"), map[string]any{
1628-
attrs.LogFileName: filepath.Base(temp.Name()),
1629-
attrs.LogFileRecordNumber: int64(1),
1630-
"header_attr": "xyz",
1631-
})
1632-
}
1633-
1634-
func TestIncludeFileRecordNumberWithHeaderConfiguredButMissing(t *testing.T) {
1635-
t.Parallel()
1636-
1637-
tempDir := t.TempDir()
1638-
cfg := NewConfig().includeDir(tempDir)
1639-
cfg.StartAt = "beginning"
1640-
cfg.IncludeFileRecordNumber = true
1641-
cfg = cfg.withHeader("^#", "(?P<header_key>[A-z]+): (?P<header_value>[A-z]+)")
1642-
operator, sink := testManager(t, cfg)
1643-
1644-
// Create a file, then start
1645-
temp := filetest.OpenTemp(t, tempDir)
1646-
filetest.WriteString(t, temp, "testlog1\n")
1647-
1648-
require.NoError(t, operator.Start(testutil.NewUnscopedMockPersister()))
1649-
defer func() {
1650-
require.NoError(t, operator.Stop())
1651-
}()
1652-
1653-
sink.ExpectCall(t, []byte("testlog1"), map[string]any{
1654-
attrs.LogFileName: filepath.Base(temp.Name()),
1655-
attrs.LogFileRecordNumber: int64(1),
1656-
})
1657-
}

pkg/stanza/fileconsumer/internal/emittest/nop.go

+1-3
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,8 @@ package emittest // import "github.com/open-telemetry/opentelemetry-collector-co
55

66
import (
77
"context"
8-
9-
"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/emit"
108
)
119

12-
func Nop(_ context.Context, _ emit.Token) error {
10+
func Nop(_ context.Context, _ [][]byte, _ map[string]any, _ int64) error {
1311
return nil
1412
}

pkg/stanza/fileconsumer/internal/emittest/nop_test.go

+1-3
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,8 @@ import (
88
"testing"
99

1010
"github.com/stretchr/testify/require"
11-
12-
"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/emit"
1311
)
1412

1513
func TestNop(t *testing.T) {
16-
require.NoError(t, Nop(context.Background(), emit.Token{}))
14+
require.NoError(t, Nop(context.Background(), [][]byte{}, map[string]any{}, int64(0)))
1715
}

pkg/stanza/fileconsumer/internal/emittest/sink.go

+7-5
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,13 @@ func NewSink(opts ...SinkOpt) *Sink {
5252
return &Sink{
5353
emitChan: emitChan,
5454
timeout: cfg.timeout,
55-
Callback: func(ctx context.Context, token emit.Token) error {
56-
select {
57-
case <-ctx.Done():
58-
return ctx.Err()
59-
case emitChan <- token:
55+
Callback: func(ctx context.Context, tokens [][]byte, attributes map[string]any, _ int64) error {
56+
for _, token := range tokens {
57+
select {
58+
case <-ctx.Done():
59+
return ctx.Err()
60+
case emitChan <- emit.NewToken(token, attributes):
61+
}
6062
}
6163
return nil
6264
},

pkg/stanza/fileconsumer/internal/emittest/sink_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,7 @@ func sinkTest(t *testing.T, opts ...SinkOpt) (*Sink, []emit.Token) {
204204
}
205205
go func() {
206206
for _, c := range testCalls {
207-
assert.NoError(t, s.Callback(context.Background(), emit.NewToken(c.Body, c.Attributes)))
207+
assert.NoError(t, s.Callback(context.Background(), [][]byte{c.Body}, c.Attributes, 0))
208208
}
209209
}()
210210
return s, testCalls

pkg/stanza/fileconsumer/internal/header/output.go

+7
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,13 @@ func newPipelineOutput(set component.TelemetrySettings) *pipelineOutput {
3030
}
3131
}
3232

33+
func (e *pipelineOutput) ProcessBatch(ctx context.Context, entries []*entry.Entry) error {
34+
for i := range entries {
35+
_ = e.Process(ctx, entries[i])
36+
}
37+
return nil
38+
}
39+
3340
// Drop the entry if logChan is full, in order to avoid this operator blocking.
3441
// This protects against a case where an operator could return an error, but continue propagating a log entry,
3542
// leaving an unexpected entry in the output channel.

pkg/stanza/fileconsumer/internal/reader/factory.go

+4-2
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,9 @@ import (
2424
)
2525

2626
const (
27-
DefaultMaxLogSize = 1024 * 1024
28-
DefaultFlushPeriod = 500 * time.Millisecond
27+
DefaultMaxLogSize = 1024 * 1024
28+
DefaultFlushPeriod = 500 * time.Millisecond
29+
DefaultMaxBatchSize = 100
2930
)
3031

3132
type Factory struct {
@@ -81,6 +82,7 @@ func (f *Factory) NewReaderFromMetadata(file *os.File, m *Metadata) (r *Reader,
8182
includeFileRecordNum: f.IncludeFileRecordNumber,
8283
compression: f.Compression,
8384
acquireFSLock: f.AcquireFSLock,
85+
maxBatchSize: DefaultMaxBatchSize,
8486
emitFunc: f.EmitFunc,
8587
}
8688
r.set.Logger = r.set.Logger.With(zap.String("path", r.fileName))

pkg/stanza/fileconsumer/internal/reader/reader.go

+21-8
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ import (
1616
"golang.org/x/text/encoding"
1717

1818
"github.com/open-telemetry/opentelemetry-collector-contrib/internal/coreinternal/textutils"
19-
"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/attrs"
2019
"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/emit"
2120
"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/fingerprint"
2221
"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/header"
@@ -55,6 +54,7 @@ type Reader struct {
5554
includeFileRecordNum bool
5655
compression string
5756
acquireFSLock bool
57+
maxBatchSize int
5858
}
5959

6060
// ReadToEnd will read until the end of the file
@@ -188,6 +188,8 @@ func (r *Reader) readContents(ctx context.Context) {
188188

189189
s := scanner.New(r, r.maxLogSize, bufferSize, r.Offset, r.contentSplitFunc)
190190

191+
tokenBodies := make([][]byte, r.maxBatchSize)
192+
numTokensBatched := 0
191193
// Iterate over the contents of the file.
192194
for {
193195
select {
@@ -203,27 +205,38 @@ func (r *Reader) readContents(ctx context.Context) {
203205
} else if r.deleteAtEOF {
204206
r.delete()
205207
}
208+
209+
if numTokensBatched > 0 {
210+
err := r.emitFunc(ctx, tokenBodies[:numTokensBatched], r.FileAttributes, r.RecordNum)
211+
if err != nil {
212+
r.set.Logger.Error("failed to emit token", zap.Error(err))
213+
}
214+
r.Offset = s.Pos()
215+
}
206216
return
207217
}
208218

209-
token, err := r.decoder.Bytes(s.Bytes())
219+
var err error
220+
tokenBodies[numTokensBatched], err = r.decoder.Bytes(s.Bytes())
210221
if err != nil {
211222
r.set.Logger.Error("failed to decode token", zap.Error(err))
212223
r.Offset = s.Pos() // move past the bad token or we may be stuck
213224
continue
214225
}
226+
numTokensBatched++
215227

216228
if r.includeFileRecordNum {
217229
r.RecordNum++
218-
r.FileAttributes[attrs.LogFileRecordNumber] = r.RecordNum
219230
}
220231

221-
err = r.emitFunc(ctx, emit.NewToken(token, r.FileAttributes))
222-
if err != nil {
223-
r.set.Logger.Error("failed to process token", zap.Error(err))
232+
if r.maxBatchSize > 0 && numTokensBatched >= r.maxBatchSize {
233+
err := r.emitFunc(ctx, tokenBodies[:numTokensBatched], r.FileAttributes, r.RecordNum)
234+
if err != nil {
235+
r.set.Logger.Error("failed to emit token", zap.Error(err))
236+
}
237+
numTokensBatched = 0
238+
r.Offset = s.Pos()
224239
}
225-
226-
r.Offset = s.Pos()
227240
}
228241
}
229242

pkg/stanza/fileconsumer/internal/reader/reader_test.go

+2-4
Original file line numberDiff line numberDiff line change
@@ -347,10 +347,8 @@ func BenchmarkFileRead(b *testing.B) {
347347

348348
// Use a long flush period to ensure it does not expire DURING a ReadToEnd
349349
counter := atomic.Int64{}
350-
f := newTestFactory(b, func(_ context.Context, token emit.Token) error {
351-
if len(token.Body) != 0 {
352-
counter.Add(1)
353-
}
350+
f := newTestFactory(b, func(_ context.Context, tokens [][]byte, _ map[string]any, _ int64) error {
351+
counter.Add(int64(len(tokens)))
354352
return nil
355353
})
356354
b.ReportAllocs()

pkg/stanza/operator/helper/emitter.go

+25
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,31 @@ func (e *LogEmitter) Stop() error {
9999
return nil
100100
}
101101

102+
// ProcessBatch emits the entries to the consumerFunc
103+
func (e *LogEmitter) ProcessBatch(ctx context.Context, entries []*entry.Entry) error {
104+
if oldBatch := e.appendEntries(entries); len(oldBatch) > 0 {
105+
e.consumerFunc(ctx, oldBatch)
106+
}
107+
108+
return nil
109+
}
110+
111+
// appendEntries appends the entry to the current batch. If maxBatchSize is reached, a new batch will be made, and the old batch
112+
// (which should be flushed) will be returned
113+
func (e *LogEmitter) appendEntries(entries []*entry.Entry) []*entry.Entry {
114+
e.batchMux.Lock()
115+
defer e.batchMux.Unlock()
116+
117+
e.batch = append(e.batch, entries...)
118+
if uint(len(e.batch)) >= e.maxBatchSize {
119+
var oldBatch []*entry.Entry
120+
oldBatch, e.batch = e.batch, make([]*entry.Entry, 0, e.maxBatchSize)
121+
return oldBatch
122+
}
123+
124+
return nil
125+
}
126+
102127
// Process will emit an entry to the output channel
103128
func (e *LogEmitter) Process(ctx context.Context, ent *entry.Entry) error {
104129
if oldBatch := e.appendEntry(ent); len(oldBatch) > 0 {

pkg/stanza/operator/helper/input.go

+9
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,15 @@ func (i *InputOperator) CanProcess() bool {
8282
return false
8383
}
8484

85+
// ProcessBatch will always return an error if called.
86+
func (i *InputOperator) ProcessBatch(_ context.Context, _ []*entry.Entry) error {
87+
i.Logger().Error("Operator received a batch of entries, but can not process")
88+
return errors.NewError(
89+
"Operator can not process logs.",
90+
"Ensure that operator is not configured to receive logs from other operators",
91+
)
92+
}
93+
8594
// Process will always return an error if called.
8695
func (i *InputOperator) Process(_ context.Context, _ *entry.Entry) error {
8796
i.Logger().Error("Operator received an entry, but can not process")

0 commit comments

Comments
 (0)