From 02557d07172168798cb9314ac8e6daa153eb0983 Mon Sep 17 00:00:00 2001
From: John Kerl
Date: Sun, 12 Dec 2021 23:07:16 -0500
Subject: [PATCH 01/28] todo

---
 todo.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/todo.txt b/todo.txt
index f040fc4929..9aee947f10 100644
--- a/todo.txt
+++ b/todo.txt
@@ -1,6 +1,8 @@
 ================================================================
 PUNCHDOWN LIST
 
+* --ifs-regex & --ips-regex -- guessing is not safe as evidenced by '.' and '|'
+
 * perf:
   o go tool pprof -http=:8080 cpu.pprof
   x close(chan) as EOS throughout

From 03cd9e0e4e0bfceea70317e5e43d2661768a4fb7 Mon Sep 17 00:00:00 2001
From: John Kerl
Date: Tue, 7 Dec 2021 07:56:38 -0500
Subject: [PATCH 02/28] Rename inputChannel,outputChannel to readerChannel,writerChannel

---
 todo.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/todo.txt b/todo.txt
index f040fc4929..c9b57d1af9 100644
--- a/todo.txt
+++ b/todo.txt
@@ -1,6 +1,7 @@
 ================================================================
 PUNCHDOWN LIST
 
+* perf wup @ rgp.md
 * perf:
   o go tool pprof -http=:8080 cpu.pprof
   x close(chan) as EOS throughout

From 8abd334145df813b12c024935bab8a5e543e1141 Mon Sep 17 00:00:00 2001
From: John Kerl
Date: Tue, 7 Dec 2021 08:01:16 -0500
Subject: [PATCH 03/28] Rename inputChannel,outputChannel to readerChannel,writerChannel (#772)

---
 todo.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/todo.txt b/todo.txt
index c9b57d1af9..7ec9ba8d1e 100644
--- a/todo.txt
+++ b/todo.txt
@@ -70,6 +70,10 @@ PUNCHDOWN LIST
   d how to handle tail -f and repl
   > try unconditional batching and hiding *first* to see how much best-case perf to be had
   ? lazy type-infer?? needs careful use of accessor-mutators in place of mv.type etc
+<<<<<<< HEAD
+=======
+* note somewhere why NewEndOfStreamMarker instead of channel close -- readers carry final context
+>>>>>>> f2e709408 (Rename inputChannel,outputChannel to readerChannel,writerChannel (#772))
 
 * blockers:
   - keep checking issues

From 2f8d2b18f91d0d303a31bbd2753ef6842df9ebd1 Mon Sep 17 00:00:00 2001
From: John Kerl
Date: Tue, 7 Dec 2021 08:49:28 -0500
Subject: [PATCH 04/28] Start batched-reader API mods

---
 internal/pkg/input/record_reader_nidx.go | 1 -
 1 file changed, 1 deletion(-)

diff --git a/internal/pkg/input/record_reader_nidx.go b/internal/pkg/input/record_reader_nidx.go
index 7fecbb5741..9ecf71a48f 100644
--- a/internal/pkg/input/record_reader_nidx.go
+++ b/internal/pkg/input/record_reader_nidx.go
@@ -87,7 +87,6 @@ func (reader *RecordReaderNIDX) processHandle(
 			break
 		}
 
-		// TODO: IRS
 		line := scanner.Text()
 
 		// Check for comments-in-data feature

From f2879aebde70ec56ab6fe453092f94d36f2cbb18 Mon Sep 17 00:00:00 2001
From: John Kerl
Date: Wed, 8 Dec 2021 15:50:07 -0500
Subject: [PATCH 05/28] Singleton-list step for reader-batching at input

---
 internal/pkg/auxents/repl/session.go | 2 +-
 internal/pkg/auxents/repl/types.go | 4 +--
 internal/pkg/auxents/repl/verbs.go | 33 +++++++++++------
 internal/pkg/cli/option_types.go | 6 ++++
 internal/pkg/input/pseudo_reader_gen.go | 20 +++++++----
 internal/pkg/input/record_reader.go | 3 +-
 internal/pkg/input/record_reader_csv.go | 24 ++++++++-----
 internal/pkg/input/record_reader_csvlite.go | 36 ++++++++++++-------
 internal/pkg/input/record_reader_dkvp.go | 29 +++++++++------
 internal/pkg/input/record_reader_factory.go | 18 +++++-----
 internal/pkg/input/record_reader_json.go | 30 +++++++++-------
 internal/pkg/input/record_reader_nidx.go | 22 +++++++-----
 internal/pkg/input/record_reader_xtab.go | 23 +++++++-----
internal/pkg/stream/stream.go | 35 +++++++++++++++--- internal/pkg/transformers/join.go | 11 ++++-- .../transformers/utils/join-bucket-keeper.go | 11 +++--- internal/pkg/types/context.go | 29 +++++++++++++++ todo.txt | 18 ++++------ 18 files changed, 239 insertions(+), 115 deletions(-) diff --git a/internal/pkg/auxents/repl/session.go b/internal/pkg/auxents/repl/session.go index 1c4d1a019c..04dfd30150 100644 --- a/internal/pkg/auxents/repl/session.go +++ b/internal/pkg/auxents/repl/session.go @@ -47,7 +47,7 @@ func NewRepl( recordOutputStream *os.File, ) (*Repl, error) { - recordReader, err := input.Create(&options.ReaderOptions) + recordReader, err := input.Create(&options.ReaderOptions, 1) // recordsPerBatch if err != nil { return nil, err } diff --git a/internal/pkg/auxents/repl/types.go b/internal/pkg/auxents/repl/types.go index f33e45a3f6..ea20658a67 100644 --- a/internal/pkg/auxents/repl/types.go +++ b/internal/pkg/auxents/repl/types.go @@ -6,6 +6,7 @@ package repl import ( "bufio" + "container/list" "os" "github.com/johnkerl/miller/internal/pkg/cli" @@ -13,7 +14,6 @@ import ( "github.com/johnkerl/miller/internal/pkg/input" "github.com/johnkerl/miller/internal/pkg/output" "github.com/johnkerl/miller/internal/pkg/runtime" - "github.com/johnkerl/miller/internal/pkg/types" ) // ================================================================ @@ -46,7 +46,7 @@ type Repl struct { options *cli.TOptions - readerChannel chan *types.RecordAndContext + readerChannel chan *list.List // list of *types.RecordAndContext errorChannel chan error downstreamDoneChannel chan bool recordReader input.IRecordReader diff --git a/internal/pkg/auxents/repl/verbs.go b/internal/pkg/auxents/repl/verbs.go index 9b4f29de52..5197dc8b62 100644 --- a/internal/pkg/auxents/repl/verbs.go +++ b/internal/pkg/auxents/repl/verbs.go @@ -5,6 +5,7 @@ package repl import ( + "container/list" "fmt" "os" "strings" @@ -218,7 +219,7 @@ func (repl *Repl) openFiles(filenames []string) { // Remember for :reopen repl.options.FileNames = filenames - repl.readerChannel = make(chan *types.RecordAndContext, 10) + repl.readerChannel = make(chan *list.List, 2) // list of *types.RecordAndContext repl.errorChannel = make(chan error, 1) repl.downstreamDoneChannel = make(chan bool, 1) @@ -270,11 +271,11 @@ func handleRead(repl *Repl, args []string) bool { return true } - var recordAndContext *types.RecordAndContext = nil + var recordsAndContexts *list.List // list of *types.RecordAndContext var err error = nil select { - case recordAndContext = <-repl.readerChannel: + case recordsAndContexts = <-repl.readerChannel: break case err = <-repl.errorChannel: break @@ -287,7 +288,11 @@ func handleRead(repl *Repl, args []string) bool { return true } - if recordAndContext != nil { + if recordsAndContexts != nil { + // TODO: comment and make very clear we've set this all up to batch by 1 for the REPL + lib.InternalCodingErrorIf(recordsAndContexts.Len() != 1) + recordAndContext := recordsAndContexts.Front().Value.(*types.RecordAndContext) + skipOrProcessRecord( repl, recordAndContext, @@ -414,12 +419,12 @@ func handleProcess(repl *Repl, args []string) bool { // ---------------------------------------------------------------- func handleSkipOrProcessN(repl *Repl, n int, processingNotSkipping bool) { - var recordAndContext *types.RecordAndContext = nil + var recordsAndContexts *list.List // list of *types.RecordAndContext var err error = nil for i := 1; i <= n; i++ { select { - case recordAndContext = <-repl.readerChannel: + case 
recordsAndContexts = <-repl.readerChannel: break case err = <-repl.errorChannel: break @@ -434,7 +439,11 @@ func handleSkipOrProcessN(repl *Repl, n int, processingNotSkipping bool) { return } - if recordAndContext != nil { + if recordsAndContexts != nil { + // TODO: comment and make very clear we've set this all up to batch by 1 for the REPL + lib.InternalCodingErrorIf(recordsAndContexts.Len() != 1) + recordAndContext := recordsAndContexts.Front().Value.(*types.RecordAndContext) + shouldBreak := skipOrProcessRecord( repl, recordAndContext, @@ -472,12 +481,12 @@ func handleSkipOrProcessUntil(repl *Repl, dslString string, processingNotSkippin return } - var recordAndContext *types.RecordAndContext = nil + var recordsAndContexts *list.List // list of *types.RecordAndContext for { doubleBreak := false select { - case recordAndContext = <-repl.readerChannel: + case recordsAndContexts = <-repl.readerChannel: break case err = <-repl.errorChannel: break @@ -496,7 +505,11 @@ func handleSkipOrProcessUntil(repl *Repl, dslString string, processingNotSkippin return } - if recordAndContext != nil { + if recordsAndContexts != nil { + // TODO: comment and make very clear we've set this all up to batch by 1 for the REPL + lib.InternalCodingErrorIf(recordsAndContexts.Len() != 1) + recordAndContext := recordsAndContexts.Front().Value.(*types.RecordAndContext) + shouldBreak := skipOrProcessRecord( repl, recordAndContext, diff --git a/internal/pkg/cli/option_types.go b/internal/pkg/cli/option_types.go index d9063cf567..4529a2ac2a 100644 --- a/internal/pkg/cli/option_types.go +++ b/internal/pkg/cli/option_types.go @@ -72,6 +72,9 @@ type TReaderOptions struct { PrepipeIsRaw bool // For in-process gunzip/bunzip2/zcat (distinct from prepipe) FileInputEncoding lib.TFileInputEncoding + + // TODO: comment + RecordsPerBatch int } // ---------------------------------------------------------------- @@ -181,6 +184,9 @@ func DefaultReaderOptions() TReaderOptions { StepAsString: DEFAULT_GEN_STEP_AS_STRING, StopAsString: DEFAULT_GEN_STOP_AS_STRING, }, + + // TODO: make a cli option + RecordsPerBatch: 500, } } diff --git a/internal/pkg/input/pseudo_reader_gen.go b/internal/pkg/input/pseudo_reader_gen.go index e3af2a2e02..e7150df841 100644 --- a/internal/pkg/input/pseudo_reader_gen.go +++ b/internal/pkg/input/pseudo_reader_gen.go @@ -1,6 +1,7 @@ package input import ( + "container/list" "errors" "fmt" @@ -9,29 +10,34 @@ import ( ) type PseudoReaderGen struct { - readerOptions *cli.TReaderOptions + readerOptions *cli.TReaderOptions + recordsPerBatch int } -func NewPseudoReaderGen(readerOptions *cli.TReaderOptions) (*PseudoReaderGen, error) { +func NewPseudoReaderGen( + readerOptions *cli.TReaderOptions, + recordsPerBatch int, +) (*PseudoReaderGen, error) { return &PseudoReaderGen{ - readerOptions: readerOptions, + readerOptions: readerOptions, + recordsPerBatch: recordsPerBatch, }, nil } func (reader *PseudoReaderGen) Read( filenames []string, // ignored context types.Context, - readerChannel chan<- *types.RecordAndContext, + readerChannel chan<- *list.List, // list of *types.RecordAndContext errorChannel chan error, downstreamDoneChannel <-chan bool, // for mlr head ) { reader.process(&context, readerChannel, errorChannel, downstreamDoneChannel) - readerChannel <- types.NewEndOfStreamMarker(&context) + readerChannel <- types.NewEndOfStreamMarkerList(&context) } func (reader *PseudoReaderGen) process( context *types.Context, - readerChannel chan<- *types.RecordAndContext, + readerChannel chan<- *list.List, // list of 
*types.RecordAndContext errorChannel chan error, downstreamDoneChannel <-chan bool, // for mlr head ) { @@ -91,7 +97,7 @@ func (reader *PseudoReaderGen) process( record.PutCopy(key, value) context.UpdateForInputRecord() - readerChannel <- types.NewRecordAndContext( + readerChannel <- types.NewRecordAndContextList( record, context, ) diff --git a/internal/pkg/input/record_reader.go b/internal/pkg/input/record_reader.go index c6f63f0242..f0a58c4086 100644 --- a/internal/pkg/input/record_reader.go +++ b/internal/pkg/input/record_reader.go @@ -2,6 +2,7 @@ package input import ( "bufio" + "container/list" "io" "github.com/johnkerl/miller/internal/pkg/types" @@ -19,7 +20,7 @@ type IRecordReader interface { Read( filenames []string, initialContext types.Context, - readerChannel chan<- *types.RecordAndContext, + readerChannel chan<- *list.List, // list of *types.RecordAndContext errorChannel chan error, downstreamDoneChannel <-chan bool, // for mlr head ) diff --git a/internal/pkg/input/record_reader_csv.go b/internal/pkg/input/record_reader_csv.go index e4de0081f3..b4c46c17a6 100644 --- a/internal/pkg/input/record_reader_csv.go +++ b/internal/pkg/input/record_reader_csv.go @@ -2,6 +2,7 @@ package input import ( "bytes" + "container/list" "encoding/csv" "errors" "fmt" @@ -17,11 +18,15 @@ import ( // ---------------------------------------------------------------- type RecordReaderCSV struct { readerOptions *cli.TReaderOptions + recordsPerBatch int ifs0 byte // Go's CSV library only lets its 'Comma' be a single character } // ---------------------------------------------------------------- -func NewRecordReaderCSV(readerOptions *cli.TReaderOptions) (*RecordReaderCSV, error) { +func NewRecordReaderCSV( + readerOptions *cli.TReaderOptions, + recordsPerBatch int, +) (*RecordReaderCSV, error) { if readerOptions.IRS != "\n" && readerOptions.IRS != "\r\n" { return nil, errors.New("CSV IRS cannot be altered; LF vs CR/LF is autodetected") } @@ -29,8 +34,9 @@ func NewRecordReaderCSV(readerOptions *cli.TReaderOptions) (*RecordReaderCSV, er return nil, errors.New("CSV IFS can only be a single character") } return &RecordReaderCSV{ - readerOptions: readerOptions, - ifs0: readerOptions.IFS[0], + readerOptions: readerOptions, + ifs0: readerOptions.IFS[0], + recordsPerBatch: recordsPerBatch, }, nil } @@ -38,7 +44,7 @@ func NewRecordReaderCSV(readerOptions *cli.TReaderOptions) (*RecordReaderCSV, er func (reader *RecordReaderCSV) Read( filenames []string, context types.Context, - readerChannel chan<- *types.RecordAndContext, + readerChannel chan<- *list.List, // list of *types.RecordAndContext errorChannel chan error, downstreamDoneChannel <-chan bool, // for mlr head ) { @@ -70,7 +76,7 @@ func (reader *RecordReaderCSV) Read( } } } - readerChannel <- types.NewEndOfStreamMarker(&context) + readerChannel <- types.NewEndOfStreamMarkerList(&context) } // ---------------------------------------------------------------- @@ -78,7 +84,7 @@ func (reader *RecordReaderCSV) processHandle( handle io.Reader, filename string, context *types.Context, - readerChannel chan<- *types.RecordAndContext, + readerChannel chan<- *list.List, // list of *types.RecordAndContext errorChannel chan error, downstreamDoneChannel <-chan bool, // for mlr head ) { @@ -204,7 +210,7 @@ func (reader *RecordReaderCSV) processHandle( context.UpdateForInputRecord() - readerChannel <- types.NewRecordAndContext( + readerChannel <- types.NewRecordAndContextList( record, context, ) @@ -216,7 +222,7 @@ func (reader *RecordReaderCSV) processHandle( func 
(reader *RecordReaderCSV) maybeConsumeComment( csvRecord []string, context *types.Context, - readerChannel chan<- *types.RecordAndContext, + readerChannel chan<- *list.List, // list of *types.RecordAndContext ) bool { if reader.readerOptions.CommentHandling == cli.CommentsAreData { // Nothing is to be construed as a comment @@ -249,7 +255,7 @@ func (reader *RecordReaderCSV) maybeConsumeComment( csvWriter.Comma = rune(reader.ifs0) csvWriter.Write(csvRecord) csvWriter.Flush() - readerChannel <- types.NewOutputString(buffer.String(), context) + readerChannel <- types.NewOutputStringList(buffer.String(), context) } else /* reader.readerOptions.CommentHandling == cli.SkipComments */ { // discard entirely } diff --git a/internal/pkg/input/record_reader_csvlite.go b/internal/pkg/input/record_reader_csvlite.go index 7798935fc2..5d86292727 100644 --- a/internal/pkg/input/record_reader_csvlite.go +++ b/internal/pkg/input/record_reader_csvlite.go @@ -19,6 +19,7 @@ package input // 3,4,5,6 3,4,5 import ( + "container/list" "errors" "fmt" "io" @@ -32,20 +33,29 @@ import ( // ---------------------------------------------------------------- type RecordReaderCSVLite struct { - readerOptions *cli.TReaderOptions + readerOptions *cli.TReaderOptions + recordsPerBatch int } // ---------------------------------------------------------------- -func NewRecordReaderCSVLite(readerOptions *cli.TReaderOptions) (*RecordReaderCSVLite, error) { +func NewRecordReaderCSVLite( + readerOptions *cli.TReaderOptions, + recordsPerBatch int, +) (*RecordReaderCSVLite, error) { return &RecordReaderCSVLite{ - readerOptions: readerOptions, + readerOptions: readerOptions, + recordsPerBatch: recordsPerBatch, }, nil } // ---------------------------------------------------------------- -func NewRecordReaderPPRINT(readerOptions *cli.TReaderOptions) (*RecordReaderCSVLite, error) { +func NewRecordReaderPPRINT( + readerOptions *cli.TReaderOptions, + recordsPerBatch int, +) (*RecordReaderCSVLite, error) { return &RecordReaderCSVLite{ - readerOptions: readerOptions, + readerOptions: readerOptions, + recordsPerBatch: recordsPerBatch, }, nil } @@ -53,7 +63,7 @@ func NewRecordReaderPPRINT(readerOptions *cli.TReaderOptions) (*RecordReaderCSVL func (reader *RecordReaderCSVLite) Read( filenames []string, context types.Context, - readerChannel chan<- *types.RecordAndContext, + readerChannel chan<- *list.List, // list of *types.RecordAndContext errorChannel chan error, downstreamDoneChannel <-chan bool, // for mlr head ) { @@ -121,7 +131,7 @@ func (reader *RecordReaderCSVLite) Read( } } } - readerChannel <- types.NewEndOfStreamMarker(&context) + readerChannel <- types.NewEndOfStreamMarkerList(&context) } // ---------------------------------------------------------------- @@ -129,7 +139,7 @@ func (reader *RecordReaderCSVLite) processHandleExplicitCSVHeader( handle io.Reader, filename string, context *types.Context, - readerChannel chan<- *types.RecordAndContext, + readerChannel chan<- *list.List, // list of *types.RecordAndContext errorChannel chan error, downstreamDoneChannel <-chan bool, // for mlr head ) { @@ -170,7 +180,7 @@ func (reader *RecordReaderCSVLite) processHandleExplicitCSVHeader( // Check for comments-in-data feature if strings.HasPrefix(line, reader.readerOptions.CommentString) { if reader.readerOptions.CommentHandling == cli.PassComments { - readerChannel <- types.NewOutputString(line+"\n", context) + readerChannel <- types.NewOutputStringList(line+"\n", context) continue } else if reader.readerOptions.CommentHandling == 
cli.SkipComments { continue @@ -242,7 +252,7 @@ func (reader *RecordReaderCSVLite) processHandleExplicitCSVHeader( } context.UpdateForInputRecord() - readerChannel <- types.NewRecordAndContext( + readerChannel <- types.NewRecordAndContextList( record, context, ) @@ -256,7 +266,7 @@ func (reader *RecordReaderCSVLite) processHandleImplicitCSVHeader( handle io.Reader, filename string, context *types.Context, - readerChannel chan<- *types.RecordAndContext, + readerChannel chan<- *list.List, // list of *types.RecordAndContext errorChannel chan error, downstreamDoneChannel <-chan bool, // for mlr head ) { @@ -293,7 +303,7 @@ func (reader *RecordReaderCSVLite) processHandleImplicitCSVHeader( // Check for comments-in-data feature if strings.HasPrefix(line, reader.readerOptions.CommentString) { if reader.readerOptions.CommentHandling == cli.PassComments { - readerChannel <- types.NewOutputString(line+"\n", context) + readerChannel <- types.NewOutputStringList(line+"\n", context) continue } else if reader.readerOptions.CommentHandling == cli.SkipComments { continue @@ -373,7 +383,7 @@ func (reader *RecordReaderCSVLite) processHandleImplicitCSVHeader( } context.UpdateForInputRecord() - readerChannel <- types.NewRecordAndContext( + readerChannel <- types.NewRecordAndContextList( record, context, ) diff --git a/internal/pkg/input/record_reader_dkvp.go b/internal/pkg/input/record_reader_dkvp.go index 87cb16f9a9..bd39d9f5e8 100644 --- a/internal/pkg/input/record_reader_dkvp.go +++ b/internal/pkg/input/record_reader_dkvp.go @@ -1,6 +1,7 @@ package input import ( + "container/list" "io" "strconv" "strings" @@ -11,19 +12,24 @@ import ( ) type RecordReaderDKVP struct { - readerOptions *cli.TReaderOptions + readerOptions *cli.TReaderOptions + recordsPerBatch int } -func NewRecordReaderDKVP(readerOptions *cli.TReaderOptions) (*RecordReaderDKVP, error) { +func NewRecordReaderDKVP( + readerOptions *cli.TReaderOptions, + recordsPerBatch int, +) (*RecordReaderDKVP, error) { return &RecordReaderDKVP{ - readerOptions: readerOptions, + readerOptions: readerOptions, + recordsPerBatch: recordsPerBatch, }, nil } func (reader *RecordReaderDKVP) Read( filenames []string, context types.Context, - readerChannel chan<- *types.RecordAndContext, + readerChannel chan<- *list.List, // list of *types.RecordAndContext errorChannel chan error, downstreamDoneChannel <-chan bool, // for mlr head ) { @@ -55,21 +61,22 @@ func (reader *RecordReaderDKVP) Read( } } } - readerChannel <- types.NewEndOfStreamMarker(&context) + readerChannel <- types.NewEndOfStreamMarkerList(&context) } func (reader *RecordReaderDKVP) processHandle( handle io.Reader, filename string, context *types.Context, - readerChannel chan<- *types.RecordAndContext, + readerChannel chan<- *list.List, // list of *types.RecordAndContext errorChannel chan error, downstreamDoneChannel <-chan bool, // for mlr head ) { context.UpdateForStartOfFile(filename) - scanner := NewLineScanner(handle, reader.readerOptions.IRS) - for scanner.Scan() { + lineScanner := NewLineScanner(handle, reader.readerOptions.IRS) + + for lineScanner.Scan() { // See if downstream processors will be ignoring further data (e.g. mlr // head). If so, stop reading. 
This makes 'mlr head hugefile' exit @@ -86,12 +93,12 @@ func (reader *RecordReaderDKVP) processHandle( break } - line := scanner.Text() + line := lineScanner.Text() // Check for comments-in-data feature if strings.HasPrefix(line, reader.readerOptions.CommentString) { if reader.readerOptions.CommentHandling == cli.PassComments { - readerChannel <- types.NewOutputString(line+"\n", context) + readerChannel <- types.NewOutputStringList(line+"\n", context) continue } else if reader.readerOptions.CommentHandling == cli.SkipComments { continue @@ -101,7 +108,7 @@ func (reader *RecordReaderDKVP) processHandle( record := reader.recordFromDKVPLine(line) context.UpdateForInputRecord() - readerChannel <- types.NewRecordAndContext( + readerChannel <- types.NewRecordAndContextList( record, context, ) diff --git a/internal/pkg/input/record_reader_factory.go b/internal/pkg/input/record_reader_factory.go index 9c54ce1759..b6f4fa8290 100644 --- a/internal/pkg/input/record_reader_factory.go +++ b/internal/pkg/input/record_reader_factory.go @@ -7,24 +7,24 @@ import ( "github.com/johnkerl/miller/internal/pkg/cli" ) -func Create(readerOptions *cli.TReaderOptions) (IRecordReader, error) { +func Create(readerOptions *cli.TReaderOptions, recordsPerBatch int) (IRecordReader, error) { switch readerOptions.InputFileFormat { case "csv": - return NewRecordReaderCSV(readerOptions) + return NewRecordReaderCSV(readerOptions, recordsPerBatch) case "csvlite": - return NewRecordReaderCSVLite(readerOptions) + return NewRecordReaderCSVLite(readerOptions, recordsPerBatch) case "dkvp": - return NewRecordReaderDKVP(readerOptions) + return NewRecordReaderDKVP(readerOptions, recordsPerBatch) case "json": - return NewRecordReaderJSON(readerOptions) + return NewRecordReaderJSON(readerOptions, recordsPerBatch) case "nidx": - return NewRecordReaderNIDX(readerOptions) + return NewRecordReaderNIDX(readerOptions, recordsPerBatch) case "pprint": - return NewRecordReaderPPRINT(readerOptions) + return NewRecordReaderPPRINT(readerOptions, recordsPerBatch) case "xtab": - return NewRecordReaderXTAB(readerOptions) + return NewRecordReaderXTAB(readerOptions, recordsPerBatch) case "gen": - return NewPseudoReaderGen(readerOptions) + return NewPseudoReaderGen(readerOptions, recordsPerBatch) default: return nil, errors.New(fmt.Sprintf("input file format \"%s\" not found", readerOptions.InputFileFormat)) } diff --git a/internal/pkg/input/record_reader_json.go b/internal/pkg/input/record_reader_json.go index d3256a3021..e805a9dbf9 100644 --- a/internal/pkg/input/record_reader_json.go +++ b/internal/pkg/input/record_reader_json.go @@ -2,6 +2,7 @@ package input import ( "bufio" + "container/list" "errors" "fmt" "io" @@ -15,19 +16,24 @@ import ( ) type RecordReaderJSON struct { - readerOptions *cli.TReaderOptions + readerOptions *cli.TReaderOptions + recordsPerBatch int } -func NewRecordReaderJSON(readerOptions *cli.TReaderOptions) (*RecordReaderJSON, error) { +func NewRecordReaderJSON( + readerOptions *cli.TReaderOptions, + recordsPerBatch int, +) (*RecordReaderJSON, error) { return &RecordReaderJSON{ - readerOptions: readerOptions, + readerOptions: readerOptions, + recordsPerBatch: recordsPerBatch, }, nil } func (reader *RecordReaderJSON) Read( filenames []string, context types.Context, - readerChannel chan<- *types.RecordAndContext, + readerChannel chan<- *list.List, // list of *types.RecordAndContext errorChannel chan error, downstreamDoneChannel <-chan bool, // for mlr head ) { @@ -59,14 +65,14 @@ func (reader *RecordReaderJSON) Read( } } } - 
readerChannel <- types.NewEndOfStreamMarker(&context) + readerChannel <- types.NewEndOfStreamMarkerList(&context) } func (reader *RecordReaderJSON) processHandle( handle io.Reader, filename string, context *types.Context, - readerChannel chan<- *types.RecordAndContext, + readerChannel chan<- *list.List, // list of *types.RecordAndContext errorChannel chan error, downstreamDoneChannel <-chan bool, // for mlr head ) { @@ -116,7 +122,7 @@ func (reader *RecordReaderJSON) processHandle( return } context.UpdateForInputRecord() - readerChannel <- types.NewRecordAndContext( + readerChannel <- types.NewRecordAndContextList( record, context, ) @@ -143,7 +149,7 @@ func (reader *RecordReaderJSON) processHandle( return } context.UpdateForInputRecord() - readerChannel <- types.NewRecordAndContext( + readerChannel <- types.NewRecordAndContextList( record, context, ) @@ -187,8 +193,8 @@ func (reader *RecordReaderJSON) processHandle( type JSONCommentEnabledReader struct { lineScanner *bufio.Scanner readerOptions *cli.TReaderOptions - context *types.Context // Needed for channelized stdout-printing logic - readerChannel chan<- *types.RecordAndContext + context *types.Context // Needed for channelized stdout-printing logic + readerChannel chan<- *list.List // list of *types.RecordAndContext // In case a line was ingested which was longer than the read-buffer passed // to us, in which case we need to split up that line and return it over @@ -199,7 +205,7 @@ type JSONCommentEnabledReader struct { func NewJSONCommentEnabledReader( underlying io.Reader, readerOptions *cli.TReaderOptions, - readerChannel chan<- *types.RecordAndContext, + readerChannel chan<- *list.List, // list of *types.RecordAndContext ) *JSONCommentEnabledReader { return &JSONCommentEnabledReader{ lineScanner: bufio.NewScanner(underlying), @@ -234,7 +240,7 @@ func (bsr *JSONCommentEnabledReader) Read(p []byte) (n int, err error) { if bsr.readerOptions.CommentHandling == cli.PassComments { // Insert the string into the record-output stream, so that goroutine can // print it, resulting in deterministic output-ordering. 
- bsr.readerChannel <- types.NewOutputString(line+"\n", bsr.context) + bsr.readerChannel <- types.NewOutputStringList(line+"\n", bsr.context) } } } diff --git a/internal/pkg/input/record_reader_nidx.go b/internal/pkg/input/record_reader_nidx.go index 9ecf71a48f..b6fab8eb98 100644 --- a/internal/pkg/input/record_reader_nidx.go +++ b/internal/pkg/input/record_reader_nidx.go @@ -1,6 +1,7 @@ package input import ( + "container/list" "io" "strconv" "strings" @@ -11,19 +12,24 @@ import ( ) type RecordReaderNIDX struct { - readerOptions *cli.TReaderOptions + readerOptions *cli.TReaderOptions + recordsPerBatch int } -func NewRecordReaderNIDX(readerOptions *cli.TReaderOptions) (*RecordReaderNIDX, error) { +func NewRecordReaderNIDX( + readerOptions *cli.TReaderOptions, + recordsPerBatch int, +) (*RecordReaderNIDX, error) { return &RecordReaderNIDX{ - readerOptions: readerOptions, + readerOptions: readerOptions, + recordsPerBatch: recordsPerBatch, }, nil } func (reader *RecordReaderNIDX) Read( filenames []string, context types.Context, - readerChannel chan<- *types.RecordAndContext, + readerChannel chan<- *list.List, // list of *types.RecordAndContext errorChannel chan error, downstreamDoneChannel <-chan bool, // for mlr head ) { @@ -55,14 +61,14 @@ func (reader *RecordReaderNIDX) Read( } } } - readerChannel <- types.NewEndOfStreamMarker(&context) + readerChannel <- types.NewEndOfStreamMarkerList(&context) } func (reader *RecordReaderNIDX) processHandle( handle io.Reader, filename string, context *types.Context, - readerChannel chan<- *types.RecordAndContext, + readerChannel chan<- *list.List, // list of *types.RecordAndContext errorChannel chan error, downstreamDoneChannel <-chan bool, // for mlr head ) { @@ -92,7 +98,7 @@ func (reader *RecordReaderNIDX) processHandle( // Check for comments-in-data feature if strings.HasPrefix(line, reader.readerOptions.CommentString) { if reader.readerOptions.CommentHandling == cli.PassComments { - readerChannel <- types.NewOutputString(line+"\n", context) + readerChannel <- types.NewOutputStringList(line+"\n", context) continue } else if reader.readerOptions.CommentHandling == cli.SkipComments { continue @@ -103,7 +109,7 @@ func (reader *RecordReaderNIDX) processHandle( record := reader.recordFromNIDXLine(line) context.UpdateForInputRecord() - readerChannel <- types.NewRecordAndContext( + readerChannel <- types.NewRecordAndContextList( record, context, ) diff --git a/internal/pkg/input/record_reader_xtab.go b/internal/pkg/input/record_reader_xtab.go index 97765d43af..9de0a3e75c 100644 --- a/internal/pkg/input/record_reader_xtab.go +++ b/internal/pkg/input/record_reader_xtab.go @@ -12,14 +12,19 @@ import ( ) type RecordReaderXTAB struct { - readerOptions *cli.TReaderOptions + readerOptions *cli.TReaderOptions + recordsPerBatch int // Note: XTAB uses two consecutive IFS in place of an IRS; IRS is ignored } // ---------------------------------------------------------------- -func NewRecordReaderXTAB(readerOptions *cli.TReaderOptions) (*RecordReaderXTAB, error) { +func NewRecordReaderXTAB( + readerOptions *cli.TReaderOptions, + recordsPerBatch int, +) (*RecordReaderXTAB, error) { return &RecordReaderXTAB{ - readerOptions: readerOptions, + readerOptions: readerOptions, + recordsPerBatch: recordsPerBatch, }, nil } @@ -27,7 +32,7 @@ func NewRecordReaderXTAB(readerOptions *cli.TReaderOptions) (*RecordReaderXTAB, func (reader *RecordReaderXTAB) Read( filenames []string, context types.Context, - readerChannel chan<- *types.RecordAndContext, + readerChannel chan<- 
*list.List, // list of *types.RecordAndContext errorChannel chan error, downstreamDoneChannel <-chan bool, // for mlr head ) { @@ -59,14 +64,14 @@ func (reader *RecordReaderXTAB) Read( } } } - readerChannel <- types.NewEndOfStreamMarker(&context) + readerChannel <- types.NewEndOfStreamMarkerList(&context) } func (reader *RecordReaderXTAB) processHandle( handle io.Reader, filename string, context *types.Context, - readerChannel chan<- *types.RecordAndContext, + readerChannel chan<- *list.List, // list of *types.RecordAndContext errorChannel chan error, downstreamDoneChannel <-chan bool, // for mlr head ) { @@ -102,7 +107,7 @@ func (reader *RecordReaderXTAB) processHandle( return } context.UpdateForInputRecord() - readerChannel <- types.NewRecordAndContext(record, context) + readerChannel <- types.NewRecordAndContextList(record, context) linesForRecord = list.New() } @@ -114,7 +119,7 @@ func (reader *RecordReaderXTAB) processHandle( // Check for comments-in-data feature if strings.HasPrefix(line, reader.readerOptions.CommentString) { if reader.readerOptions.CommentHandling == cli.PassComments { - readerChannel <- types.NewOutputString(line+reader.readerOptions.IFS, context) + readerChannel <- types.NewOutputStringList(line+reader.readerOptions.IFS, context) continue } else if reader.readerOptions.CommentHandling == cli.SkipComments { continue @@ -133,7 +138,7 @@ func (reader *RecordReaderXTAB) processHandle( return } context.UpdateForInputRecord() - readerChannel <- types.NewRecordAndContext(record, context) + readerChannel <- types.NewRecordAndContextList(record, context) linesForRecord = list.New() } } diff --git a/internal/pkg/stream/stream.go b/internal/pkg/stream/stream.go index f83e59413c..e969fdceee 100644 --- a/internal/pkg/stream/stream.go +++ b/internal/pkg/stream/stream.go @@ -2,6 +2,7 @@ package stream import ( "bufio" + "container/list" "fmt" "io" "os" @@ -47,8 +48,10 @@ func Stream( // passed through the channels along with each record. initialContext := types.NewContext() - // Instantiate the record-reader - recordReader, err := input.Create(&options.ReaderOptions) + // Instantiate the record-reader. + // RecordsPerBatch is tracked separately from ReaderOptions since join/repl + // may use batch size of 1. + recordReader, err := input.Create(&options.ReaderOptions, options.ReaderOptions.RecordsPerBatch) if err != nil { return err } @@ -60,7 +63,8 @@ func Stream( } // Set up the reader-to-transformer and transformer-to-writer channels. 
- readerChannel := make(chan *types.RecordAndContext, 10) + readerChannel := make(chan *list.List, 2) // list of *types.RecordAndContext + tempChannel := make(chan *types.RecordAndContext, 10) writerChannel := make(chan *types.RecordAndContext, 1) // We're done when a fatal error is registered on input (file not found, @@ -81,7 +85,9 @@ func Stream( bufferedOutputStream := bufio.NewWriter(outputStream) go recordReader.Read(fileNames, *initialContext, readerChannel, errorChannel, readerDownstreamDoneChannel) - go transformers.ChainTransformer(readerChannel, readerDownstreamDoneChannel, recordTransformers, + // TODO: temp for iterative batched-reader refactor + go tempReader(readerChannel, tempChannel) + go transformers.ChainTransformer(tempChannel, readerDownstreamDoneChannel, recordTransformers, writerChannel, options) go output.ChannelWriter(writerChannel, recordWriter, &options.WriterOptions, doneWritingChannel, bufferedOutputStream, outputIsStdout) @@ -102,3 +108,24 @@ func Stream( return nil } + +func tempReader( + readerChannel <-chan *list.List, // list of *types.RecordAndContext + transformerChannel chan<- *types.RecordAndContext, +) { + done := false + for !done { + racs := <-readerChannel + + for e := racs.Front(); e != nil; e = e.Next() { + rac := e.Value.(*types.RecordAndContext) + transformerChannel <- rac + + if rac.EndOfStream { + done = true + break + } + } + + } +} diff --git a/internal/pkg/transformers/join.go b/internal/pkg/transformers/join.go index f1faba9cc2..d5c44c7b05 100644 --- a/internal/pkg/transformers/join.go +++ b/internal/pkg/transformers/join.go @@ -458,7 +458,8 @@ func (tr *TransformerJoin) ingestLeftFile() { readerOpts := &tr.opts.joinFlagOptions.ReaderOptions // Instantiate the record-reader - recordReader, err := input.Create(readerOpts) + // TODO: perhaps increase recordsPerBatch, and/or refactor + recordReader, err := input.Create(readerOpts, 1) if recordReader == nil { fmt.Fprintf(os.Stderr, "mlr join: %v\n", err) os.Exit(1) @@ -472,7 +473,7 @@ func (tr *TransformerJoin) ingestLeftFile() { initialContext.UpdateForStartOfFile(tr.opts.leftFileName) // Set up channels for the record-reader. 
- readerChannel := make(chan *types.RecordAndContext, 10) + readerChannel := make(chan *list.List, 2) // list of *types.RecordAndContext errorChannel := make(chan error, 1) downstreamDoneChannel := make(chan bool, 1) @@ -493,7 +494,11 @@ func (tr *TransformerJoin) ingestLeftFile() { fmt.Fprintln(os.Stderr, "mlr", ": ", err) os.Exit(1) - case leftrecAndContext := <-readerChannel: + case leftrecsAndContexts := <-readerChannel: + // TODO: temp for batch-reader refactor + lib.InternalCodingErrorIf(leftrecsAndContexts.Len() != 1) + leftrecAndContext := leftrecsAndContexts.Front().Value.(*types.RecordAndContext) + if leftrecAndContext.EndOfStream { done = true break // breaks the switch, not the for, in Golang diff --git a/internal/pkg/transformers/utils/join-bucket-keeper.go b/internal/pkg/transformers/utils/join-bucket-keeper.go index 5aa0290fdb..6af1e2bd94 100644 --- a/internal/pkg/transformers/utils/join-bucket-keeper.go +++ b/internal/pkg/transformers/utils/join-bucket-keeper.go @@ -125,7 +125,7 @@ type JoinBucketKeeper struct { // For streaming through the left-side file recordReader input.IRecordReader context *types.Context - readerChannel <-chan *types.RecordAndContext + readerChannel <-chan *list.List // list of *types.RecordAndContext errorChannel chan error // TODO: merge with leof flag recordReaderDone bool @@ -165,7 +165,7 @@ func NewJoinBucketKeeper( ) *JoinBucketKeeper { // Instantiate the record-reader - recordReader, err := input.Create(joinReaderOptions) + recordReader, err := input.Create(joinReaderOptions, 1) // TODO: maybe increase records per batch if err != nil { fmt.Fprintf(os.Stderr, "mlr join: %v", err) os.Exit(1) @@ -178,7 +178,7 @@ func NewJoinBucketKeeper( initialContext.UpdateForStartOfFile(leftFileName) // Set up channels for the record-reader - readerChannel := make(chan *types.RecordAndContext, 10) + readerChannel := make(chan *list.List, 10) // list of *types.RecordAndContext errorChannel := make(chan error, 1) downstreamDoneChannel := make(chan bool, 1) @@ -570,7 +570,10 @@ func (keeper *JoinBucketKeeper) readRecord() *types.RecordAndContext { case err := <-keeper.errorChannel: fmt.Fprintln(os.Stderr, "mlr", ": ", err) os.Exit(1) - case leftrecAndContext := <-keeper.readerChannel: + case leftrecsAndContexts := <-keeper.readerChannel: + // TODO: temp + lib.InternalCodingErrorIf(leftrecsAndContexts.Len() != 1) + leftrecAndContext := leftrecsAndContexts.Front().Value.(*types.RecordAndContext) if leftrecAndContext.EndOfStream { // end-of-stream marker keeper.recordReaderDone = true return nil diff --git a/internal/pkg/types/context.go b/internal/pkg/types/context.go index 4faa25982f..3a9ac59706 100644 --- a/internal/pkg/types/context.go +++ b/internal/pkg/types/context.go @@ -2,6 +2,7 @@ package types import ( "bytes" + "container/list" "strconv" ) @@ -37,6 +38,16 @@ func NewRecordAndContext( } } +// TODO: temp for batch-reader refactor +func NewRecordAndContextList( + record *Mlrmap, + context *Context, +) *list.List { + ell := list.New() + ell.PushBack(NewRecordAndContext(record, context)) + return ell +} + // For the record-readers to update their initial context as each new record is read. 
 func (rac *RecordAndContext) Copy() *RecordAndContext {
 	if rac == nil {
@@ -69,6 +80,16 @@ func NewOutputString(
 	}
 }
 
+// TODO: temp for batch-reader refactor
+func NewOutputStringList(
+	outputString string,
+	context *Context,
+) *list.List {
+	ell := list.New()
+	ell.PushBack(NewOutputString(outputString, context))
+	return ell
+}
+
 // For the record-readers to update their initial context as each new record is read.
 func NewEndOfStreamMarker(context *Context) *RecordAndContext {
 	return &RecordAndContext{
@@ -79,6 +100,14 @@
+// TODO: comment
+// For the record-readers to update their initial context as each new record is read.
+func NewEndOfStreamMarkerList(context *Context) *list.List {
+	ell := list.New()
+	ell.PushBack(NewEndOfStreamMarker(context))
+	return ell
+}
+
 // ----------------------------------------------------------------
 type Context struct {
 	FILENAME string

diff --git a/todo.txt b/todo.txt
index 7ec9ba8d1e..c9fa2ba723 100644
--- a/todo.txt
+++ b/todo.txt
@@ -27,17 +27,15 @@ PUNCHDOWN LIST
   o dkvp-reader factor-out ...
   o mods:
     ? outputChannel -> *list.List at each transformer -- ? profile first
-    - un-legacy fflush flag :(
-      > conditional on isatty stdout
-      > new fflushWasSpecified
-      > downcase OFSWasSpecified, HaveRandSeed, et al.
+    ? readerChannel length 1 or 2 ?
+    ? cli option for records per batch
+    ? experiment again with hashed/unhashed
    - do and maybe keep? record-reader return (raclist, err) & refactor repl accordingly
      > needs factor for-loop to stateful so maybe not
-    - transfomers w/ reclist: *maybe*, but idchan/odchan too ... invest time after some refactor decisions made
+    - transformers w/ reclist: *maybe*, but idchan/odchan too ... invest time after some refactor decisions made
    - fix record/line sequencing regressions
-    - tail -f handling
-      > batch-size 1 on stdin, for repl at least?
-      > adaptive is-blocking detection -- make sure it's not over-sensitive
+    - maybe increase records-per-batch in join-bucket-keeper; and/or refactor
+    - maybe increase records-per-batch in repl; and/or refactor
   o goals:
     - keep goroutines -- including per-transformer -- for parallelism
     - look for flex ideas on how to structure that parallelism
@@ -70,10 +68,6 @@ PUNCHDOWN LIST
   d how to handle tail -f and repl
   > try unconditional batching and hiding *first* to see how much best-case perf to be had
   ? lazy type-infer?? needs careful use of accessor-mutators in place of mv.type etc
-<<<<<<< HEAD
-=======
-* note somewhere why NewEndOfStreamMarker instead of channel close -- readers carry final context
->>>>>>> f2e709408 (Rename inputChannel,outputChannel to readerChannel,writerChannel (#772))
 
 * blockers:
   - keep checking issues
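The singleton-list step in patch 05 changes every reader channel from `chan *types.RecordAndContext` to `chan *list.List`, with end-of-stream still signaled by an in-band marker record rather than by closing the channel (the readers carry final context, per the todo note removed above). The consumption pattern, as `tempReader` in stream.go does it, reduced to a standalone sketch with a placeholder record type:

```go
package sketch

import "container/list"

// Placeholder for Miller's types.RecordAndContext.
type RecordAndContext struct {
	EndOfStream bool
}

// drainBatches mirrors tempReader in stream.go above: one channel receive
// per batch, then a linear walk over the list, forwarding each record and
// stopping at the in-band end-of-stream marker.
func drainBatches(readerChannel <-chan *list.List, out chan<- *RecordAndContext) {
	done := false
	for !done {
		racs := <-readerChannel
		for e := racs.Front(); e != nil; e = e.Next() {
			rac := e.Value.(*RecordAndContext)
			out <- rac
			if rac.EndOfStream {
				done = true
				break
			}
		}
	}
}
```

Batch size is a latency/throughput trade-off: the REPL and join's left-file ingest pin it to 1 (with `InternalCodingErrorIf(... != 1)` guards) so their one-record-at-a-time logic keeps working during the refactor.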
From 5c022758e96d2a142d16d949e617f99a06f5a0b2 Mon Sep 17 00:00:00 2001
From: John Kerl
Date: Wed, 8 Dec 2021 16:48:27 -0500
Subject: [PATCH 06/28] CLI options for records-per-batch and hash-records

---
 internal/pkg/cli/option_parse.go | 37 +++++++++++++++++++++
 internal/pkg/cli/option_types.go | 6 ++--
 internal/pkg/input/record_reader_csv.go | 2 +-
 internal/pkg/input/record_reader_csvlite.go | 4 +--
 internal/pkg/input/record_reader_dkvp.go | 2 +-
 internal/pkg/input/record_reader_nidx.go | 2 +-
 internal/pkg/input/record_reader_xtab.go | 2 +-
 internal/pkg/types/mlrmap.go | 16 ++++++++-
 8 files changed, 62 insertions(+), 9 deletions(-)

diff --git a/internal/pkg/cli/option_parse.go b/internal/pkg/cli/option_parse.go
index ca22992cfe..1fd60d0ebf 100644
--- a/internal/pkg/cli/option_parse.go
+++ b/internal/pkg/cli/option_parse.go
@@ -2557,6 +2557,43 @@ var MiscFlagSection = FlagSection{
 		},
 	},
 
+	{
+		name: "--records-per-batch",
+		arg:  "{n}",
+		help: `This is an internal parameter for the maximum number of records per batch. Normally
+this does not need to be modified.`,
+		parser: func(args []string, argc int, pargi *int, options *TOptions) {
+			CheckArgCount(args, *pargi, argc, 2)
+			recordsPerBatch, ok := lib.TryIntFromString(args[*pargi+1])
+			if !ok || recordsPerBatch <= 0 {
+				fmt.Fprintf(os.Stderr,
+					"%s: --records-per-batch argument must be a positive integer; got \"%s\".\n",
+					"mlr", args[*pargi+1])
+				os.Exit(1)
+			}
+			options.ReaderOptions.RecordsPerBatch = recordsPerBatch
+			*pargi += 2
+		},
+	},
+
+	{
+		name: "--hash-records",
+		help: `This is an internal parameter which normally does not need to be modified.`,
+		parser: func(args []string, argc int, pargi *int, options *TOptions) {
+			types.HashRecords(true)
+			*pargi += 1
+		},
+	},
+
+	{
+		name: "--no-hash-records",
+		help: `This is an internal parameter which normally does not need to be modified.`,
+		parser: func(args []string, argc int, pargi *int, options *TOptions) {
+			types.HashRecords(false)
+			*pargi += 1
+		},
+	},
+
 	{
 		name:     "--infer-none",
 		altNames: []string{"-S"},

diff --git a/internal/pkg/cli/option_types.go b/internal/pkg/cli/option_types.go
index 4529a2ac2a..86bcf453ce 100644
--- a/internal/pkg/cli/option_types.go
+++ b/internal/pkg/cli/option_types.go
@@ -26,6 +26,8 @@ const DEFAULT_GEN_START_AS_STRING = "1"
 const DEFAULT_GEN_STEP_AS_STRING = "1"
 const DEFAULT_GEN_STOP_AS_STRING = "100"
 
+const DEFAULT_RECORDS_PER_BATCH = 500
+
 type TGeneratorOptions struct {
 	FieldName     string
 	StartAsString string
@@ -185,8 +187,8 @@ func DefaultReaderOptions() TReaderOptions {
 			StopAsString:  DEFAULT_GEN_STOP_AS_STRING,
 		},
 
-		// TODO: make a cli option
-		RecordsPerBatch: 500,
+		// TODO: comment
+		RecordsPerBatch: DEFAULT_RECORDS_PER_BATCH,
 	}
 }

diff --git a/internal/pkg/input/record_reader_csv.go b/internal/pkg/input/record_reader_csv.go
index b4c46c17a6..e9b18939cb 100644
--- a/internal/pkg/input/record_reader_csv.go
+++ b/internal/pkg/input/record_reader_csv.go
@@ -162,7 +162,7 @@ func (reader *RecordReaderCSV) processHandle(
 		}
 	}
 
-	record := types.NewMlrmap()
+	record := types.NewMlrmapAsRecord()
 
 	nh := len(header)
 	nd := len(csvRecord)

diff --git a/internal/pkg/input/record_reader_csvlite.go b/internal/pkg/input/record_reader_csvlite.go
index
5d86292727..e75966948b 100644 --- a/internal/pkg/input/record_reader_csvlite.go +++ b/internal/pkg/input/record_reader_csvlite.go @@ -220,7 +220,7 @@ func (reader *RecordReaderCSVLite) processHandleExplicitCSVHeader( return } - record := types.NewMlrmap() + record := types.NewMlrmapAsRecord() if !reader.readerOptions.AllowRaggedCSVInput { for i, field := range fields { value := types.MlrvalFromInferredTypeForDataFiles(field) @@ -353,7 +353,7 @@ func (reader *RecordReaderCSVLite) processHandleImplicitCSVHeader( } } - record := types.NewMlrmap() + record := types.NewMlrmapAsRecord() if !reader.readerOptions.AllowRaggedCSVInput { for i, field := range fields { value := types.MlrvalFromInferredTypeForDataFiles(field) diff --git a/internal/pkg/input/record_reader_dkvp.go b/internal/pkg/input/record_reader_dkvp.go index bd39d9f5e8..d317c2336c 100644 --- a/internal/pkg/input/record_reader_dkvp.go +++ b/internal/pkg/input/record_reader_dkvp.go @@ -119,7 +119,7 @@ func (reader *RecordReaderDKVP) processHandle( func (reader *RecordReaderDKVP) recordFromDKVPLine( line string, ) *types.Mlrmap { - record := types.NewMlrmap() + record := types.NewMlrmapAsRecord() var pairs []string if reader.readerOptions.IFSRegex == nil { // e.g. --no-ifs-regex diff --git a/internal/pkg/input/record_reader_nidx.go b/internal/pkg/input/record_reader_nidx.go index b6fab8eb98..695d443907 100644 --- a/internal/pkg/input/record_reader_nidx.go +++ b/internal/pkg/input/record_reader_nidx.go @@ -120,7 +120,7 @@ func (reader *RecordReaderNIDX) processHandle( func (reader *RecordReaderNIDX) recordFromNIDXLine( line string, ) *types.Mlrmap { - record := types.NewMlrmap() + record := types.NewMlrmapAsRecord() var values []string if reader.readerOptions.IFSRegex == nil { // e.g. --no-ifs-regex diff --git a/internal/pkg/input/record_reader_xtab.go b/internal/pkg/input/record_reader_xtab.go index 9de0a3e75c..66c1d7a43c 100644 --- a/internal/pkg/input/record_reader_xtab.go +++ b/internal/pkg/input/record_reader_xtab.go @@ -149,7 +149,7 @@ func (reader *RecordReaderXTAB) processHandle( func (reader *RecordReaderXTAB) recordFromXTABLines( lines *list.List, ) (*types.Mlrmap, error) { - record := types.NewMlrmap() + record := types.NewMlrmapAsRecord() for entry := lines.Front(); entry != nil; entry = entry.Next() { line := entry.Value.(string) diff --git a/internal/pkg/types/mlrmap.go b/internal/pkg/types/mlrmap.go index c6b1672b69..0d04495178 100644 --- a/internal/pkg/types/mlrmap.go +++ b/internal/pkg/types/mlrmap.go @@ -53,6 +53,16 @@ package types +// For the C port having this off was a noticeable performance improvement (10-15%). +// For the Go port having it off is a less-noticeable performance improvement (5%). +// Both these figures are for just doing mlr cat. At the moment I'm leaving this +// default-on pending more profiling on more complex record-processing operations +// such as mlr sort. 
+var hashRecords = true
+func HashRecords(onOff bool) {
+	hashRecords = onOff
+}
+
 // ----------------------------------------------------------------
 type Mlrmap struct {
 	FieldCount int
@@ -80,7 +90,11 @@ type MlrmapPair struct {
 // ----------------------------------------------------------------
 func NewMlrmapAsRecord() *Mlrmap {
-	return newMlrmapUnhashed()
+	if hashRecords {
+		return newMlrmapHashed()
+	} else {
+		return newMlrmapUnhashed()
+	}
 }
 func NewMlrmap() *Mlrmap {
 	return newMlrmapHashed()
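Patch 06 above also introduces the `--hash-records` / `--no-hash-records` flags, which flip the package-level `hashRecords` switch consulted by `NewMlrmapAsRecord`. A reduced sketch of the toggle (the map field shown here is illustrative, not Mlrmap's actual layout):

```go
package sketch

// hashRecords chooses between map-backed key lookup (hashed) and linear
// scans over the field list (unhashed: fewer allocations per record).
var hashRecords = true

func HashRecords(onOff bool) {
	hashRecords = onOff
}

type Mlrmap struct {
	keysToEntries map[string]int // nil in unhashed mode (hypothetical field)
}

func NewMlrmapAsRecord() *Mlrmap {
	if hashRecords {
		return &Mlrmap{keysToEntries: make(map[string]int)}
	}
	return &Mlrmap{}
}
```

Note that only the record readers switch to `NewMlrmapAsRecord`; `NewMlrmap` itself stays hashed, so the flag affects stream records without touching other map construction.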
From aaf0c27a6af1a5fb50cd0fd3eb3df5c7a8e5c8c9 Mon Sep 17 00:00:00 2001
From: John Kerl
Date: Wed, 8 Dec 2021 17:14:43 -0500
Subject: [PATCH 07/28] Push channelized-reader logic into DKVP reader

---
 internal/pkg/input/record_reader_dkvp.go | 89 +++++++++++++++++++-----
 todo.txt | 3 +-
 2 files changed, 74 insertions(+), 18 deletions(-)

diff --git a/internal/pkg/input/record_reader_dkvp.go b/internal/pkg/input/record_reader_dkvp.go
index d317c2336c..e02ff8ccff 100644
--- a/internal/pkg/input/record_reader_dkvp.go
+++ b/internal/pkg/input/record_reader_dkvp.go
@@ -1,6 +1,7 @@
 package input
 
 import (
+	"bufio"
 	"container/list"
 	"io"
 	"strconv"
@@ -68,37 +69,91 @@ func (reader *RecordReaderDKVP) processHandle(
 	handle io.Reader,
 	filename string,
 	context *types.Context,
-	readerChannel chan<- *list.List, // list of *types.RecordAndContext
+	readerChannel chan<- *list.List,
 	errorChannel chan error,
 	downstreamDoneChannel <-chan bool, // for mlr head
 ) {
 	context.UpdateForStartOfFile(filename)
+	recordsPerBatch := reader.readerOptions.RecordsPerBatch
 
 	lineScanner := NewLineScanner(handle, reader.readerOptions.IRS)
+	linesChannel := make(chan string, recordsPerBatch)
+	go provideChannelizedLines(lineScanner, linesChannel, downstreamDoneChannel, recordsPerBatch)
+
+	eof := false
+	for !eof {
+		var recordsAndContexts *list.List
+		recordsAndContexts, eof = reader.getRecordBatch(linesChannel, recordsPerBatch, context)
+		//fmt.Fprintf(os.Stderr, "GOT RECORD BATCH OF LENGTH %d\n", recordsAndContexts.Len())
+		readerChannel <- recordsAndContexts
+	}
+}
 
-	for lineScanner.Scan() {
+// TODO: comment
+func provideChannelizedLines(
+	lineScanner *bufio.Scanner,
+	linesChannel chan<- string,
+	downstreamDoneChannel <-chan bool, // for mlr head
+	recordsPerBatch int,
+) {
+	i := 0
+	done := false
+	for !done && lineScanner.Scan() {
+		i++
 		// See if downstream processors will be ignoring further data (e.g. mlr
 		// head). If so, stop reading. This makes 'mlr head hugefile' exit
 		// quickly, as it should.
-		eof := false
-		select {
-		case _ = <-downstreamDoneChannel:
-			eof = true
-			break
-		default:
+		if i%recordsPerBatch == 0 {
+			select {
+			case _ = <-downstreamDoneChannel:
+				done = true
+				break
+			default:
+				break
+			}
+			if done {
+				break
+			}
+		}
+
+		linesChannel <- lineScanner.Text()
+	}
+	close(linesChannel) // end-of-stream marker
+}
+
+// TODO: comment copiously we're trying to handle slow/fast/short/long
+// reads: tail -f, smallfile, bigfile.
+func (reader *RecordReaderDKVP) getRecordBatch(
+	linesChannel <-chan string,
+	maxBatchSize int,
+	context *types.Context,
+) (
+	recordsAndContexts *list.List,
+	eof bool,
+) {
+	//fmt.Printf("GRB ENTER\n")
+	recordsAndContexts = list.New()
+	eof = false
+
+	for i := 0; i < maxBatchSize; i++ {
+		//fmt.Fprintf(os.Stderr, "-- %d/%d %d/%d\n", i, maxBatchSize, len(linesChannel), cap(linesChannel))
+		if len(linesChannel) == 0 && i > 0 {
+			//fmt.Println(" .. BREAK")
 			break
 		}
-		if eof {
+		//fmt.Println(" .. B:BLOCK")
+		line, more := <-linesChannel
+		//fmt.Printf(" .. E:BLOCK <<%s>> %v\n", line, more)
+		if !more {
+			eof = true
 			break
 		}
 
-		line := lineScanner.Text()
-
 		// Check for comments-in-data feature
 		if strings.HasPrefix(line, reader.readerOptions.CommentString) {
 			if reader.readerOptions.CommentHandling == cli.PassComments {
-				readerChannel <- types.NewOutputStringList(line+"\n", context)
+				recordsAndContexts.PushBack(types.NewOutputStringList(line+"\n", context))
 				continue
 			} else if reader.readerOptions.CommentHandling == cli.SkipComments {
 				continue
@@ -108,14 +163,14 @@
 		record := reader.recordFromDKVPLine(line)
 		context.UpdateForInputRecord()
 
-		readerChannel <- types.NewRecordAndContextList(
-			record,
-			context,
-		)
+		recordAndContext := types.NewRecordAndContext(record, context)
+		recordsAndContexts.PushBack(recordAndContext)
 	}
+
+	//fmt.Printf("GRB EXIT\n")
+	return recordsAndContexts, eof
 }
 
-// ----------------------------------------------------------------
 func (reader *RecordReaderDKVP) recordFromDKVPLine(
 	line string,
 ) *types.Mlrmap {

diff --git a/todo.txt b/todo.txt
index c9fa2ba723..816c24decf 100644
--- a/todo.txt
+++ b/todo.txt
@@ -29,7 +29,7 @@ PUNCHDOWN LIST
     ? outputChannel -> *list.List at each transformer -- ? profile first
     ? readerChannel length 1 or 2 ?
     ? cli option for records per batch
-    ? experiment again with hashed/unhashed
+    ? experiment again with hashed/unhashed -- mlr sort etc
    - do and maybe keep? record-reader return (raclist, err) & refactor repl accordingly
      > needs factor for-loop to stateful so maybe not
    - transformers w/ reclist: *maybe*, but idchan/odchan too ... invest time after some refactor decisions made
@@ -56,6 +56,7 @@ PUNCHDOWN LIST
     > do that after hiding
   ! have an extra eye on CSV-reader perf
     - mprof split-reader getenv something
+    - flags for mlr.pprof & trace.out to CLI
   - hide app-level scan/format under sys-level read/write: also batched
     easier/line-oriented: record_reader_csvlite.go
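Patch 07's DKVP reader splits reading into a scanning goroutine and a batching consumer: `provideChannelizedLines` feeds a buffered lines channel and closes it at EOF, while `getRecordBatch` takes at most one batch per call, returning early when the channel is momentarily empty so a slow producer (`tail -f`-style input) is not stalled waiting for a full batch. The essentials as a standalone sketch, with names simplified from the diff:

```go
package sketch

import (
	"bufio"
	"container/list"
)

// Scanner side: closing linesChannel is the end-of-input signal.
func provideLines(scanner *bufio.Scanner, linesChannel chan<- string) {
	for scanner.Scan() {
		linesChannel <- scanner.Text()
	}
	close(linesChannel)
}

// Batcher side: block for the first line, then keep taking lines only while
// they're immediately available, up to maxBatchSize. Returns eof=true once
// the scanner side has closed the channel.
func getBatch(linesChannel <-chan string, maxBatchSize int) (batch *list.List, eof bool) {
	batch = list.New()
	for i := 0; i < maxBatchSize; i++ {
		if len(linesChannel) == 0 && i > 0 {
			break // partial batch rather than blocking a slow producer
		}
		line, more := <-linesChannel
		if !more {
			return batch, true
		}
		batch.PushBack(line)
	}
	return batch, false
}
```

The `downstreamDoneChannel` poll every `recordsPerBatch` lines is what lets `mlr head` on a huge file exit promptly: the scanner goroutine notices the transformers have hung up and stops feeding lines.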
From 9bdc53d78340339117854419b8a8eab7e0d7a3e9 Mon Sep 17 00:00:00 2001
From: John Kerl
Date: Wed, 8 Dec 2021 18:36:01 -0500
Subject: [PATCH 08/28] Push batching logic into chain-transformer, transformers, and channel-writer

---
 internal/pkg/dsl/cst/dump.go | 4 +-
 internal/pkg/dsl/cst/emit1.go | 4 +-
 internal/pkg/dsl/cst/emit_emitp.go | 4 +-
 internal/pkg/dsl/cst/emitf.go | 4 +-
 internal/pkg/dsl/cst/print.go | 4 +-
 internal/pkg/input/record_reader_csv.go | 4 +-
 internal/pkg/output/channel_writer.go | 37 +++++++++--
 internal/pkg/output/file-output-handlers.go | 13 ++--
 internal/pkg/runtime/state.go | 16 ++---
 internal/pkg/stream/stream.go | 28 +--------
 .../pkg/transformers/aaa_chain_transformer.go | 61 +++++++++++++++----
 .../transformers/aaa_record_transformer.go | 5 +-
 internal/pkg/transformers/altkv.go | 7 ++-
 internal/pkg/transformers/bar.go | 22 +++----
 internal/pkg/transformers/bootstrap.go | 8 +--
 internal/pkg/transformers/cat.go | 17 +++---
 internal/pkg/transformers/check.go | 5 +-
 internal/pkg/transformers/clean-whitespace.go | 23 +++----
 internal/pkg/transformers/count-similar.go | 6 +-
 internal/pkg/transformers/count.go | 20 +++---
 internal/pkg/transformers/cut.go | 27 ++++----
 internal/pkg/transformers/decimate.go | 7 ++-
 internal/pkg/transformers/fill-down.go | 17 +++---
 internal/pkg/transformers/fill-empty.go | 7 ++-
 internal/pkg/transformers/flatten.go | 17 +++---
 internal/pkg/transformers/format-values.go | 7 ++-
 internal/pkg/transformers/fraction.go | 6 +-
 internal/pkg/transformers/gap.go | 21 ++++---
 internal/pkg/transformers/grep.go | 9 +--
 internal/pkg/transformers/group-by.go | 6 +-
 internal/pkg/transformers/group-like.go | 6 +-
 internal/pkg/transformers/having-fields.go | 41 +++++++------
 internal/pkg/transformers/head.go | 17 +++---
 internal/pkg/transformers/histogram.go | 25 ++++----
 internal/pkg/transformers/join.go | 44 ++++++-------
 internal/pkg/transformers/json-parse.go | 17 +++---
 internal/pkg/transformers/json-stringify.go | 17 +++---
 internal/pkg/transformers/label.go | 5 +-
 internal/pkg/transformers/merge-fields.go | 23 +++----
 .../transformers/most-or-least-frequent.go | 7 ++-
 internal/pkg/transformers/nest.go | 50 +++++++--------
 internal/pkg/transformers/nothing.go | 5 +-
 internal/pkg/transformers/put-or-filter.go | 12 ++--
 internal/pkg/transformers/regularize.go | 9 +--
 .../pkg/transformers/remove-empty-columns.go | 6 +-
 internal/pkg/transformers/rename.go | 14 ++---
 internal/pkg/transformers/reorder.go | 33 +++++-----
 internal/pkg/transformers/repeat.go | 21 ++++---
 internal/pkg/transformers/reshape.go | 29 ++++-----
 internal/pkg/transformers/sample.go | 7 ++-
 internal/pkg/transformers/sec2gmt.go | 7 ++-
 internal/pkg/transformers/sec2gmtdate.go | 7 ++-
 internal/pkg/transformers/seqgen.go | 9 ++-
 internal/pkg/transformers/shuffle.go | 6 +-
 .../pkg/transformers/skip-trivial-records.go | 7 ++-
 .../pkg/transformers/sort-within-records.go | 13 ++--
 internal/pkg/transformers/sort.go | 8 +--
 internal/pkg/transformers/stats1.go | 19 +++---
 internal/pkg/transformers/stats2.go | 18 +++---
 internal/pkg/transformers/step.go | 9 +--
 internal/pkg/transformers/tac.go | 6 +-
 internal/pkg/transformers/tail.go | 6 +-
 internal/pkg/transformers/tee.go | 7 ++-
 internal/pkg/transformers/template.go | 7 ++-
 internal/pkg/transformers/top.go | 13 ++--
 internal/pkg/transformers/unflatten.go | 17 +++---
 internal/pkg/transformers/uniq.go | 47 +++++++-------
 internal/pkg/transformers/unsparsify.go | 16 ++---
 .../transformers/utils/join-bucket-keeper.go | 6 +-
 internal/pkg/types/mlrmap.go | 1 +
 todo.txt | 12 +++-
 71 files changed, 568 insertions(+), 477 deletions(-)

diff --git a/internal/pkg/dsl/cst/dump.go b/internal/pkg/dsl/cst/dump.go
index a3f563ecd3..b0353132eb 100644
--- a/internal/pkg/dsl/cst/dump.go
+++ b/internal/pkg/dsl/cst/dump.go
@@ -191,8 +191,8 @@ func (node *DumpStatementNode) dumpToStdout(
 	// print it, resulting in deterministic output-ordering.
 	//
 	// The output channel is always non-nil, except for the Miller REPL.
- if state.OutputChannel != nil { - state.OutputChannel <- types.NewOutputString(outputString, state.Context) + if state.OutputRecordsAndContexts != nil { + state.OutputRecordsAndContexts.PushBack(types.NewOutputString(outputString, state.Context)) } else { fmt.Println(outputString) } diff --git a/internal/pkg/dsl/cst/emit1.go b/internal/pkg/dsl/cst/emit1.go index 593e893ab8..5a43105c79 100644 --- a/internal/pkg/dsl/cst/emit1.go +++ b/internal/pkg/dsl/cst/emit1.go @@ -65,8 +65,8 @@ func (node *Emit1StatementNode) Execute(state *runtime.State) (*BlockExitPayload return nil, nil } - if state.OutputChannel != nil { - state.OutputChannel <- types.NewRecordAndContext(valueAsMap, state.Context) + if state.OutputRecordsAndContexts != nil { + state.OutputRecordsAndContexts.PushBack(types.NewRecordAndContext(valueAsMap, state.Context)) } else { fmt.Println(valueAsMap.String()) } diff --git a/internal/pkg/dsl/cst/emit_emitp.go b/internal/pkg/dsl/cst/emit_emitp.go index 86470aaff8..dd92ad2d5c 100644 --- a/internal/pkg/dsl/cst/emit_emitp.go +++ b/internal/pkg/dsl/cst/emit_emitp.go @@ -983,8 +983,8 @@ func (node *EmitXStatementNode) emitRecordToRecordStream( state *runtime.State, ) error { // The output channel is always non-nil, except for the Miller REPL. - if state.OutputChannel != nil { - state.OutputChannel <- types.NewRecordAndContext(outrec, state.Context) + if state.OutputRecordsAndContexts != nil { + state.OutputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, state.Context)) } else { fmt.Println(outrec.String()) } diff --git a/internal/pkg/dsl/cst/emitf.go b/internal/pkg/dsl/cst/emitf.go index 6b25ccf4e9..05a91d7a0a 100644 --- a/internal/pkg/dsl/cst/emitf.go +++ b/internal/pkg/dsl/cst/emitf.go @@ -182,8 +182,8 @@ func (node *EmitFStatementNode) emitfToRecordStream( state *runtime.State, ) error { // The output channel is always non-nil, except for the Miller REPL. - if state.OutputChannel != nil { - state.OutputChannel <- types.NewRecordAndContext(outrec, state.Context) + if state.OutputRecordsAndContexts != nil { + state.OutputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, state.Context)) } else { fmt.Println(outrec.String()) } diff --git a/internal/pkg/dsl/cst/print.go b/internal/pkg/dsl/cst/print.go index 1fee1a9ae7..bf3789e117 100644 --- a/internal/pkg/dsl/cst/print.go +++ b/internal/pkg/dsl/cst/print.go @@ -337,8 +337,8 @@ func (node *PrintStatementNode) printToStdout( // print it, resulting in deterministic output-ordering. // The output channel is always non-nil, except for the Miller REPL. 
- if state.OutputChannel != nil { - state.OutputChannel <- types.NewOutputString(outputString, state.Context) + if state.OutputRecordsAndContexts != nil { + state.OutputRecordsAndContexts.PushBack(types.NewOutputString(outputString, state.Context)) } else { fmt.Print(outputString) } diff --git a/internal/pkg/input/record_reader_csv.go b/internal/pkg/input/record_reader_csv.go index e9b18939cb..43cbb96885 100644 --- a/internal/pkg/input/record_reader_csv.go +++ b/internal/pkg/input/record_reader_csv.go @@ -17,9 +17,9 @@ import ( // ---------------------------------------------------------------- type RecordReaderCSV struct { - readerOptions *cli.TReaderOptions + readerOptions *cli.TReaderOptions recordsPerBatch int - ifs0 byte // Go's CSV library only lets its 'Comma' be a single character + ifs0 byte // Go's CSV library only lets its 'Comma' be a single character } // ---------------------------------------------------------------- diff --git a/internal/pkg/output/channel_writer.go b/internal/pkg/output/channel_writer.go index b47c901ae0..061583ed51 100644 --- a/internal/pkg/output/channel_writer.go +++ b/internal/pkg/output/channel_writer.go @@ -2,21 +2,48 @@ package output import ( "bufio" + "container/list" "github.com/johnkerl/miller/internal/pkg/cli" "github.com/johnkerl/miller/internal/pkg/types" ) func ChannelWriter( - writerChannel <-chan *types.RecordAndContext, + writerChannel <-chan *list.List, // list of *types.RecordAndContext recordWriter IRecordWriter, writerOptions *cli.TWriterOptions, doneChannel chan<- bool, bufferedOutputStream *bufio.Writer, outputIsStdout bool, ) { + for { - recordAndContext := <-writerChannel + recordsAndContexts := <-writerChannel + done := channelWriterHandleBatch( + recordsAndContexts, + recordWriter, + writerOptions, + bufferedOutputStream, + outputIsStdout, + ) + if done { + doneChannel <- true + break + } + } +} + +// channelWriterHandleBatch writes all records in the given batch to the output stream. +// Returns true on end of record stream +func channelWriterHandleBatch( + recordsAndContexts *list.List, + recordWriter IRecordWriter, + writerOptions *cli.TWriterOptions, + bufferedOutputStream *bufio.Writer, + outputIsStdout bool, +) bool { + for e := recordsAndContexts.Front(); e != nil; e = e.Next() { + recordAndContext := e.Value.(*types.RecordAndContext) // Three things can come through: // * End-of-stream marker // * Records // * Strings to be printed from put/filter DSL print/dump/etc // statements. They are handled here rather than fmt.Println directly // in the put/filter handlers since we want all print statements and // record-output to be in the same goroutine, for deterministic // output ordering. if !recordAndContext.EndOfStream { - record := recordAndContext.Record if record != nil { recordWriter.Write(record, bufferedOutputStream, outputIsStdout) } @@ -49,9 +75,8 @@ // records before printing any, since it needs to compute max width // down columns. recordWriter.Write(nil, bufferedOutputStream, outputIsStdout) - doneChannel <- true - break + return true } - } + return false } diff --git a/internal/pkg/output/file-output-handlers.go b/internal/pkg/output/file-output-handlers.go index 13d94993a0..3991726c7f 100644 --- a/internal/pkg/output/file-output-handlers.go +++ b/internal/pkg/output/file-output-handlers.go @@ -14,6 +14,7 @@ package output import ( "bufio" + "container/list" "errors" "fmt" "io" @@ -212,7 +213,7 @@ type FileOutputHandler struct { // print and dump variants call WriteString.
recordWriterOptions *cli.TWriterOptions recordWriter IRecordWriter - recordOutputChannel chan *types.RecordAndContext + recordOutputChannel chan *list.List // list of *types.RecordAndContext recordDoneChannel chan bool } @@ -339,7 +340,11 @@ func (handler *FileOutputHandler) WriteRecordAndContext( } } - handler.recordOutputChannel <- outrecAndContext + // TODO: maybe refactor to batch better + handler.recordOutputChannel <- types.NewRecordAndContextList( + outrecAndContext.Record, + &outrecAndContext.Context, + ) return nil } @@ -354,7 +359,7 @@ func (handler *FileOutputHandler) setUpRecordWriter() error { } handler.recordWriter = recordWriter - handler.recordOutputChannel = make(chan *types.RecordAndContext, 1) + handler.recordOutputChannel = make(chan *list.List, 1) // list of *types.RecordAndContext handler.recordDoneChannel = make(chan bool, 1) go ChannelWriter( @@ -374,7 +379,7 @@ func (handler *FileOutputHandler) Close() error { if handler.recordOutputChannel != nil { // TODO: see if we need a real context emptyContext := types.Context{} - handler.recordOutputChannel <- types.NewEndOfStreamMarker(&emptyContext) + handler.recordOutputChannel <- types.NewEndOfStreamMarkerList(&emptyContext) // Wait for the output channel to drain done := false diff --git a/internal/pkg/runtime/state.go b/internal/pkg/runtime/state.go index 6889d188bd..19b11414c9 100644 --- a/internal/pkg/runtime/state.go +++ b/internal/pkg/runtime/state.go @@ -7,18 +7,20 @@ package runtime import ( + "container/list" + "github.com/johnkerl/miller/internal/pkg/cli" "github.com/johnkerl/miller/internal/pkg/lib" "github.com/johnkerl/miller/internal/pkg/types" ) type State struct { - Inrec *types.Mlrmap - Context *types.Context - Oosvars *types.Mlrmap - FilterExpression *types.Mlrval - Stack *Stack - OutputChannel chan<- *types.RecordAndContext + Inrec *types.Mlrmap + Context *types.Context + Oosvars *types.Mlrmap + FilterExpression *types.Mlrval + Stack *Stack + OutputRecordsAndContexts *list.List // list of *types.RecordAndContext // For holding "\0".."\9" between where they are set via things like // '$x =~ "(..)_(...)"', and interpolated via things like '$y = "\2:\1"'. RegexCaptures []string @@ -34,7 +36,7 @@ func NewEmptyState(options *cli.TOptions) *State { FilterExpression: types.MLRVAL_TRUE, Stack: NewStack(), - // OutputChannel is assigned after construction + // OutputRecordsAndContexts is assigned after construction // See lib.MakeEmptyRegexCaptures for context. RegexCaptures: lib.MakeEmptyRegexCaptures(), diff --git a/internal/pkg/stream/stream.go b/internal/pkg/stream/stream.go index e969fdceee..8b1b093e76 100644 --- a/internal/pkg/stream/stream.go +++ b/internal/pkg/stream/stream.go @@ -64,8 +64,7 @@ func Stream( // Set up the reader-to-transformer and transformer-to-writer channels. readerChannel := make(chan *list.List, 2) // list of *types.RecordAndContext - tempChannel := make(chan *types.RecordAndContext, 10) - writerChannel := make(chan *types.RecordAndContext, 1) + writerChannel := make(chan *list.List, 2) // list of *types.RecordAndContext // We're done when a fatal error is registered on input (file not found, // etc) or when the record-writer has written all its output.
We use @@ -85,9 +84,7 @@ bufferedOutputStream := bufio.NewWriter(outputStream) go recordReader.Read(fileNames, *initialContext, readerChannel, errorChannel, readerDownstreamDoneChannel) - // TODO: temp for iterative batched-reader refactor - go tempReader(readerChannel, tempChannel) - go transformers.ChainTransformer(tempChannel, readerDownstreamDoneChannel, recordTransformers, + go transformers.ChainTransformer(readerChannel, readerDownstreamDoneChannel, recordTransformers, writerChannel, options) go output.ChannelWriter(writerChannel, recordWriter, &options.WriterOptions, doneWritingChannel, bufferedOutputStream, outputIsStdout) @@ -108,24 +105,3 @@ return nil } - -func tempReader( - readerChannel <-chan *list.List, // list of *types.RecordAndContext - transformerChannel chan<- *types.RecordAndContext, -) { - done := false - for !done { - racs := <-readerChannel - - for e := racs.Front(); e != nil; e = e.Next() { - rac := e.Value.(*types.RecordAndContext) - transformerChannel <- rac - - if rac.EndOfStream { - done = true - break - } - } - - } -} diff --git a/internal/pkg/transformers/aaa_chain_transformer.go b/internal/pkg/transformers/aaa_chain_transformer.go index 2d75c976d7..947c5eeeb1 100644 --- a/internal/pkg/transformers/aaa_chain_transformer.go +++ b/internal/pkg/transformers/aaa_chain_transformer.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "github.com/johnkerl/miller/internal/pkg/cli" "github.com/johnkerl/miller/internal/pkg/types" ) @@ -142,18 +143,18 @@ import ( // subdivides goroutines for each transformer in the chain, with intermediary // channels between them. func ChainTransformer( - readerRecordChannel <-chan *types.RecordAndContext, + readerRecordChannel <-chan *list.List, // list of *types.RecordAndContext readerDownstreamDoneChannel chan<- bool, // for mlr head -- see also stream.go recordTransformers []IRecordTransformer, // not *recordTransformer since this is an interface - writerRecordChannel chan<- *types.RecordAndContext, + writerRecordChannel chan<- *list.List, // list of *types.RecordAndContext options *cli.TOptions, ) { i := 0 n := len(recordTransformers) - intermediateRecordChannels := make([]chan *types.RecordAndContext, n-1) + intermediateRecordChannels := make([]chan *list.List, n-1) // list of *types.RecordAndContext for i = 0; i < n-1; i++ { - intermediateRecordChannels[i] = make(chan *types.RecordAndContext, 1) + intermediateRecordChannels[i] = make(chan *list.List, 1) // list of *types.RecordAndContext } intermediateDownstreamDoneChannels := make([]chan bool, n) @@ -197,23 +198,52 @@ func ChainTransformer( func runSingleTransformer( recordTransformer IRecordTransformer, - isFirst bool, - inputRecordChannel <-chan *types.RecordAndContext, - outputRecordChannel chan<- *types.RecordAndContext, + isFirstInChain bool, + inputRecordChannel <-chan *list.List, // list of *types.RecordAndContext + outputRecordChannel chan<- *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, options *cli.TOptions, ) { - for { - recordAndContext := <-inputRecordChannel + done := false + for !done { + recordsAndContexts := <-inputRecordChannel + done = runSingleTransformerBatch( + recordsAndContexts, + recordTransformer, + isFirstInChain, + outputRecordChannel, + inputDownstreamDoneChannel, + outputDownstreamDoneChannel, + options, + ) + } +} + +// runSingleTransformerBatch passes one batch of records through the given transformer. +// Returns true on end of record stream +func runSingleTransformerBatch( + 
inputRecordsAndContexts *list.List, // list of types.RecordAndContext + recordTransformer IRecordTransformer, + isFirstInChain bool, + outputRecordChannel chan<- *list.List, // list of *types.RecordAndContext + inputDownstreamDoneChannel <-chan bool, + outputDownstreamDoneChannel chan<- bool, + options *cli.TOptions, +) bool { + outputRecordsAndContexts := list.New() + done := false + + for e := inputRecordsAndContexts.Front(); e != nil; e = e.Next() { + recordAndContext := e.Value.(*types.RecordAndContext) // --nr-progress-mod // TODO: function-pointer this away to reduce instruction count in the // normal case which it isn't used at all. No need to test if {static thing} != 0 // on every record. if options.NRProgressMod != 0 { - if isFirst && recordAndContext.Record != nil { + if isFirstInChain && recordAndContext.Record != nil { context := &recordAndContext.Context if context.NR%options.NRProgressMod == 0 { fmt.Fprintf(os.Stderr, "NR=%d FNR=%d FILENAME=%s\n", context.NR, context.FNR, context.FILENAME) @@ -238,16 +268,23 @@ func runSingleTransformer( if recordAndContext.EndOfStream == true || recordAndContext.Record != nil { recordTransformer.Transform( recordAndContext, + outputRecordsAndContexts, + // TODO: maybe refactor these out of each transformer. + // And/or maybe poll them once per batch not once per record. inputDownstreamDoneChannel, outputDownstreamDoneChannel, - outputRecordChannel, ) } else { - outputRecordChannel <- recordAndContext + outputRecordsAndContexts.PushBack(recordAndContext) } if recordAndContext.EndOfStream { + done = true break } } + + outputRecordChannel <- outputRecordsAndContexts + + return done } diff --git a/internal/pkg/transformers/aaa_record_transformer.go b/internal/pkg/transformers/aaa_record_transformer.go index 7937f8c911..51f464e83c 100644 --- a/internal/pkg/transformers/aaa_record_transformer.go +++ b/internal/pkg/transformers/aaa_record_transformer.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "os" "github.com/johnkerl/miller/internal/pkg/cli" @@ -13,17 +14,17 @@ import ( type IRecordTransformer interface { Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) } type RecordTransformerFunc func( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) type TransformerUsageFunc func( diff --git a/internal/pkg/transformers/altkv.go b/internal/pkg/transformers/altkv.go index 0c9344f884..2fdde54ac3 100644 --- a/internal/pkg/transformers/altkv.go +++ b/internal/pkg/transformers/altkv.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "strconv" @@ -91,9 +92,9 @@ func NewTransformerAltkv() (*TransformerAltkv, error) { func (tr *TransformerAltkv) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) if !inrecAndContext.EndOfStream { @@ -122,9 +123,9 @@ func (tr *TransformerAltkv) Transform( pe = pe.Next } - outputChannel <- 
types.NewRecordAndContext(newrec, &inrecAndContext.Context) + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(newrec, &inrecAndContext.Context)) } else { // end of record stream - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } diff --git a/internal/pkg/transformers/bar.go b/internal/pkg/transformers/bar.go index 70f3da2dfa..993895b221 100644 --- a/internal/pkg/transformers/bar.go +++ b/internal/pkg/transformers/bar.go @@ -209,28 +209,28 @@ func NewTransformerBar( func (tr *TransformerBar) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - tr.recordTransformerFunc(inrecAndContext, inputDownstreamDoneChannel, outputDownstreamDoneChannel, outputChannel) + tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel) } // ---------------------------------------------------------------- func (tr *TransformerBar) simpleBar( inrecAndContext *types.RecordAndContext, - outputChannel chan<- *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext ) { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } // ---------------------------------------------------------------- func (tr *TransformerBar) processNoAuto( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -254,18 +254,18 @@ func (tr *TransformerBar) processNoAuto( inrec.PutReference(fieldName, types.MlrvalFromString(tr.bars[idx])) } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { - outputChannel <- inrecAndContext // emit end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // emit end-of-stream marker } } // ---------------------------------------------------------------- func (tr *TransformerBar) processAuto( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { tr.recordsForAutoMode.PushBack(inrecAndContext.Copy()) @@ -331,7 +331,7 @@ func (tr *TransformerBar) processAuto( idx = tr.width } - var buffer bytes.Buffer // faster than fmt.Print() separately + var buffer bytes.Buffer buffer.WriteString("[") buffer.WriteString(slo) buffer.WriteString("]") @@ -345,8 +345,8 @@ func (tr *TransformerBar) processAuto( for e := tr.recordsForAutoMode.Front(); e != nil; e = e.Next() { recordAndContext := e.Value.(*types.RecordAndContext) - outputChannel <- recordAndContext + outputRecordsAndContexts.PushBack(recordAndContext) } - outputChannel <- inrecAndContext // Emit the end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // Emit the end-of-stream marker } diff --git a/internal/pkg/transformers/bootstrap.go b/internal/pkg/transformers/bootstrap.go index b379bb0fef..36dff6e4ae 100644 --- 
a/internal/pkg/transformers/bootstrap.go +++ b/internal/pkg/transformers/bootstrap.go @@ -111,9 +111,9 @@ func NewTransformerBootstrap(nout int) (*TransformerBootstrap, error) { func (tr *TransformerBootstrap) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) // Not end of input stream: retain the record, and emit nothing until end of stream. @@ -154,7 +154,7 @@ func (tr *TransformerBootstrap) Transform( if nout == 0 { // Emit the stream-terminating null record - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) return } @@ -175,9 +175,9 @@ func (tr *TransformerBootstrap) Transform( index := lib.RandRange(0, nin) recordAndContext := recordArray[index] // Already emitted once; copy - outputChannel <- recordAndContext.Copy() + outputRecordsAndContexts.PushBack(recordAndContext.Copy()) } // Emit the stream-terminating null record - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } diff --git a/internal/pkg/transformers/cat.go b/internal/pkg/transformers/cat.go index f16aabfb8a..8a0a5919ad 100644 --- a/internal/pkg/transformers/cat.go +++ b/internal/pkg/transformers/cat.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "strings" @@ -148,30 +149,30 @@ func NewTransformerCat( func (tr *TransformerCat) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - tr.recordTransformerFunc(inrecAndContext, inputDownstreamDoneChannel, outputDownstreamDoneChannel, outputChannel) + tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel) } // ---------------------------------------------------------------- func (tr *TransformerCat) simpleCat( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } // ---------------------------------------------------------------- func (tr *TransformerCat) countersUngrouped( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -179,15 +180,15 @@ func (tr *TransformerCat) countersUngrouped( key := tr.counterFieldName inrec.PrependCopy(key, types.MlrvalFromInt(tr.counter)) } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } // ---------------------------------------------------------------- func (tr *TransformerCat) countersGrouped( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext 
inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -211,5 +212,5 @@ func (tr *TransformerCat) countersGrouped( key := tr.counterFieldName inrec.PrependCopy(key, types.MlrvalFromInt(counter)) } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } diff --git a/internal/pkg/transformers/check.go b/internal/pkg/transformers/check.go index 7c2238f417..0a0a10d268 100644 --- a/internal/pkg/transformers/check.go +++ b/internal/pkg/transformers/check.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "strings" @@ -90,12 +91,12 @@ func NewTransformerCheck() (*TransformerCheck, error) { func (tr *TransformerCheck) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) if inrecAndContext.EndOfStream { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } diff --git a/internal/pkg/transformers/clean-whitespace.go b/internal/pkg/transformers/clean-whitespace.go index 3df93ab4c2..85f349759c 100644 --- a/internal/pkg/transformers/clean-whitespace.go +++ b/internal/pkg/transformers/clean-whitespace.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "strings" @@ -132,20 +133,20 @@ func NewTransformerCleanWhitespace( func (tr *TransformerCleanWhitespace) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - tr.recordTransformerFunc(inrecAndContext, inputDownstreamDoneChannel, outputDownstreamDoneChannel, outputChannel) + tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel) } // ---------------------------------------------------------------- func (tr *TransformerCleanWhitespace) cleanWhitespaceInKeysAndValues( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { newrec := types.NewMlrmapAsRecord() @@ -159,18 +160,18 @@ func (tr *TransformerCleanWhitespace) cleanWhitespaceInKeysAndValues( newrec.PutReference(newKey.String(), newValue) } - outputChannel <- types.NewRecordAndContext(newrec, &inrecAndContext.Context) + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(newrec, &inrecAndContext.Context)) } else { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } // ---------------------------------------------------------------- func (tr *TransformerCleanWhitespace) cleanWhitespaceInKeys( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- 
*types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { newrec := types.NewMlrmapAsRecord() @@ -182,25 +183,25 @@ func (tr *TransformerCleanWhitespace) cleanWhitespaceInKeys( newrec.PutReference(newKey.String(), pe.Value) } - outputChannel <- types.NewRecordAndContext(newrec, &inrecAndContext.Context) + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(newrec, &inrecAndContext.Context)) } else { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } // ---------------------------------------------------------------- func (tr *TransformerCleanWhitespace) cleanWhitespaceInValues( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { for pe := inrecAndContext.Record.Head; pe != nil; pe = pe.Next { pe.Value = types.BIF_clean_whitespace(pe.Value) } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } diff --git a/internal/pkg/transformers/count-similar.go b/internal/pkg/transformers/count-similar.go index 2d7cf4ebf0..39d381c648 100644 --- a/internal/pkg/transformers/count-similar.go +++ b/internal/pkg/transformers/count-similar.go @@ -127,9 +127,9 @@ func NewTransformerCountSimilar( func (tr *TransformerCountSimilar) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) if !inrecAndContext.EndOfStream { @@ -159,10 +159,10 @@ func (tr *TransformerCountSimilar) Transform( recordAndContext := inner.Value.(*types.RecordAndContext) recordAndContext.Record.PutCopy(tr.counterFieldName, mgroupSize) - outputChannel <- recordAndContext + outputRecordsAndContexts.PushBack(recordAndContext) } } - outputChannel <- inrecAndContext // Emit the stream-terminating null record + outputRecordsAndContexts.PushBack(inrecAndContext) // Emit the stream-terminating null record } } diff --git a/internal/pkg/transformers/count.go b/internal/pkg/transformers/count.go index 1134cd4157..02ddd80fe1 100644 --- a/internal/pkg/transformers/count.go +++ b/internal/pkg/transformers/count.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "strings" @@ -152,39 +153,38 @@ func NewTransformerCount( func (tr *TransformerCount) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - tr.recordTransformerFunc(inrecAndContext, inputDownstreamDoneChannel, outputDownstreamDoneChannel, outputChannel) + tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel) } // ---------------------------------------------------------------- func (tr *TransformerCount) countUngrouped( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of 
*types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { tr.ungroupedCount++ } else { newrec := types.NewMlrmapAsRecord() newrec.PutCopy(tr.outputFieldName, types.MlrvalFromInt(tr.ungroupedCount)) + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(newrec, &inrecAndContext.Context)) - outputChannel <- types.NewRecordAndContext(newrec, &inrecAndContext.Context) - - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } // ---------------------------------------------------------------- func (tr *TransformerCount) countGrouped( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -213,7 +213,7 @@ func (tr *TransformerCount) countGrouped( newrec.PutCopy(tr.outputFieldName, types.MlrvalFromInt(tr.groupedCounts.FieldCount)) outrecAndContext := types.NewRecordAndContext(newrec, &inrecAndContext.Context) - outputChannel <- outrecAndContext + outputRecordsAndContexts.PushBack(outrecAndContext) } else { for outer := tr.groupedCounts.Head; outer != nil; outer = outer.Next { @@ -238,10 +238,10 @@ func (tr *TransformerCount) countGrouped( newrec.PutCopy(tr.outputFieldName, types.MlrvalFromInt(countForGroup)) outrecAndContext := types.NewRecordAndContext(newrec, &inrecAndContext.Context) - outputChannel <- outrecAndContext + outputRecordsAndContexts.PushBack(outrecAndContext) } } - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } diff --git a/internal/pkg/transformers/cut.go b/internal/pkg/transformers/cut.go index 7aa5a833aa..8f90ee902e 100644 --- a/internal/pkg/transformers/cut.go +++ b/internal/pkg/transformers/cut.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "regexp" @@ -182,21 +183,21 @@ func NewTransformerCut( func (tr *TransformerCut) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - tr.recordTransformerFunc(inrecAndContext, inputDownstreamDoneChannel, outputDownstreamDoneChannel, outputChannel) + tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel) } // ---------------------------------------------------------------- // mlr cut -f a,b,c func (tr *TransformerCut) includeWithInputOrder( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -209,9 +210,9 @@ func (tr *TransformerCut) includeWithInputOrder( } } outrecAndContext := types.NewRecordAndContext(outrec, &inrecAndContext.Context) - outputChannel <- outrecAndContext + outputRecordsAndContexts.PushBack(outrecAndContext) 
} else { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } @@ -219,9 +220,9 @@ func (tr *TransformerCut) includeWithInputOrder( // mlr cut -o -f a,b,c func (tr *TransformerCut) includeWithArgOrder( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -233,9 +234,9 @@ func (tr *TransformerCut) includeWithArgOrder( } } outrecAndContext := types.NewRecordAndContext(outrec, &inrecAndContext.Context) - outputChannel <- outrecAndContext + outputRecordsAndContexts.PushBack(outrecAndContext) } else { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } @@ -243,9 +244,9 @@ func (tr *TransformerCut) includeWithArgOrder( // mlr cut -x -f a,b,c func (tr *TransformerCut) exclude( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -255,15 +256,15 @@ func (tr *TransformerCut) exclude( } } } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } // ---------------------------------------------------------------- func (tr *TransformerCut) processWithRegexes( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -283,8 +284,8 @@ func (tr *TransformerCut) processWithRegexes( newrec.PutReference(pe.Key, pe.Value) } } - outputChannel <- types.NewRecordAndContext(newrec, &inrecAndContext.Context) + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(newrec, &inrecAndContext.Context)) } else { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } diff --git a/internal/pkg/transformers/decimate.go b/internal/pkg/transformers/decimate.go index 051c6fdbb6..661ffeb23f 100644 --- a/internal/pkg/transformers/decimate.go +++ b/internal/pkg/transformers/decimate.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "strings" @@ -144,9 +145,9 @@ func NewTransformerDecimate( func (tr *TransformerDecimate) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) if !inrecAndContext.EndOfStream { @@ -165,13 +166,13 @@ func (tr *TransformerDecimate) Transform( remainder := countForGroup % tr.decimateCount if remainder == tr.remainderToKeep { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } countForGroup++ tr.countsByGroup[groupingKey] = countForGroup } else { - outputChannel <- inrecAndContext // Emit the stream-terminating null record + outputRecordsAndContexts.PushBack(inrecAndContext) // Emit the stream-terminating null record } } diff 
--git a/internal/pkg/transformers/fill-down.go b/internal/pkg/transformers/fill-down.go index f21f516104..f96429a071 100644 --- a/internal/pkg/transformers/fill-down.go +++ b/internal/pkg/transformers/fill-down.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "strings" @@ -150,20 +151,20 @@ func NewTransformerFillDown( func (tr *TransformerFillDown) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - tr.recordTransformerFunc(inrecAndContext, inputDownstreamDoneChannel, outputDownstreamDoneChannel, outputChannel) + tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel) } // ---------------------------------------------------------------- func (tr *TransformerFillDown) transformSpecified( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -189,19 +190,19 @@ func (tr *TransformerFillDown) transformSpecified( } } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } // ---------------------------------------------------------------- func (tr *TransformerFillDown) transformAll( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -228,9 +229,9 @@ func (tr *TransformerFillDown) transformAll( } } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } diff --git a/internal/pkg/transformers/fill-empty.go b/internal/pkg/transformers/fill-empty.go index e947d978a6..420656bd16 100644 --- a/internal/pkg/transformers/fill-empty.go +++ b/internal/pkg/transformers/fill-empty.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "strings" @@ -112,9 +113,9 @@ func NewTransformerFillEmpty( func (tr *TransformerFillEmpty) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) if !inrecAndContext.EndOfStream { @@ -126,9 +127,9 @@ func (tr *TransformerFillEmpty) Transform( } } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { // end of record stream - outputChannel <- inrecAndContext // emit end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // emit end-of-stream marker 
} } diff --git a/internal/pkg/transformers/flatten.go b/internal/pkg/transformers/flatten.go index a8dfc50b2b..fff0f0fee3 100644 --- a/internal/pkg/transformers/flatten.go +++ b/internal/pkg/transformers/flatten.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "strings" @@ -137,20 +138,20 @@ func NewTransformerFlatten( func (tr *TransformerFlatten) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - tr.recordTransformerFunc(inrecAndContext, inputDownstreamDoneChannel, outputDownstreamDoneChannel, outputChannel) + tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel) } // ---------------------------------------------------------------- func (tr *TransformerFlatten) flattenAll( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -159,18 +160,18 @@ func (tr *TransformerFlatten) flattenAll( oFlatSep = tr.options.WriterOptions.FLATSEP } inrec.Flatten(oFlatSep) - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } // ---------------------------------------------------------------- func (tr *TransformerFlatten) flattenSome( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -179,8 +180,8 @@ func (tr *TransformerFlatten) flattenSome( oFlatSep = tr.options.WriterOptions.FLATSEP } inrec.FlattenFields(tr.fieldNameSet, oFlatSep) - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } diff --git a/internal/pkg/transformers/format-values.go b/internal/pkg/transformers/format-values.go index 4aed82aee1..1a74ed14c0 100644 --- a/internal/pkg/transformers/format-values.go +++ b/internal/pkg/transformers/format-values.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "strings" @@ -170,13 +171,13 @@ func NewTransformerFormatValues( func (tr *TransformerFormatValues) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) if inrecAndContext.EndOfStream { - outputChannel <- inrecAndContext // emit end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // emit end-of-stream marker return } @@ -201,5 +202,5 @@ func (tr 
*TransformerFormatValues) Transform( } } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } diff --git a/internal/pkg/transformers/fraction.go b/internal/pkg/transformers/fraction.go index 9534898780..75a330ba4e 100644 --- a/internal/pkg/transformers/fraction.go +++ b/internal/pkg/transformers/fraction.go @@ -192,9 +192,9 @@ func NewTransformerFraction( func (tr *TransformerFraction) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) if !inrecAndContext.EndOfStream { // Not end of stream; pass 1 @@ -285,8 +285,8 @@ func (tr *TransformerFraction) Transform( } } - outputChannel <- types.NewRecordAndContext(outrec, &endOfStreamContext) + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, &endOfStreamContext)) } - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } diff --git a/internal/pkg/transformers/gap.go b/internal/pkg/transformers/gap.go index 34c6782eb0..61e31b4d3e 100644 --- a/internal/pkg/transformers/gap.go +++ b/internal/pkg/transformers/gap.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "strings" @@ -139,39 +140,39 @@ func NewTransformerGap( func (tr *TransformerGap) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - tr.recordTransformerFunc(inrecAndContext, inputDownstreamDoneChannel, outputDownstreamDoneChannel, outputChannel) + tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel) } func (tr *TransformerGap) transformUnkeyed( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { if tr.recordCount > 0 && tr.recordCount%tr.gapCount == 0 { newrec := types.NewMlrmapAsRecord() - outputChannel <- types.NewRecordAndContext(newrec, &inrecAndContext.Context) + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(newrec, &inrecAndContext.Context)) } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) tr.recordCount++ } else { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } func (tr *TransformerGap) transformKeyed( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -183,15 +184,15 @@ func (tr *TransformerGap) transformKeyed( if groupingKey != tr.previousGroupingKey && tr.recordCount > 0 { newrec := types.NewMlrmapAsRecord() - outputChannel <- types.NewRecordAndContext(newrec, &inrecAndContext.Context) 
+ outputRecordsAndContexts.PushBack(types.NewRecordAndContext(newrec, &inrecAndContext.Context)) } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) tr.previousGroupingKey = groupingKey tr.recordCount++ } else { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } diff --git a/internal/pkg/transformers/grep.go b/internal/pkg/transformers/grep.go index 2032e8bfa5..d7942db741 100644 --- a/internal/pkg/transformers/grep.go +++ b/internal/pkg/transformers/grep.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "regexp" @@ -148,9 +149,9 @@ func NewTransformerGrep( func (tr *TransformerGrep) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) if !inrecAndContext.EndOfStream { @@ -159,14 +160,14 @@ func (tr *TransformerGrep) Transform( matches := tr.regexp.Match([]byte(inrecAsString)) if tr.invert { if !matches { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } else { if matches { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } } else { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } diff --git a/internal/pkg/transformers/group-by.go b/internal/pkg/transformers/group-by.go index 3f79f92199..7b84c5180c 100644 --- a/internal/pkg/transformers/group-by.go +++ b/internal/pkg/transformers/group-by.go @@ -116,9 +116,9 @@ func NewTransformerGroupBy( func (tr *TransformerGroupBy) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) if !inrecAndContext.EndOfStream { @@ -141,9 +141,9 @@ func (tr *TransformerGroupBy) Transform( for outer := tr.recordListsByGroup.Head; outer != nil; outer = outer.Next { recordListForGroup := outer.Value.(*list.List) for inner := recordListForGroup.Front(); inner != nil; inner = inner.Next() { - outputChannel <- inner.Value.(*types.RecordAndContext) + outputRecordsAndContexts.PushBack(inner.Value.(*types.RecordAndContext)) } } - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } diff --git a/internal/pkg/transformers/group-like.go b/internal/pkg/transformers/group-like.go index 6946a99620..77a8287099 100644 --- a/internal/pkg/transformers/group-like.go +++ b/internal/pkg/transformers/group-like.go @@ -99,9 +99,9 @@ func NewTransformerGroupLike() (*TransformerGroupLike, error) { func (tr *TransformerGroupLike) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) if !inrecAndContext.EndOfStream { @@ -121,9 +121,9 @@ func (tr *TransformerGroupLike) Transform( for outer := tr.recordListsByGroup.Head; outer 
!= nil; outer = outer.Next { recordListForGroup := outer.Value.(*list.List) for inner := recordListForGroup.Front(); inner != nil; inner = inner.Next() { - outputChannel <- inner.Value.(*types.RecordAndContext) + outputRecordsAndContexts.PushBack(inner.Value.(*types.RecordAndContext)) } } - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } diff --git a/internal/pkg/transformers/having-fields.go b/internal/pkg/transformers/having-fields.go index cdd8986903..2e0fd641ea 100644 --- a/internal/pkg/transformers/having-fields.go +++ b/internal/pkg/transformers/having-fields.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "regexp" @@ -222,20 +223,20 @@ func NewTransformerHavingFields( func (tr *TransformerHavingFields) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - tr.recordTransformerFunc(inrecAndContext, inputDownstreamDoneChannel, outputDownstreamDoneChannel, outputChannel) + tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel) } // ---------------------------------------------------------------- func (tr *TransformerHavingFields) transformHavingFieldsAtLeast( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -244,22 +245,22 @@ func (tr *TransformerHavingFields) transformHavingFieldsAtLeast( if tr.fieldNameSet[pe.Key] { numFound++ if numFound == tr.numFieldNames { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) return } } } } else { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } func (tr *TransformerHavingFields) transformHavingFieldsWhichAre( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -271,17 +272,17 @@ func (tr *TransformerHavingFields) transformHavingFieldsWhichAre( return } } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } func (tr *TransformerHavingFields) transformHavingFieldsAtMost( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -290,18 +291,18 @@ func (tr *TransformerHavingFields) transformHavingFieldsAtMost( return } } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { - outputChannel <- inrecAndContext + 
outputRecordsAndContexts.PushBack(inrecAndContext) } } // ---------------------------------------------------------------- func (tr *TransformerHavingFields) transformHavingAllFieldsMatching( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -310,36 +311,36 @@ func (tr *TransformerHavingFields) transformHavingAllFieldsMatching( return } } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } func (tr *TransformerHavingFields) transformHavingAnyFieldsMatching( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record for pe := inrec.Head; pe != nil; pe = pe.Next { if tr.regex.MatchString(pe.Key) { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) return } } } else { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } func (tr *TransformerHavingFields) transformHavingNoFieldsMatching( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -348,8 +349,8 @@ func (tr *TransformerHavingFields) transformHavingNoFieldsMatching( return } } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } diff --git a/internal/pkg/transformers/head.go b/internal/pkg/transformers/head.go index 14b3713fa0..05235f8b70 100644 --- a/internal/pkg/transformers/head.go +++ b/internal/pkg/transformers/head.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "strings" @@ -136,24 +137,24 @@ func NewTransformerHead( func (tr *TransformerHead) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - tr.recordTransformerFunc(inrecAndContext, inputDownstreamDoneChannel, outputDownstreamDoneChannel, outputChannel) + tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel) } func (tr *TransformerHead) transformUnkeyed( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { tr.unkeyedRecordCount++ if tr.unkeyedRecordCount <= tr.headCount { - outputChannel <- inrecAndContext + 
outputRecordsAndContexts.PushBack(inrecAndContext) } else if !tr.wroteDownstreamDone { // Signify to data producers upstream that we'll ignore further // data, so as far as we're concerned they can stop sending it. See @@ -162,15 +163,15 @@ func (tr *TransformerHead) transformUnkeyed( tr.wroteDownstreamDone = true } } else { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } func (tr *TransformerHead) transformKeyed( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -190,10 +191,10 @@ func (tr *TransformerHead) transformKeyed( } if count <= tr.headCount { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } else { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } diff --git a/internal/pkg/transformers/histogram.go b/internal/pkg/transformers/histogram.go index 6262a6975e..47f625b5fa 100644 --- a/internal/pkg/transformers/histogram.go +++ b/internal/pkg/transformers/histogram.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "strings" @@ -197,26 +198,26 @@ func NewTransformerHistogram( func (tr *TransformerHistogram) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - tr.recordTransformerFunc(inrecAndContext, inputDownstreamDoneChannel, outputDownstreamDoneChannel, outputChannel) + tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel) } // ---------------------------------------------------------------- func (tr *TransformerHistogram) transformNonAuto( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { tr.ingestNonAuto(inrecAndContext) } else { - tr.emitNonAuto(&inrecAndContext.Context, outputChannel) - outputChannel <- inrecAndContext // end-of-stream marker + tr.emitNonAuto(&inrecAndContext.Context, outputRecordsAndContexts) + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } @@ -249,7 +250,7 @@ func (tr *TransformerHistogram) ingestNonAuto( func (tr *TransformerHistogram) emitNonAuto( endOfStreamContext *types.Context, - outputChannel chan<- *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext ) { countFieldNames := make(map[string]string) for _, valueFieldName := range tr.valueFieldNames { @@ -274,22 +275,22 @@ func (tr *TransformerHistogram) emitNonAuto( ) } - outputChannel <- types.NewRecordAndContext(outrec, endOfStreamContext) + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, endOfStreamContext)) } } // ---------------------------------------------------------------- func (tr *TransformerHistogram) transformAuto( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // 
list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { tr.ingestAuto(inrecAndContext) } else { - tr.emitAuto(&inrecAndContext.Context, outputChannel) - outputChannel <- inrecAndContext // end-of-stream marker + tr.emitAuto(&inrecAndContext.Context, outputRecordsAndContexts) + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } @@ -308,7 +309,7 @@ func (tr *TransformerHistogram) ingestAuto( func (tr *TransformerHistogram) emitAuto( endOfStreamContext *types.Context, - outputChannel chan<- *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext ) { haveLoHi := false lo := 0.0 @@ -380,6 +381,6 @@ func (tr *TransformerHistogram) emitAuto( ) } - outputChannel <- types.NewRecordAndContext(outrec, endOfStreamContext) + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, endOfStreamContext)) } } diff --git a/internal/pkg/transformers/join.go b/internal/pkg/transformers/join.go index d5c44c7b05..7f9eca2e2d 100644 --- a/internal/pkg/transformers/join.go +++ b/internal/pkg/transformers/join.go @@ -339,12 +339,12 @@ func NewTransformerJoin( func (tr *TransformerJoin) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - tr.recordTransformerFunc(inrecAndContext, inputDownstreamDoneChannel, outputDownstreamDoneChannel, outputChannel) + tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel) } // ---------------------------------------------------------------- @@ -352,9 +352,9 @@ func (tr *TransformerJoin) Transform( // matching each right record against those. func (tr *TransformerJoin) transformHalfStreaming( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { // This can't be done in the CLI-parser since it requires information which // isn't known until after the CLI-parser is called. 
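Across all of these hunks the refactor follows one mechanical rule: each transformer's Transform gains an outputRecordsAndContexts *list.List parameter in second position, drops the trailing outputChannel chan<- *types.RecordAndContext parameter, and every "outputChannel <- x" becomes "outputRecordsAndContexts.PushBack(x)". A minimal self-contained sketch of the resulting contract -- toy types standing in for internal/pkg/types, so this illustrates the shape, not the actual interface:

package main

import (
	"container/list"
	"fmt"
)

// Stand-in for types.RecordAndContext.
type RecordAndContext struct {
	EndOfStream bool
	Payload     string
}

// Under the batched API, a transformer appends its outputs (including
// the end-of-stream marker) to a caller-supplied list; the caller
// drains the list once per call instead of reading from a channel.
func passThrough(
	inrecAndContext *RecordAndContext,
	outputRecordsAndContexts *list.List, // list of *RecordAndContext
) {
	outputRecordsAndContexts.PushBack(inrecAndContext)
}

func main() {
	outputs := list.New()
	passThrough(&RecordAndContext{Payload: "a=1"}, outputs)
	passThrough(&RecordAndContext{EndOfStream: true}, outputs)
	for e := outputs.Front(); e != nil; e = e.Next() {
		fmt.Printf("%+v\n", e.Value.(*RecordAndContext))
	}
}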
@@ -375,7 +375,7 @@ func (tr *TransformerJoin) transformHalfStreaming( iLeftBucket := tr.leftBucketsByJoinFieldValues.Get(groupingKey) if iLeftBucket == nil { if tr.opts.emitRightUnpairables { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } else { leftBucket := iLeftBucket.(*utils.JoinBucket) @@ -384,29 +384,29 @@ func (tr *TransformerJoin) transformHalfStreaming( tr.formAndEmitPairs( leftBucket.RecordsAndContexts, inrecAndContext, - outputChannel, + outputRecordsAndContexts, ) } } } else if tr.opts.emitRightUnpairables { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } else { // end of record stream if tr.opts.emitLeftUnpairables { - tr.emitLeftUnpairedBuckets(outputChannel) - tr.emitLeftUnpairables(outputChannel) + tr.emitLeftUnpairedBuckets(outputRecordsAndContexts) + tr.emitLeftUnpairables(outputRecordsAndContexts) } - outputChannel <- inrecAndContext // emit end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // emit end-of-stream marker } } // ---------------------------------------------------------------- func (tr *TransformerJoin) transformDoublyStreaming( rightRecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { keeper := tr.joinBucketKeeper // keystroke-saver @@ -421,29 +421,29 @@ func (tr *TransformerJoin) transformDoublyStreaming( isPaired = keeper.FindJoinBucket(rightFieldValues) } if tr.opts.emitLeftUnpairables { - keeper.OutputAndReleaseLeftUnpaireds(outputChannel) + keeper.OutputAndReleaseLeftUnpaireds(outputRecordsAndContexts) } else { - keeper.ReleaseLeftUnpaireds(outputChannel) + keeper.ReleaseLeftUnpaireds(outputRecordsAndContexts) } lefts := keeper.JoinBucket.RecordsAndContexts // keystroke-saver if !isPaired && tr.opts.emitRightUnpairables { - outputChannel <- rightRecAndContext + outputRecordsAndContexts.PushBack(rightRecAndContext) } if isPaired && tr.opts.emitPairables && lefts != nil { - tr.formAndEmitPairs(lefts, rightRecAndContext, outputChannel) + tr.formAndEmitPairs(lefts, rightRecAndContext, outputRecordsAndContexts) } } else { // end of record stream keeper.FindJoinBucket(nil) if tr.opts.emitLeftUnpairables { - keeper.OutputAndReleaseLeftUnpaireds(outputChannel) + keeper.OutputAndReleaseLeftUnpaireds(outputRecordsAndContexts) } - outputChannel <- rightRecAndContext // emit end-of-stream marker + outputRecordsAndContexts.PushBack(rightRecAndContext) // emit end-of-stream marker } } @@ -532,7 +532,7 @@ func (tr *TransformerJoin) ingestLeftFile() { func (tr *TransformerJoin) formAndEmitPairs( leftRecordsAndContexts *list.List, rightRecordAndContext *types.RecordAndContext, - outputChannel chan<- *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext ) { ////fmt.Println("-- pairs start") // VERBOSE // Loop over each to-be-paired-with record from the left file. @@ -582,7 +582,7 @@ func (tr *TransformerJoin) formAndEmitPairs( outrecAndContext := types.NewRecordAndContext(outrec, &context) // Emit the new joined record on the downstream channel - outputChannel <- outrecAndContext + outputRecordsAndContexts.PushBack(outrecAndContext) } ////fmt.Println("-- pairs end") // VERBOSE } @@ -598,24 +598,24 @@ func (tr *TransformerJoin) formAndEmitPairs( // in the second category. 
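The emit helpers below, and their counterparts in join-bucket-keeper, all walk a container/list the same way; list.List elements are untyped, so each read needs a type assertion at the use site. The traversal idiom in isolation, as a sketch (drainInOrder is a hypothetical name, not a helper in this codebase):

package main

import "container/list"

// Append every element of src to dst, preserving order; consumers
// assert *types.RecordAndContext at the use site, as
// emitLeftUnpairables does below.
func drainInOrder(src, dst *list.List) {
	for e := src.Front(); e != nil; e = e.Next() {
		dst.PushBack(e.Value)
	}
}

func main() {
	src, dst := list.New(), list.New()
	src.PushBack("r1")
	src.PushBack("r2")
	drainInOrder(src, dst)
}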
func (tr *TransformerJoin) emitLeftUnpairables( - outputChannel chan<- *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext ) { // Loop over each to-be-paired-with record from the left file. for pe := tr.leftUnpairableRecordsAndContexts.Front(); pe != nil; pe = pe.Next() { leftRecordAndContext := pe.Value.(*types.RecordAndContext) - outputChannel <- leftRecordAndContext + outputRecordsAndContexts.PushBack(leftRecordAndContext) } } func (tr *TransformerJoin) emitLeftUnpairedBuckets( - outputChannel chan<- *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext ) { for pe := tr.leftBucketsByJoinFieldValues.Head; pe != nil; pe = pe.Next { bucket := pe.Value.(*utils.JoinBucket) if !bucket.WasPaired { for pf := bucket.RecordsAndContexts.Front(); pf != nil; pf = pf.Next() { recordAndContext := pf.Value.(*types.RecordAndContext) - outputChannel <- recordAndContext + outputRecordsAndContexts.PushBack(recordAndContext) } } } diff --git a/internal/pkg/transformers/json-parse.go b/internal/pkg/transformers/json-parse.go index fa8c52415d..846feca197 100644 --- a/internal/pkg/transformers/json-parse.go +++ b/internal/pkg/transformers/json-parse.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "strings" @@ -124,38 +125,38 @@ func NewTransformerJSONParse( func (tr *TransformerJSONParse) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - tr.recordTransformerFunc(inrecAndContext, inputDownstreamDoneChannel, outputDownstreamDoneChannel, outputChannel) + tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel) } // ---------------------------------------------------------------- func (tr *TransformerJSONParse) jsonParseAll( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record for pe := inrec.Head; pe != nil; pe = pe.Next { pe.JSONParseInPlace() } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } // ---------------------------------------------------------------- func (tr *TransformerJSONParse) jsonParseSome( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -164,8 +165,8 @@ func (tr *TransformerJSONParse) jsonParseSome( pe.JSONParseInPlace() } } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } diff --git a/internal/pkg/transformers/json-stringify.go 
b/internal/pkg/transformers/json-stringify.go index ada2a1ff58..68154e18ff 100644 --- a/internal/pkg/transformers/json-stringify.go +++ b/internal/pkg/transformers/json-stringify.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "strings" @@ -143,38 +144,38 @@ func NewTransformerJSONStringify( func (tr *TransformerJSONStringify) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - tr.recordTransformerFunc(inrecAndContext, inputDownstreamDoneChannel, outputDownstreamDoneChannel, outputChannel) + tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel) } // ---------------------------------------------------------------- func (tr *TransformerJSONStringify) jsonStringifyAll( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record for pe := inrec.Head; pe != nil; pe = pe.Next { pe.JSONStringifyInPlace(tr.jsonFormatting) } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } // ---------------------------------------------------------------- func (tr *TransformerJSONStringify) jsonStringifySome( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -183,8 +184,8 @@ func (tr *TransformerJSONStringify) jsonStringifySome( pe.JSONStringifyInPlace(tr.jsonFormatting) } } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } diff --git a/internal/pkg/transformers/label.go b/internal/pkg/transformers/label.go index 39c70e550b..c4b5875f5d 100644 --- a/internal/pkg/transformers/label.go +++ b/internal/pkg/transformers/label.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "errors" "fmt" "os" @@ -128,14 +129,14 @@ func NewTransformerLabel( func (tr *TransformerLabel) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record inrec.Label(tr.newNames) } - outputChannel <- inrecAndContext // including end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // including end-of-stream marker } diff --git a/internal/pkg/transformers/merge-fields.go 
b/internal/pkg/transformers/merge-fields.go index 56f2846a32..bfdef6e9e4 100644 --- a/internal/pkg/transformers/merge-fields.go +++ b/internal/pkg/transformers/merge-fields.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "errors" "fmt" "os" @@ -317,23 +318,23 @@ func NewTransformerMergeFields( func (tr *TransformerMergeFields) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - tr.recordTransformerFunc(inrecAndContext, inputDownstreamDoneChannel, outputDownstreamDoneChannel, outputChannel) + tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel) } // ---------------------------------------------------------------- func (tr *TransformerMergeFields) transformByNameList( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if inrecAndContext.EndOfStream { - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker return } @@ -373,18 +374,18 @@ func (tr *TransformerMergeFields) transformByNameList( inrec.PutReference(key, value) } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } // ---------------------------------------------------------------- func (tr *TransformerMergeFields) transformByNameRegex( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if inrecAndContext.EndOfStream { - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker return } @@ -448,7 +449,7 @@ func (tr *TransformerMergeFields) transformByNameRegex( inrec.PutReference(key, value) } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } // ---------------------------------------------------------------- @@ -460,12 +461,12 @@ func (tr *TransformerMergeFields) transformByNameRegex( func (tr *TransformerMergeFields) transformByCollapsing( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if inrecAndContext.EndOfStream { - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker return } @@ -554,5 +555,5 @@ func (tr *TransformerMergeFields) transformByCollapsing( } } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } diff --git a/internal/pkg/transformers/most-or-least-frequent.go b/internal/pkg/transformers/most-or-least-frequent.go index 80929ba4c7..fb7a6c7de2 100644 --- a/internal/pkg/transformers/most-or-least-frequent.go +++ b/internal/pkg/transformers/most-or-least-frequent.go @@ -1,6 +1,7 @@ package transformers import ( + 
"container/list" "fmt" "os" "sort" @@ -210,9 +211,9 @@ func NewTransformerMostOrLeastFrequent( func (tr *TransformerMostOrLeastFrequent) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) if !inrecAndContext.EndOfStream { @@ -285,9 +286,9 @@ func (tr *TransformerMostOrLeastFrequent) Transform( if tr.showCounts { outrec.PutReference(tr.outputFieldName, types.MlrvalFromInt(sortPairs[i].count)) } - outputChannel <- types.NewRecordAndContext(outrec, &inrecAndContext.Context) + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, &inrecAndContext.Context)) } - outputChannel <- inrecAndContext // End-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // End-of-stream marker } } diff --git a/internal/pkg/transformers/nest.go b/internal/pkg/transformers/nest.go index 35c427ebbd..71e7e7fa6c 100644 --- a/internal/pkg/transformers/nest.go +++ b/internal/pkg/transformers/nest.go @@ -302,27 +302,27 @@ func NewTransformerNest( func (tr *TransformerNest) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - tr.recordTransformerFunc(inrecAndContext, inputDownstreamDoneChannel, outputDownstreamDoneChannel, outputChannel) + tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel) } // ---------------------------------------------------------------- func (tr *TransformerNest) explodeValuesAcrossFields( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record originalEntry := inrec.GetEntry(tr.fieldName) if originalEntry == nil { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) return } @@ -341,25 +341,25 @@ func (tr *TransformerNest) explodeValuesAcrossFields( } inrec.Unlink(originalEntry) - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { - outputChannel <- inrecAndContext // emit end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // emit end-of-stream marker } } // ---------------------------------------------------------------- func (tr *TransformerNest) explodeValuesAcrossRecords( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record mvalue := inrec.Get(tr.fieldName) if mvalue == nil { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) return } svalue := mvalue.String() @@ -369,27 +369,27 @@ func (tr *TransformerNest) explodeValuesAcrossRecords( for _, piece := range pieces { 
outrec := inrec.Copy() outrec.PutReference(tr.fieldName, types.MlrvalFromString(piece)) - outputChannel <- types.NewRecordAndContext(outrec, &inrecAndContext.Context) + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, &inrecAndContext.Context)) } } else { - outputChannel <- inrecAndContext // emit end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // emit end-of-stream marker } } // ---------------------------------------------------------------- func (tr *TransformerNest) explodePairsAcrossFields( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record originalEntry := inrec.GetEntry(tr.fieldName) if originalEntry == nil { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) return } @@ -416,25 +416,25 @@ func (tr *TransformerNest) explodePairsAcrossFields( } inrec.Unlink(originalEntry) - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { - outputChannel <- inrecAndContext // emit end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // emit end-of-stream marker } } // ---------------------------------------------------------------- func (tr *TransformerNest) explodePairsAcrossRecords( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record mvalue := inrec.Get(tr.fieldName) if mvalue == nil { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) return } @@ -455,20 +455,20 @@ func (tr *TransformerNest) explodePairsAcrossRecords( } outrec.Unlink(originalEntry) - outputChannel <- types.NewRecordAndContext(outrec, &inrecAndContext.Context) + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, &inrecAndContext.Context)) } } else { - outputChannel <- inrecAndContext // emit end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // emit end-of-stream marker } } // ---------------------------------------------------------------- func (tr *TransformerNest) implodeValuesAcrossFields( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -505,26 +505,26 @@ func (tr *TransformerNest) implodeValuesAcrossFields( } } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { - outputChannel <- inrecAndContext // emit end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // emit end-of-stream marker } } // ---------------------------------------------------------------- func (tr *TransformerNest) implodeValueAcrossRecords( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if 
!inrecAndContext.EndOfStream { inrec := inrecAndContext.Record originalEntry := inrec.GetEntry(tr.fieldName) if originalEntry == nil { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) return } @@ -577,11 +577,11 @@ func (tr *TransformerNest) implodeValueAcrossRecords( // tr.fieldName was already present so we'll overwrite it in-place here. outrec.PutReference(tr.fieldName, types.MlrvalFromString(buffer.String())) - outputChannel <- types.NewRecordAndContext(outrec, &inrecAndContext.Context) + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, &inrecAndContext.Context)) } } - outputChannel <- inrecAndContext // emit end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // emit end-of-stream marker } } diff --git a/internal/pkg/transformers/nothing.go b/internal/pkg/transformers/nothing.go index c3cead81f9..782731ff39 100644 --- a/internal/pkg/transformers/nothing.go +++ b/internal/pkg/transformers/nothing.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "strings" @@ -90,12 +91,12 @@ func NewTransformerNothing() (*TransformerNothing, error) { func (tr *TransformerNothing) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) if inrecAndContext.EndOfStream { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } diff --git a/internal/pkg/transformers/put-or-filter.go b/internal/pkg/transformers/put-or-filter.go index 2454279f3f..11d5755b4a 100644 --- a/internal/pkg/transformers/put-or-filter.go +++ b/internal/pkg/transformers/put-or-filter.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "errors" "fmt" "os" @@ -506,12 +507,12 @@ func BuildASTFromString(dslString string) (*dsl.AST, error) { func (tr *TransformerPut) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - tr.runtimeState.OutputChannel = outputChannel + tr.runtimeState.OutputRecordsAndContexts = outputRecordsAndContexts inrec := inrecAndContext.Record context := inrecAndContext.Context @@ -545,10 +546,7 @@ func (tr *TransformerPut) Transform( } wantToEmit := lib.BooleanXOR(filterBool, tr.invertFilter) if wantToEmit { - outputChannel <- types.NewRecordAndContext( - outrec, - &context, - ) + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, &context)) } } @@ -576,6 +574,6 @@ func (tr *TransformerPut) Transform( // indicator. 
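Worth noting at this step: the terminator pushed here is itself a record -- types.NewEndOfStreamMarker(&context) wraps the final Context -- so end-of-stream travels through the same output list as data records rather than via a channel close. A minimal sketch of that convention with stand-in types (the real ones live in internal/pkg/types):

package main

import (
	"container/list"
	"fmt"
)

type Context struct{ NR int }

type RecordAndContext struct {
	EndOfStream bool
	Context     Context
}

// Stand-in for types.NewEndOfStreamMarker: an end-of-stream record
// that still carries the final context downstream.
func NewEndOfStreamMarker(context *Context) *RecordAndContext {
	return &RecordAndContext{EndOfStream: true, Context: *context}
}

func main() {
	outputs := list.New()
	context := Context{NR: 42}
	outputs.PushBack(NewEndOfStreamMarker(&context))
	last := outputs.Back().Value.(*RecordAndContext)
	fmt.Println(last.EndOfStream, last.Context.NR) // true 42
}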
tr.cstRootNode.ProcessEndOfStream() - outputChannel <- types.NewEndOfStreamMarker(&context) + outputRecordsAndContexts.PushBack(types.NewEndOfStreamMarker(&context)) } } diff --git a/internal/pkg/transformers/regularize.go b/internal/pkg/transformers/regularize.go index b0ccf4c854..22ab2fbf8d 100644 --- a/internal/pkg/transformers/regularize.go +++ b/internal/pkg/transformers/regularize.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "strings" @@ -96,9 +97,9 @@ func NewTransformerRegularize() (*TransformerRegularize, error) { func (tr *TransformerRegularize) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) if !inrecAndContext.EndOfStream { @@ -109,16 +110,16 @@ func (tr *TransformerRegularize) Transform( previousSortedFieldNames := tr.sortedToOriginal[currentSortedFieldNamesJoined] if previousSortedFieldNames == nil { tr.sortedToOriginal[currentSortedFieldNamesJoined] = currentFieldNames - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { outrec := types.NewMlrmapAsRecord() for _, fieldName := range previousSortedFieldNames { outrec.PutReference(fieldName, inrec.Get(fieldName)) // inrec will be GC'ed } outrecAndContext := types.NewRecordAndContext(outrec, &inrecAndContext.Context) - outputChannel <- outrecAndContext + outputRecordsAndContexts.PushBack(outrecAndContext) } } else { - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } diff --git a/internal/pkg/transformers/remove-empty-columns.go b/internal/pkg/transformers/remove-empty-columns.go index 076bcbabb8..d6b60f8bd5 100644 --- a/internal/pkg/transformers/remove-empty-columns.go +++ b/internal/pkg/transformers/remove-empty-columns.go @@ -97,9 +97,9 @@ func NewTransformerRemoveEmptyColumns() (*TransformerRemoveEmptyColumns, error) func (tr *TransformerRemoveEmptyColumns) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) if !inrecAndContext.EndOfStream { @@ -128,9 +128,9 @@ func (tr *TransformerRemoveEmptyColumns) Transform( } } - outputChannel <- types.NewRecordAndContext(newrec, &outrecAndContext.Context) + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(newrec, &outrecAndContext.Context)) } - outputChannel <- inrecAndContext // Emit the stream-terminating null record + outputRecordsAndContexts.PushBack(inrecAndContext) // Emit the stream-terminating null record } } diff --git a/internal/pkg/transformers/rename.go b/internal/pkg/transformers/rename.go index ab860fc391..b422bdc610 100644 --- a/internal/pkg/transformers/rename.go +++ b/internal/pkg/transformers/rename.go @@ -191,20 +191,20 @@ func NewTransformerRename( func (tr *TransformerRename) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- 
*types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - tr.recordTransformerFunc(inrecAndContext, inputDownstreamDoneChannel, outputDownstreamDoneChannel, outputChannel) + tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel) } // ---------------------------------------------------------------- func (tr *TransformerRename) transformWithoutRegexes( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -217,15 +217,15 @@ func (tr *TransformerRename) transformWithoutRegexes( } } - outputChannel <- inrecAndContext // including end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // including end-of-stream marker } // ---------------------------------------------------------------- func (tr *TransformerRename) transformWithRegexes( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -252,8 +252,8 @@ func (tr *TransformerRename) transformWithRegexes( } } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { - outputChannel <- inrecAndContext // including end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // including end-of-stream marker } } diff --git a/internal/pkg/transformers/reorder.go b/internal/pkg/transformers/reorder.go index 680849b46e..df1b419d75 100644 --- a/internal/pkg/transformers/reorder.go +++ b/internal/pkg/transformers/reorder.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "strings" @@ -172,62 +173,62 @@ func NewTransformerReorder( func (tr *TransformerReorder) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - tr.recordTransformerFunc(inrecAndContext, inputDownstreamDoneChannel, outputDownstreamDoneChannel, outputChannel) + tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel) } // ---------------------------------------------------------------- func (tr *TransformerReorder) reorderToStart( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record for _, fieldName := range tr.fieldNames { inrec.MoveToHead(fieldName) } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } // 
---------------------------------------------------------------- func (tr *TransformerReorder) reorderToEnd( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record for _, fieldName := range tr.fieldNames { inrec.MoveToTail(fieldName) } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } // ---------------------------------------------------------------- func (tr *TransformerReorder) reorderBefore( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record if inrec.Get(tr.beforeFieldName) == nil { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) return } @@ -267,24 +268,24 @@ func (tr *TransformerReorder) reorderBefore( for _, fieldName := range tr.fieldNames { inrec.MoveToHead(fieldName) } - outputChannel <- types.NewRecordAndContext(outrec, &inrecAndContext.Context) + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, &inrecAndContext.Context)) } else { - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } // ---------------------------------------------------------------- func (tr *TransformerReorder) reorderAfter( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record if inrec.Get(tr.afterFieldName) == nil { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) return } @@ -324,9 +325,9 @@ func (tr *TransformerReorder) reorderAfter( for _, fieldName := range tr.fieldNames { inrec.MoveToHead(fieldName) } - outputChannel <- types.NewRecordAndContext(outrec, &inrecAndContext.Context) + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, &inrecAndContext.Context)) } else { - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } diff --git a/internal/pkg/transformers/repeat.go b/internal/pkg/transformers/repeat.go index 7df62d54a4..7c58bd7b35 100644 --- a/internal/pkg/transformers/repeat.go +++ b/internal/pkg/transformers/repeat.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "strings" @@ -161,39 +162,39 @@ func NewTransformerRepeat( func (tr *TransformerRepeat) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - tr.recordTransformerFunc(inrecAndContext, 
inputDownstreamDoneChannel, outputDownstreamDoneChannel, outputChannel) + tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel) } // ---------------------------------------------------------------- func (tr *TransformerRepeat) repeatByCount( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { for i := 0; i < tr.repeatCount; i++ { - outputChannel <- types.NewRecordAndContext( + outputRecordsAndContexts.PushBack(types.NewRecordAndContext( inrecAndContext.Record.Copy(), &inrecAndContext.Context, - ) + )) } } else { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } // ---------------------------------------------------------------- func (tr *TransformerRepeat) repeatByFieldName( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { fieldValue := inrecAndContext.Record.Get(tr.repeatCountFieldName) @@ -205,13 +206,13 @@ func (tr *TransformerRepeat) repeatByFieldName( return } for i := 0; i < int(repeatCount); i++ { - outputChannel <- types.NewRecordAndContext( + outputRecordsAndContexts.PushBack(types.NewRecordAndContext( inrecAndContext.Record.Copy(), &inrecAndContext.Context, - ) + )) } } else { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } diff --git a/internal/pkg/transformers/reshape.go b/internal/pkg/transformers/reshape.go index d78861de14..45de4430f4 100644 --- a/internal/pkg/transformers/reshape.go +++ b/internal/pkg/transformers/reshape.go @@ -28,6 +28,7 @@ package transformers // 15 2009-01-05 Z 0.09719105 import ( + "container/list" "fmt" "os" "regexp" @@ -283,20 +284,20 @@ func NewTransformerReshape( func (tr *TransformerReshape) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - tr.recordTransformerFunc(inrecAndContext, inputDownstreamDoneChannel, outputDownstreamDoneChannel, outputChannel) + tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel) } // ---------------------------------------------------------------- func (tr *TransformerReshape) wideToLongNoRegex( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -315,27 +316,27 @@ func (tr *TransformerReshape) wideToLongNoRegex( } if pairs.IsEmpty() { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { for pf := pairs.Head; pf != nil; pf = pf.Next { outrec := inrec.Copy() outrec.PutReference(tr.outputKeyFieldName, types.MlrvalFromString(pf.Key)) 
outrec.PutReference(tr.outputValueFieldName, pf.Value) - outputChannel <- types.NewRecordAndContext(outrec, &inrecAndContext.Context) + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, &inrecAndContext.Context)) } } } else { - outputChannel <- inrecAndContext // emit end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // emit end-of-stream marker } } // ---------------------------------------------------------------- func (tr *TransformerReshape) wideToLongRegex( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -357,27 +358,27 @@ func (tr *TransformerReshape) wideToLongRegex( } if pairs.IsEmpty() { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { for pf := pairs.Head; pf != nil; pf = pf.Next { outrec := inrec.Copy() outrec.PutReference(tr.outputKeyFieldName, types.MlrvalFromString(pf.Key)) outrec.PutReference(tr.outputValueFieldName, pf.Value) - outputChannel <- types.NewRecordAndContext(outrec, &inrecAndContext.Context) + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, &inrecAndContext.Context)) } } } else { - outputChannel <- inrecAndContext // emit end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // emit end-of-stream marker } } // ---------------------------------------------------------------- func (tr *TransformerReshape) longToWide( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -385,7 +386,7 @@ func (tr *TransformerReshape) longToWide( splitOutKeyFieldValue := inrec.Get(tr.splitOutKeyFieldName) splitOutValueFieldValue := inrec.Get(tr.splitOutValueFieldName) if splitOutKeyFieldValue == nil || splitOutValueFieldValue == nil { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) return } @@ -429,11 +430,11 @@ func (tr *TransformerReshape) longToWide( outrec.PutReference(pg.Key, pg.Value) } - outputChannel <- types.NewRecordAndContext(outrec, &inrecAndContext.Context) + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, &inrecAndContext.Context)) } } - outputChannel <- inrecAndContext // emit end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // emit end-of-stream marker } } diff --git a/internal/pkg/transformers/sample.go b/internal/pkg/transformers/sample.go index 8efbc45c66..0ea3ce03df 100644 --- a/internal/pkg/transformers/sample.go +++ b/internal/pkg/transformers/sample.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "strings" @@ -130,9 +131,9 @@ func NewTransformerSample( func (tr *TransformerSample) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) // Not end of input stream: retain the record, and emit nothing until end of 
stream. @@ -153,13 +154,13 @@ func (tr *TransformerSample) Transform( for pe := tr.bucketsByGroup.Head; pe != nil; pe = pe.Next { sampleBucket := pe.Value.(*sampleBucketType) for i := 0; i < sampleBucket.nused; i++ { - outputChannel <- sampleBucket.recordsAndContexts[i] + outputRecordsAndContexts.PushBack(sampleBucket.recordsAndContexts[i]) } } // Emit the stream-terminating null record - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } diff --git a/internal/pkg/transformers/sec2gmt.go b/internal/pkg/transformers/sec2gmt.go index 8965d7397f..9d99e91291 100644 --- a/internal/pkg/transformers/sec2gmt.go +++ b/internal/pkg/transformers/sec2gmt.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" @@ -150,9 +151,9 @@ func NewTransformerSec2GMT( func (tr *TransformerSec2GMT) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) if !inrecAndContext.EndOfStream { @@ -170,9 +171,9 @@ func (tr *TransformerSec2GMT) Transform( } } } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { // End of record stream - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } diff --git a/internal/pkg/transformers/sec2gmtdate.go b/internal/pkg/transformers/sec2gmtdate.go index d3a3f65f38..d76ae16ef3 100644 --- a/internal/pkg/transformers/sec2gmtdate.go +++ b/internal/pkg/transformers/sec2gmtdate.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" @@ -107,9 +108,9 @@ func NewTransformerSec2GMTDate( func (tr *TransformerSec2GMTDate) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) if !inrecAndContext.EndOfStream { @@ -120,9 +121,9 @@ func (tr *TransformerSec2GMTDate) Transform( inrec.PutReference(fieldName, types.BIF_sec2gmtdate(value)) } } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { // End of record stream - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } diff --git a/internal/pkg/transformers/seqgen.go b/internal/pkg/transformers/seqgen.go index 13227bd8e3..5635016822 100644 --- a/internal/pkg/transformers/seqgen.go +++ b/internal/pkg/transformers/seqgen.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "errors" "fmt" "os" @@ -189,13 +190,11 @@ func NewTransformerSeqgen( }, nil } -// ---------------------------------------------------------------- - func (tr *TransformerSeqgen) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { counter := tr.start context := types.NewNilContext() @@ -233,10 +232,10 @@ func (tr *TransformerSeqgen) Transform( 
context.UpdateForInputRecord() outrecAndContext := types.NewRecordAndContext(outrec, context) - outputChannel <- outrecAndContext + outputRecordsAndContexts.PushBack(outrecAndContext) counter = types.BIF_plus_binary(counter, tr.step) } - outputChannel <- types.NewEndOfStreamMarker(context) + outputRecordsAndContexts.PushBack(types.NewEndOfStreamMarker(context)) } diff --git a/internal/pkg/transformers/shuffle.go b/internal/pkg/transformers/shuffle.go index 261c172dbb..97ccd07ded 100644 --- a/internal/pkg/transformers/shuffle.go +++ b/internal/pkg/transformers/shuffle.go @@ -101,9 +101,9 @@ func NewTransformerShuffle() (*TransformerShuffle, error) { func (tr *TransformerShuffle) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) // Not end of input stream: retain the record, and emit nothing until end of stream. @@ -149,10 +149,10 @@ func (tr *TransformerShuffle) Transform( // all input records have ownership transferred exactly once. So, there are no // records to copy here. for i := 0; i < n; i++ { - outputChannel <- array[images[i]] + outputRecordsAndContexts.PushBack(array[images[i]]) } // Emit the stream-terminating null record - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } diff --git a/internal/pkg/transformers/skip-trivial-records.go b/internal/pkg/transformers/skip-trivial-records.go index 69d68c01d1..1063ac6252 100644 --- a/internal/pkg/transformers/skip-trivial-records.go +++ b/internal/pkg/transformers/skip-trivial-records.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "strings" @@ -92,9 +93,9 @@ func NewTransformerSkipTrivialRecords() (*TransformerSkipTrivialRecords, error) func (tr *TransformerSkipTrivialRecords) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) if !inrecAndContext.EndOfStream { @@ -108,10 +109,10 @@ func (tr *TransformerSkipTrivialRecords) Transform( } if hasAny { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } else { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } diff --git a/internal/pkg/transformers/sort-within-records.go b/internal/pkg/transformers/sort-within-records.go index de520085b8..871a8e2f79 100644 --- a/internal/pkg/transformers/sort-within-records.go +++ b/internal/pkg/transformers/sort-within-records.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "strings" @@ -109,38 +110,38 @@ func NewTransformerSortWithinRecords( func (tr *TransformerSortWithinRecords) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - tr.recordTransformerFunc(inrecAndContext, inputDownstreamDoneChannel, 
outputDownstreamDoneChannel, outputChannel) + tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel) } // ---------------------------------------------------------------- func (tr *TransformerSortWithinRecords) transformNonrecursively( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record inrec.SortByKey() } - outputChannel <- inrecAndContext // including end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // including end-of-stream marker } // ---------------------------------------------------------------- func (tr *TransformerSortWithinRecords) transformRecursively( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record inrec.SortByKeyRecursively() } - outputChannel <- inrecAndContext // including end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // including end-of-stream marker } diff --git a/internal/pkg/transformers/sort.go b/internal/pkg/transformers/sort.go index 0e9d54e2a8..a2b90223af 100644 --- a/internal/pkg/transformers/sort.go +++ b/internal/pkg/transformers/sort.go @@ -303,9 +303,9 @@ type GroupingKeysAndMlrvals struct { func (tr *TransformerSort) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) if !inrecAndContext.EndOfStream { @@ -367,15 +367,15 @@ func (tr *TransformerSort) Transform( iRecordsInGroup := tr.recordListsByGroup.Get(groupingKeyAndMlrvals.groupingKey) recordsInGroup := iRecordsInGroup.(*list.List) for iRecord := recordsInGroup.Front(); iRecord != nil; iRecord = iRecord.Next() { - outputChannel <- iRecord.Value.(*types.RecordAndContext) + outputRecordsAndContexts.PushBack(iRecord.Value.(*types.RecordAndContext)) } } for iRecord := tr.spillGroup.Front(); iRecord != nil; iRecord = iRecord.Next() { - outputChannel <- iRecord.Value.(*types.RecordAndContext) + outputRecordsAndContexts.PushBack(iRecord.Value.(*types.RecordAndContext)) } - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } diff --git a/internal/pkg/transformers/stats1.go b/internal/pkg/transformers/stats1.go index 07373a3d7d..602b2b3578 100644 --- a/internal/pkg/transformers/stats1.go +++ b/internal/pkg/transformers/stats1.go @@ -2,6 +2,7 @@ package transformers import ( "bytes" + "container/list" "errors" "fmt" "os" @@ -360,21 +361,21 @@ func NewTransformerStats1( // the end-of-stream marker. 
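stats1 -- like sample, shuffle, and sort earlier in this patch -- is an end-of-stream emitter: it ingests records as they arrive and produces output only when the marker shows up, pushing the summary records ahead of the marker itself. The shape of that pattern under the batched API, as a self-contained sketch with toy types:

package main

import (
	"container/list"
	"fmt"
)

type RecordAndContext struct {
	EndOfStream bool
	N           int
}

type countTransformer struct{ count int }

// Hold state per data record; at end of stream, emit one summary
// record first, then the marker, so downstream stages see the
// summary before the terminator.
func (tr *countTransformer) Transform(in *RecordAndContext, outputs *list.List) {
	if !in.EndOfStream {
		tr.count++
		return
	}
	outputs.PushBack(&RecordAndContext{N: tr.count})
	outputs.PushBack(in) // end-of-stream marker
}

func main() {
	tr := &countTransformer{}
	outputs := list.New()
	for i := 0; i < 3; i++ {
		tr.Transform(&RecordAndContext{}, outputs)
	}
	tr.Transform(&RecordAndContext{EndOfStream: true}, outputs)
	fmt.Println(outputs.Front().Value.(*RecordAndContext).N) // 3
}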
func (tr *TransformerStats1) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) if !inrecAndContext.EndOfStream { - tr.handleInputRecord(inrecAndContext, outputChannel) + tr.handleInputRecord(inrecAndContext, outputRecordsAndContexts) } else { - tr.handleEndOfRecordStream(inrecAndContext, outputChannel) + tr.handleEndOfRecordStream(inrecAndContext, outputRecordsAndContexts) } } func (tr *TransformerStats1) handleInputRecord( inrecAndContext *types.RecordAndContext, - outputChannel chan<- *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext ) { inrec := inrecAndContext.Record @@ -415,7 +416,7 @@ func (tr *TransformerStats1) handleInputRecord( level2.(*lib.OrderedMap), inrec, ) - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } @@ -586,10 +587,10 @@ func (tr *TransformerStats1) matchValueFieldName( func (tr *TransformerStats1) handleEndOfRecordStream( inrecAndContext *types.RecordAndContext, - outputChannel chan<- *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext ) { if tr.doIterativeStats { - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker return } @@ -607,10 +608,10 @@ func (tr *TransformerStats1) handleEndOfRecordStream( newrec, ) - outputChannel <- types.NewRecordAndContext(newrec, &inrecAndContext.Context) + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(newrec, &inrecAndContext.Context)) } - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } func (tr *TransformerStats1) emitIntoOutputRecord( diff --git a/internal/pkg/transformers/stats2.go b/internal/pkg/transformers/stats2.go index c4d63571a7..59d771abf8 100644 --- a/internal/pkg/transformers/stats2.go +++ b/internal/pkg/transformers/stats2.go @@ -272,9 +272,9 @@ func NewTransformerStats2( func (tr *TransformerStats2) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) if !inrecAndContext.EndOfStream { @@ -283,19 +283,19 @@ func (tr *TransformerStats2) Transform( if tr.doIterativeStats { // The input record is modified in this case, with new fields appended - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } // if tr.doHoldAndFit, the input record is held by the ingestor } else { // end of record stream if !tr.doIterativeStats { // in the iterative case, already emitted per-record if tr.doHoldAndFit { - tr.fit(outputChannel) + tr.fit(outputRecordsAndContexts) } else { - tr.emit(outputChannel, &inrecAndContext.Context) + tr.emit(outputRecordsAndContexts, &inrecAndContext.Context) } } - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } @@ -389,7 +389,7 @@ func (tr *TransformerStats2) ingest( // 
---------------------------------------------------------------- func (tr *TransformerStats2) emit( - outputChannel chan<- *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext context *types.Context, ) { for pa := tr.namedAccumulators.Head; pa != nil; pa = pa.Next { @@ -422,7 +422,7 @@ func (tr *TransformerStats2) emit( } } - outputChannel <- types.NewRecordAndContext(outrec, context) + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, context)) } } @@ -440,7 +440,7 @@ func (tr *TransformerStats2) populateRecord( } func (tr *TransformerStats2) fit( - outputChannel chan<- *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext ) { for pa := tr.namedAccumulators.Head; pa != nil; pa = pa.Next { groupingKey := pa.Key @@ -475,7 +475,7 @@ func (tr *TransformerStats2) fit( } } - outputChannel <- recordAndContext + outputRecordsAndContexts.PushBack(recordAndContext) } } } diff --git a/internal/pkg/transformers/step.go b/internal/pkg/transformers/step.go index e2eeb60e6d..b71b5bcf89 100644 --- a/internal/pkg/transformers/step.go +++ b/internal/pkg/transformers/step.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "errors" "fmt" "os" @@ -235,13 +236,13 @@ func NewTransformerStep( func (tr *TransformerStep) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) if inrecAndContext.EndOfStream { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) return } @@ -252,7 +253,7 @@ func (tr *TransformerStep) Transform( // Grouping key is "s,t" groupingKey, gok := inrec.GetSelectedValuesJoined(tr.groupByFieldNames) if !gok { // current record doesn't have fields to be stepped; pass it along - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) return } @@ -302,7 +303,7 @@ func (tr *TransformerStep) Transform( } } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } // ================================================================ diff --git a/internal/pkg/transformers/tac.go b/internal/pkg/transformers/tac.go index 8f9395253a..6cbd02fc3f 100644 --- a/internal/pkg/transformers/tac.go +++ b/internal/pkg/transformers/tac.go @@ -92,9 +92,9 @@ func NewTransformerTac() (*TransformerTac, error) { func (tr *TransformerTac) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) if !inrecAndContext.EndOfStream { @@ -102,8 +102,8 @@ func (tr *TransformerTac) Transform( } else { // end of stream for e := tr.recordsAndContexts.Front(); e != nil; e = e.Next() { - outputChannel <- e.Value.(*types.RecordAndContext) + outputRecordsAndContexts.PushBack(e.Value.(*types.RecordAndContext)) } - outputChannel <- types.NewEndOfStreamMarker(&inrecAndContext.Context) + outputRecordsAndContexts.PushBack(types.NewEndOfStreamMarker(&inrecAndContext.Context)) } } diff --git a/internal/pkg/transformers/tail.go 
b/internal/pkg/transformers/tail.go index 4b369f7f38..f96b121908 100644 --- a/internal/pkg/transformers/tail.go +++ b/internal/pkg/transformers/tail.go @@ -126,9 +126,9 @@ func NewTransformerTail( func (tr *TransformerTail) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) if !inrecAndContext.EndOfStream { @@ -155,9 +155,9 @@ func (tr *TransformerTail) Transform( for outer := tr.recordListsByGroup.Head; outer != nil; outer = outer.Next { recordListForGroup := outer.Value.(*list.List) for inner := recordListForGroup.Front(); inner != nil; inner = inner.Next() { - outputChannel <- inner.Value.(*types.RecordAndContext) + outputRecordsAndContexts.PushBack(inner.Value.(*types.RecordAndContext)) } } - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } diff --git a/internal/pkg/transformers/tee.go b/internal/pkg/transformers/tee.go index f8e0b55ca1..1995d74a7a 100644 --- a/internal/pkg/transformers/tee.go +++ b/internal/pkg/transformers/tee.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "strings" @@ -167,9 +168,9 @@ func NewTransformerTee( func (tr *TransformerTee) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { // If we receive a downstream-done flag from a transformer downstream from @@ -203,7 +204,7 @@ func (tr *TransformerTee) Transform( os.Exit(1) } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { err := tr.fileOutputHandler.Close() if err != nil { @@ -215,6 +216,6 @@ func (tr *TransformerTee) Transform( fmt.Fprintln(os.Stderr, err) os.Exit(1) } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } diff --git a/internal/pkg/transformers/template.go b/internal/pkg/transformers/template.go index 2934d0427c..21a65c0881 100644 --- a/internal/pkg/transformers/template.go +++ b/internal/pkg/transformers/template.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "strings" @@ -136,9 +137,9 @@ func NewTransformerTemplate( func (tr *TransformerTemplate) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) if !inrecAndContext.EndOfStream { @@ -153,8 +154,8 @@ func (tr *TransformerTemplate) Transform( } } outrecAndContext := types.NewRecordAndContext(outrec, &inrecAndContext.Context) - outputChannel <- outrecAndContext + outputRecordsAndContexts.PushBack(outrecAndContext) } else { - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } diff --git a/internal/pkg/transformers/top.go b/internal/pkg/transformers/top.go index bbd0ba37fa..647fab2c99 100644 --- a/internal/pkg/transformers/top.go +++ b/internal/pkg/transformers/top.go @@ -1,6 +1,7 @@ package 
transformers import ( + "container/list" "fmt" "os" "strings" @@ -172,15 +173,15 @@ func NewTransformerTop( func (tr *TransformerTop) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) if !inrecAndContext.EndOfStream { tr.ingest(inrecAndContext) } else { - tr.emit(inrecAndContext, outputChannel) + tr.emit(inrecAndContext, outputRecordsAndContexts) } } @@ -237,7 +238,7 @@ func (tr *TransformerTop) ingest( // ---------------------------------------------------------------- func (tr *TransformerTop) emit( inrecAndContext *types.RecordAndContext, - outputChannel chan<- *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext ) { for pa := tr.groups.Head; pa != nil; pa = pa.Next { groupingKey := pa.Key @@ -251,7 +252,7 @@ func (tr *TransformerTop) emit( for pb := secondLevel.Head; pb != nil; pb = pb.Next { topKeeper := pb.Value.(*utils.TopKeeper) for i := 0; i < topKeeper.GetSize(); i++ { - outputChannel <- topKeeper.TopRecordsAndContexts[i].Copy() + outputRecordsAndContexts.PushBack(topKeeper.TopRecordsAndContexts[i].Copy()) } } @@ -280,10 +281,10 @@ func (tr *TransformerTop) emit( } } - outputChannel <- types.NewRecordAndContext(newrec, &inrecAndContext.Context) + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(newrec, &inrecAndContext.Context)) } } } - outputChannel <- inrecAndContext // emit end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // emit end-of-stream marker } diff --git a/internal/pkg/transformers/unflatten.go b/internal/pkg/transformers/unflatten.go index cd0876cd87..8b533f91b7 100644 --- a/internal/pkg/transformers/unflatten.go +++ b/internal/pkg/transformers/unflatten.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "strings" @@ -136,20 +137,20 @@ func NewTransformerUnflatten( func (tr *TransformerUnflatten) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - tr.recordTransformerFunc(inrecAndContext, inputDownstreamDoneChannel, outputDownstreamDoneChannel, outputChannel) + tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel) } // ---------------------------------------------------------------- func (tr *TransformerUnflatten) unflattenAll( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -158,18 +159,18 @@ func (tr *TransformerUnflatten) unflattenAll( oFlatSep = tr.options.WriterOptions.FLATSEP } inrec.Unflatten(oFlatSep) - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } // 
---------------------------------------------------------------- func (tr *TransformerUnflatten) unflattenSome( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -178,8 +179,8 @@ func (tr *TransformerUnflatten) unflattenSome( oFlatSep = tr.options.WriterOptions.FLATSEP } inrec.UnflattenFields(tr.fieldNameSet, oFlatSep) - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } diff --git a/internal/pkg/transformers/uniq.go b/internal/pkg/transformers/uniq.go index e68a55b9cc..d4f6ddd071 100644 --- a/internal/pkg/transformers/uniq.go +++ b/internal/pkg/transformers/uniq.go @@ -1,6 +1,7 @@ package transformers import ( + "container/list" "fmt" "os" "strings" @@ -337,12 +338,12 @@ func NewTransformerUniq( func (tr *TransformerUniq) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - tr.recordTransformerFunc(inrecAndContext, inputDownstreamDoneChannel, outputDownstreamDoneChannel, outputChannel) + tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel) } // ---------------------------------------------------------------- @@ -350,9 +351,9 @@ func (tr *TransformerUniq) Transform( // non-streaming, with output at end of stream. func (tr *TransformerUniq) transformUniqifyEntireRecordsShowCounts( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -373,10 +374,10 @@ func (tr *TransformerUniq) transformUniqifyEntireRecordsShowCounts( icount := tr.uniqifiedRecordCounts.Get(pe.Key) mcount := types.MlrvalFromInt(icount.(int)) outrecAndContext.Record.PrependReference(tr.outputFieldName, mcount) - outputChannel <- outrecAndContext + outputRecordsAndContexts.PushBack(outrecAndContext) } - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } @@ -386,9 +387,9 @@ func (tr *TransformerUniq) transformUniqifyEntireRecordsShowCounts( // of stream. 
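These uniq variants share one structure: ingest while the stream is live, keying an insertion-ordered map (lib.OrderedMap) by the record's string form, then emit at end of stream so output preserves first-seen order. A compact sketch of the counting core, substituting a plain map plus a slice for lib.OrderedMap:

    // Sketch: count records by string form, preserving first-seen order.
    type uniqCounter struct {
        counts map[string]int
        order  []string
    }

    func newUniqCounter() *uniqCounter {
        return &uniqCounter{counts: make(map[string]int)}
    }

    func (u *uniqCounter) ingest(recordAsString string) {
        if _, seen := u.counts[recordAsString]; !seen {
            u.order = append(u.order, recordAsString) // first occurrence
        }
        u.counts[recordAsString]++
    }

At end of stream, walking u.order and reading u.counts gives exactly the show-counts and num-distinct outputs emitted above.
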
func (tr *TransformerUniq) transformUniqifyEntireRecordsShowNumDistinctOnly( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -403,9 +404,9 @@ func (tr *TransformerUniq) transformUniqifyEntireRecordsShowNumDistinctOnly( tr.outputFieldName, types.MlrvalFromInt(tr.uniqifiedRecordCounts.FieldCount), ) - outputChannel <- types.NewRecordAndContext(outrec, &inrecAndContext.Context) + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, &inrecAndContext.Context)) - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } @@ -413,9 +414,9 @@ func (tr *TransformerUniq) transformUniqifyEntireRecordsShowNumDistinctOnly( // Print each unique record only once (on first occurrence). func (tr *TransformerUniq) transformUniqifyEntireRecords( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -423,21 +424,21 @@ func (tr *TransformerUniq) transformUniqifyEntireRecords( recordAsString := inrec.String() if !tr.uniqifiedRecordCounts.Has(recordAsString) { tr.uniqifiedRecordCounts.Put(recordAsString, 1) - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } } else { // end of record stream - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } // ---------------------------------------------------------------- func (tr *TransformerUniq) transformUnlashed( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -479,20 +480,20 @@ func (tr *TransformerUniq) transformUnlashed( tr.unlashedCountValues.Get(fieldName).(*lib.OrderedMap).Get(fieldValueString).(*types.Mlrval), ) outrec.PutReference("count", types.MlrvalFromInt(pf.Value.(int))) - outputChannel <- types.NewRecordAndContext(outrec, &inrecAndContext.Context) + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, &inrecAndContext.Context)) } } - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } // ---------------------------------------------------------------- func (tr *TransformerUniq) transformNumDistinctOnly( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -513,18 +514,18 @@ func (tr *TransformerUniq) transformNumDistinctOnly( "count", types.MlrvalFromInt(tr.countsByGroup.FieldCount), ) - outputChannel <- types.NewRecordAndContext(outrec, &inrecAndContext.Context) + 
outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, &inrecAndContext.Context)) - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } // ---------------------------------------------------------------- func (tr *TransformerUniq) transformWithCounts( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -557,19 +558,19 @@ func (tr *TransformerUniq) transformWithCounts( types.MlrvalFromInt(pa.Value.(int)), ) } - outputChannel <- types.NewRecordAndContext(outrec, &inrecAndContext.Context) + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, &inrecAndContext.Context)) } - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } // ---------------------------------------------------------------- func (tr *TransformerUniq) transformWithoutCounts( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -592,13 +593,13 @@ func (tr *TransformerUniq) transformWithoutCounts( ) } - outputChannel <- types.NewRecordAndContext(outrec, &inrecAndContext.Context) + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, &inrecAndContext.Context)) } else { tr.countsByGroup.Put(groupingKey, iCount.(int)+1) } } else { // end of record stream - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } diff --git a/internal/pkg/transformers/unsparsify.go b/internal/pkg/transformers/unsparsify.go index a3f740908f..caaa3301a9 100644 --- a/internal/pkg/transformers/unsparsify.go +++ b/internal/pkg/transformers/unsparsify.go @@ -145,20 +145,20 @@ func NewTransformerUnsparsify( func (tr *TransformerUnsparsify) Transform( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - tr.recordTransformerFunc(inrecAndContext, inputDownstreamDoneChannel, outputDownstreamDoneChannel, outputChannel) + tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel) } // ---------------------------------------------------------------- func (tr *TransformerUnsparsify) transformNonStreaming( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -184,19 +184,19 @@ func (tr *TransformerUnsparsify) transformNonStreaming( } } - outputChannel <- types.NewRecordAndContext(newrec, &outrecAndContext.Context) + 
outputRecordsAndContexts.PushBack(types.NewRecordAndContext(newrec, &outrecAndContext.Context)) } - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } // ---------------------------------------------------------------- func (tr *TransformerUnsparsify) transformStreaming( inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext inputDownstreamDoneChannel <-chan bool, outputDownstreamDoneChannel chan<- bool, - outputChannel chan<- *types.RecordAndContext, ) { if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record @@ -207,9 +207,9 @@ func (tr *TransformerUnsparsify) transformStreaming( } } - outputChannel <- inrecAndContext + outputRecordsAndContexts.PushBack(inrecAndContext) } else { - outputChannel <- inrecAndContext // end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } diff --git a/internal/pkg/transformers/utils/join-bucket-keeper.go b/internal/pkg/transformers/utils/join-bucket-keeper.go index 6af1e2bd94..5132ec2a7c 100644 --- a/internal/pkg/transformers/utils/join-bucket-keeper.go +++ b/internal/pkg/transformers/utils/join-bucket-keeper.go @@ -529,7 +529,7 @@ func (keeper *JoinBucketKeeper) markRemainingsAsUnpaired() { // ---------------------------------------------------------------- // TODO: comment func (keeper *JoinBucketKeeper) OutputAndReleaseLeftUnpaireds( - outputChannel chan<- *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext ) { for { element := keeper.leftUnpaireds.Front() @@ -537,13 +537,13 @@ func (keeper *JoinBucketKeeper) OutputAndReleaseLeftUnpaireds( break } recordAndContext := element.Value.(*types.RecordAndContext) - outputChannel <- recordAndContext + outputRecordsAndContexts.PushBack(recordAndContext) keeper.leftUnpaireds.Remove(element) } } func (keeper *JoinBucketKeeper) ReleaseLeftUnpaireds( - outputChannel chan<- *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext ) { for { element := keeper.leftUnpaireds.Front() diff --git a/internal/pkg/types/mlrmap.go b/internal/pkg/types/mlrmap.go index 0d04495178..683792559c 100644 --- a/internal/pkg/types/mlrmap.go +++ b/internal/pkg/types/mlrmap.go @@ -59,6 +59,7 @@ package types // default-on pending more profiling on more complex record-processing operations // such as mlr sort. var hashRecords = true + func HashRecords(onOff bool) { hashRecords = onOff } diff --git a/todo.txt b/todo.txt index 816c24decf..0a8e0497e8 100644 --- a/todo.txt +++ b/todo.txt @@ -26,10 +26,20 @@ PUNCHDOWN LIST - make a 2nd/3rd cmd main w/ simple model & tweak that o dkvp-reader factor-out ... o mods: + - downstreamDone batchify + > head + - writes + > seqgen + - reads and writes + - might involve batch-size somehow -- ? + > tee + - reads + > everything else + - reads and writes ? outputChannel -> *list.List at each transformer -- ? profile first ? readerChannel length 1 or 2 ? - ? cli option for records per batch ? experiment again with hashed/unhashed -- mlr sort etc + - checklist channelize all record-reader types - do and maybe keep? record-reader return (raclist, err) & refactor repl accordingly > needs factor for-loop to stateful so maybe not - transformers w/ reclist: *maybe*, but idchan/odchan too ... 
invest time after some refactor decisions made From 0335c4e70ff111762275cce6cafb5a59c52516b3 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Wed, 8 Dec 2021 20:18:01 -0500 Subject: [PATCH 09/28] foo --- Makefile | 4 +- cmd/mlr/main.go | 8 ++- go.mod | 2 + go.sum | 2 + internal/pkg/climain/mlrcli_parse.go | 3 + internal/pkg/input/record_reader_dkvp.go | 89 +++++++++++++++++++++++- internal/pkg/stream/stream.go | 24 ++++--- todo.txt | 2 + 8 files changed, 122 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index b2868ab856..47c0ef76ff 100644 --- a/Makefile +++ b/Makefile @@ -73,6 +73,8 @@ so: install sure: build check mlr: go build github.com/johnkerl/miller/cmd/mlr +mprof: + go build github.com/johnkerl/miller/cmd/mprof # Please see comments in ./create-release-tarball as well as # https://miller.readthedocs.io/en/latest/build/#creating-a-new-release-for-developers @@ -80,4 +82,4 @@ release_tarball: build check ./create-release-tarball # Go does its own dependency management, outside of make. -.PHONY: build mlr check unit_test regression_test fmt dev +.PHONY: build mlr mprof check unit_test regression_test fmt dev diff --git a/cmd/mlr/main.go b/cmd/mlr/main.go index 71f0de6823..87505f1b61 100644 --- a/cmd/mlr/main.go +++ b/cmd/mlr/main.go @@ -9,6 +9,7 @@ import ( "runtime/pprof" "strconv" + "github.com/pkg/profile" // for trace.out "github.com/johnkerl/miller/internal/pkg/entrypoint" ) @@ -59,7 +60,12 @@ func main() { defer pprof.StopCPUProfile() fmt.Fprintf(os.Stderr, "CPU profile started.\n") - defer fmt.Fprintf(os.Stderr, "CPU profile finished.\n") + defer fmt.Fprintf(os.Stderr, "CPU profile finished: go tool pprof -http=:8080 %s\n", profFilename) + } + + if len(os.Args) >= 3 && os.Args[1] == "--traceprofile" { + defer profile.Start(profile.TraceProfile, profile.ProfilePath(".")).Stop() + defer fmt.Fprintf(os.Stderr, "go tool trace trace.out\n") + } // This will obtain os.Args and go from there. All the usual contents of diff --git a/go.mod b/go.mod index cd4683f9a3..6348401c68 100644 --- a/go.mod +++ b/go.mod @@ -1,4 +1,5 @@ module github.com/johnkerl/miller + // The repo is 'miller' and the executable is 'mlr', going back many years and // predating the Go port.
// @@ -16,6 +17,7 @@ require ( github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 github.com/lestrrat-go/strftime v1.0.4 github.com/mattn/go-isatty v0.0.12 + github.com/pkg/profile v1.6.0 // indirect golang.org/x/sys v0.0.0-20210326220804-49726bf1d181 golang.org/x/term v0.0.0-20201210144234-2321bbc49cbf ) diff --git a/go.sum b/go.sum index af8117a912..04746f2e16 100644 --- a/go.sum +++ b/go.sum @@ -12,6 +12,8 @@ github.com/mattn/go-isatty v0.0.12 h1:wuysRhFDzyxgEmMf5xjvJ2M9dZoWAXNNr5LSBS7uHX github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU= github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/profile v1.6.0 h1:hUDfIISABYI59DyeB3OTay/HxSRwTQ8rB/H83k6r5dM= +github.com/pkg/profile v1.6.0/go.mod h1:qBsxPvzyUincmltOk6iyRVxHYg4adc0OFOv72ZdLa18= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= diff --git a/internal/pkg/climain/mlrcli_parse.go b/internal/pkg/climain/mlrcli_parse.go index bf9f560b65..8b61bbd5cb 100644 --- a/internal/pkg/climain/mlrcli_parse.go +++ b/internal/pkg/climain/mlrcli_parse.go @@ -133,6 +133,9 @@ func parseCommandLinePassOne( // Already handled in main(); ignore here, and don't send it to pass two. cli.CheckArgCount(args, argi, argc, 1) argi += 2 + } else if args[argi] == "--traceprofile" { + // Already handled in main(); ignore here, and don't send it to pass two. + argi += 1 } else if args[argi] == "--version" { // Exiting flag: handle it immediately. fmt.Printf("mlr %s\n", version.STRING) diff --git a/internal/pkg/input/record_reader_dkvp.go b/internal/pkg/input/record_reader_dkvp.go index e02ff8ccff..7c05d3d96e 100644 --- a/internal/pkg/input/record_reader_dkvp.go +++ b/internal/pkg/input/record_reader_dkvp.go @@ -70,7 +70,7 @@ func (reader *RecordReaderDKVP) processHandle( filename string, context *types.Context, readerChannel chan<- *list.List, - errorChannel chan error, + errorChannel chan<- error, downstreamDoneChannel <-chan bool, // for mlr head ) { context.UpdateForStartOfFile(filename) @@ -104,7 +104,7 @@ func provideChannelizedLines( // See if downstream processors will be ignoring further data (e.g. mlr // head). If so, stop reading. This makes 'mlr head hugefile' exit // quickly, as it should. - if i&recordsPerBatch == 0 { + if i%recordsPerBatch == 0 { select { case _ = <-downstreamDoneChannel: done = true @@ -122,6 +122,91 @@ func provideChannelizedLines( close(linesChannel) // end-of-stream marker } +//// TODO: productionalize this for the case no-head -- if profiling shows it to be worthwhile +//// TODO: comment +//func provideChannelizedLines( +// lineScanner *bufio.Scanner, +// linesChannel chan<- *list.List, +// downstreamDoneChannel <-chan bool, // for mlr head +// recordsPerBatch int, +//) { +// i := 0 +// done := false +// +// lines := list.New() +// +// for !done && lineScanner.Scan() { +// i++ +// +// lines.PushBack(lineScanner.Text()) +// +// // See if downstream processors will be ignoring further data (e.g. mlr +// // head). If so, stop reading. This makes 'mlr head hugefile' exit +// // quickly, as it should. 
+// if i%recordsPerBatch == 0 { +// select { +// case _ = <-downstreamDoneChannel: +// done = true +// break +// default: +// break +// } +// if done { +// break +// } +// linesChannel <- lines +// lines = list.New() +// } +// +// //linesChannel <- lineScanner.Text() +// } +// linesChannel <- lines +// close(linesChannel) // end-of-stream marker +//} + +//// TODO: productionalize this for the case no-head -- if profiling shows it to be worthwhile +//// TODO: comment copiously we're trying to handle slow/fast/short/long +//// reads: tail -f, smallfile, bigfile. +//func (reader *RecordReaderDKVP) getRecordBatch( +// linesChannel <-chan *list.List, +// maxBatchSize int, +// context *types.Context, +//) ( +// recordsAndContexts *list.List, +// eof bool, +//) { +// //fmt.Printf("GRB ENTER\n") +// recordsAndContexts = list.New() +// +// lines, more := <-linesChannel +// if !more { +// return recordsAndContexts, true +// } +// +// for e := lines.Front(); e != nil; e = e.Next() { +// line := e.Value.(string) +// +// // Check for comments-in-data feature +// if strings.HasPrefix(line, reader.readerOptions.CommentString) { +// if reader.readerOptions.CommentHandling == cli.PassComments { +// recordsAndContexts.PushBack(types.NewOutputStringList(line+"\n", context)) +// continue +// } else if reader.readerOptions.CommentHandling == cli.SkipComments { +// continue +// } +// // else comments are data +// } +// +// record := reader.recordFromDKVPLine(line) +// context.UpdateForInputRecord() +// recordAndContext := types.NewRecordAndContext(record, context) +// recordsAndContexts.PushBack(recordAndContext) +// } +// +// //fmt.Printf("GRB EXIT\n") +// return recordsAndContexts, false +//} + // TODO: comment copiously we're trying to handle slow/fast/short/long // reads: tail -f, smallfile, bigfile. func (reader *RecordReaderDKVP) getRecordBatch( diff --git a/internal/pkg/stream/stream.go b/internal/pkg/stream/stream.go index 8b1b093e76..93e4b168df 100644 --- a/internal/pkg/stream/stream.go +++ b/internal/pkg/stream/stream.go @@ -69,25 +69,33 @@ func Stream( // We're done when a fatal error is registered on input (file not found, // etc) or when the record-writer has written all its output. We use // channels to communicate both of these conditions. - errorChannel := make(chan error, 1) - doneWritingChannel := make(chan bool, 1) + errorChannel := make(chan error, 0) + doneWritingChannel := make(chan bool, 0) // For mlr head, so a transformer can communicate it will disregard all // further input. It writes this back upstream, and that is passed back to // the record-reader which then stops reading input. This is necessary to // get quick response from, for example, mlr head -n 10 on input files with // millions or billions of records. - readerDownstreamDoneChannel := make(chan bool, 1) + readerDownstreamDoneChannel := make(chan bool, 0) // Start the reader, transformer, and writer. Let them run until fatal input // error or end-of-processing happens. 
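Distilling the reader-side work above: lines are accumulated into a *list.List and shipped once per batch, and the downstream-done channel is polled without blocking at batch boundaries. This is also why the one-character fix from i&recordsPerBatch (bitwise AND, firing on an arbitrary schedule) to i%recordsPerBatch (modulus, firing once per batch) matters. A sketch combining the two, assuming lineScanner, linesChannel, downstreamDoneChannel, and recordsPerBatch as in those hunks:

    // Sketch: batched channel sends plus a non-blocking done-poll per batch.
    lines := list.New()
    done := false
    for !done && lineScanner.Scan() {
        lines.PushBack(lineScanner.Text())
        if lines.Len() >= recordsPerBatch {
            select {
            case <-downstreamDoneChannel:
                done = true // e.g. mlr head downstream needs no more input
            default:
                // no signal; keep reading
            }
            if done {
                break
            }
            linesChannel <- lines
            lines = list.New()
        }
    }
    if lines.Len() > 0 {
        linesChannel <- lines // final short batch
    }
    close(linesChannel)

Batching amortizes channel synchronization across recordsPerBatch lines, while the per-batch poll still lets 'mlr head hugefile' shut the reader down promptly.
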
bufferedOutputStream := bufio.NewWriter(outputStream) - go recordReader.Read(fileNames, *initialContext, readerChannel, errorChannel, readerDownstreamDoneChannel) - go transformers.ChainTransformer(readerChannel, readerDownstreamDoneChannel, recordTransformers, - writerChannel, options) - go output.ChannelWriter(writerChannel, recordWriter, &options.WriterOptions, doneWritingChannel, - bufferedOutputStream, outputIsStdout) + if os.Getenv("MLR_BYPASS_CHAIN") == "true" { + // TODO: comment: for profiling + fmt.Fprintln(os.Stderr, "EXPERIMENTAL CHAIN BYPASS") + go recordReader.Read(fileNames, *initialContext, readerChannel, errorChannel, readerDownstreamDoneChannel) + go output.ChannelWriter(readerChannel, recordWriter, &options.WriterOptions, doneWritingChannel, + bufferedOutputStream, outputIsStdout) + } else { + go recordReader.Read(fileNames, *initialContext, readerChannel, errorChannel, readerDownstreamDoneChannel) + go transformers.ChainTransformer(readerChannel, readerDownstreamDoneChannel, recordTransformers, + writerChannel, options) + go output.ChannelWriter(writerChannel, recordWriter, &options.WriterOptions, doneWritingChannel, + bufferedOutputStream, outputIsStdout) + } done := false for !done { diff --git a/todo.txt b/todo.txt index 0a8e0497e8..8ed729f574 100644 --- a/todo.txt +++ b/todo.txt @@ -36,9 +36,11 @@ PUNCHDOWN LIST - reads > everything else - reads and writes + ! fix mlr head -n 1 ? outputChannel -> *list.List at each transformer -- ? profile first ? readerChannel length 1 or 2 ? ? experiment again with hashed/unhashed -- mlr sort etc + ? coalesce errchan & done-writing w/ Err to RAC, and close-chan *and* EOSMarker -- ? - checklist channelize all record-reader types - do and maybe keep? record-reader return (raclist, err) & refactor repl accordingly > needs factor for-loop to stateful so maybe not From 7f1aced97bbdc651d7f9fd1d38b79f1fb9792b48 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Wed, 8 Dec 2021 20:23:37 -0500 Subject: [PATCH 10/28] cmd/mprof and cmd/mprof2 --- Makefile | 4 +- cmd/mprof/main.go | 679 +++++++++++++++++++++++++++++++++++++++++++++ cmd/mprof2/main.go | 382 +++++++++++++++++++++++++ 3 files changed, 1064 insertions(+), 1 deletion(-) create mode 100644 cmd/mprof/main.go create mode 100644 cmd/mprof2/main.go diff --git a/Makefile b/Makefile index 47c0ef76ff..0362f95b4b 100644 --- a/Makefile +++ b/Makefile @@ -75,6 +75,8 @@ mlr: go build github.com/johnkerl/miller/cmd/mlr mprof: go build github.com/johnkerl/miller/cmd/mprof +mprof2: + go build github.com/johnkerl/miller/cmd/mprof2 # Please see comments in ./create-release-tarball as well as # https://miller.readthedocs.io/en/latest/build/#creating-a-new-release-for-developers @@ -82,4 +84,4 @@ release_tarball: build check ./create-release-tarball # Go does its own dependency management, outside of make. -.PHONY: build mlr mprof check unit_test regression_test fmt dev +.PHONY: build mlr mprof mprof2 check unit_test regression_test fmt dev diff --git a/cmd/mprof/main.go b/cmd/mprof/main.go new file mode 100644 index 0000000000..1bd60daabf --- /dev/null +++ b/cmd/mprof/main.go @@ -0,0 +1,679 @@ +// Experiments in performance/profiling. 
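stream.go's main-goroutine loop here, and the harness Stream() below, finish with the same supervision idiom: block on a select until either the reader posts a fatal error (file not found, etc.) or the writer signals that all output is written. In isolation, with the channel names used throughout:

    // Sketch: supervise the pipeline until fatal error or end of output.
    done := false
    for !done {
        select {
        case err := <-errorChannel:
            fmt.Fprintln(os.Stderr, "mlr:", err)
            os.Exit(1)
        case <-doneWritingChannel:
            done = true
        }
    }
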
+package main + +import ( + "bufio" + "container/list" + "fmt" + "io" + "os" + "runtime" + "runtime/debug" + "runtime/pprof" + "strconv" + "strings" + //"time" + + "github.com/pkg/profile" // for trace.out + + "github.com/johnkerl/miller/internal/pkg/cli" + "github.com/johnkerl/miller/internal/pkg/input" + "github.com/johnkerl/miller/internal/pkg/lib" + "github.com/johnkerl/miller/internal/pkg/types" +) + +func main() { + + // Respect env $GOMAXPROCS, if provided, else set default. + haveSetGoMaxProcs := false + goMaxProcsString := os.Getenv("GOMAXPROCS") + if goMaxProcsString != "" { + goMaxProcs, err := strconv.Atoi(goMaxProcsString) + if err != nil { + runtime.GOMAXPROCS(goMaxProcs) + haveSetGoMaxProcs = true + } + } + if !haveSetGoMaxProcs { + // As of Go 1.16 this is the default anyway. For 1.15 and below we need + // to explicitly set this. + runtime.GOMAXPROCS(runtime.NumCPU()) + } + + debug.SetGCPercent(500) // Empirical: See README-profiling.md + + if os.Getenv("MPROF_PPROF") != "" { + // profiling with cpu.pprof and go tool pprof -http=:8080 cpu.pprof + profFilename := "cpu.pprof" + handle, err := os.Create(profFilename) + if err != nil { + fmt.Fprintln(os.Stderr, os.Args[0], ": ", "Could not start CPU profile: ", err) + return + } + defer handle.Close() + + if err := pprof.StartCPUProfile(handle); err != nil { + fmt.Fprintln(os.Stderr, os.Args[0], ": ", "Could not start CPU profile: ", err) + return + } + defer pprof.StopCPUProfile() + + fmt.Fprintf(os.Stderr, "CPU profile started.\n") + fmt.Fprintf(os.Stderr, "go tool pprof -http=:8080 cpu.pprof\n") + defer fmt.Fprintf(os.Stderr, "CPU profile finished.\n") + } + + if os.Getenv("MPROF_TRACE") != "" { + // tracing with trace.out and go tool trace trace.out + fmt.Fprintf(os.Stderr, "go tool trace trace.out\n") + defer profile.Start(profile.TraceProfile, profile.ProfilePath(".")).Stop() + } + + options := cli.DefaultOptions() + if os.Getenv("MPROF_JIT") != "" { + fmt.Fprintf(os.Stderr, "JIT ON\n") + types.SetInferrerStringOnly() + } else { + fmt.Fprintf(os.Stderr, "JIT OFF\n") + } + + filenames := os.Args[1:] + lib.InternalCodingErrorIf(len(filenames) != 1) + filename := filenames[0] + + err := Stream(filename, options, os.Stdout) + if err != nil { + fmt.Fprintf(os.Stderr, "mlr: %v.\n", err) + os.Exit(1) + } +} + +func getBatchSize() int { + m := 1 + sm := os.Getenv("MPROF_BATCH") + if sm != "" { + im, err := strconv.ParseInt(sm, 0, 64) + if err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } + m = int(im) + } + fmt.Fprintf(os.Stderr, "IBATCH %d\n", m) + return m +} + +// ================================================================ +type IRecordReader interface { + Read(ioChannel chan<- *list.List) error +} + +func Stream( + filename string, + options *cli.TOptions, + outputStream io.WriteCloser, +) error { + initialContext := types.NewContext() + + // Instantiate the record-reader + var recordReader IRecordReader + var err error + if os.Getenv("MPROF_PIPE") != "" { + fmt.Fprintf(os.Stderr, "PIPELINE ON\n") + recordReader, err = NewRecordReaderDKVPListPipelined(&options.ReaderOptions, filename, initialContext) + } else if os.Getenv("MPROF_CHAN") != "" { + fmt.Fprintf(os.Stderr, "CHAN ON\n") + recordReader, err = NewRecordReaderDKVPChanPipelined(&options.ReaderOptions, filename, initialContext) + } else { + fmt.Fprintf(os.Stderr, "PIPELINE OFF\n") + recordReader, err = NewRecordReaderDKVPNonPipelined(&options.ReaderOptions, filename, initialContext) + } + if err != nil { + return err + } + + // 
Instantiate the record-writer + recordWriter, err := NewRecordWriterDKVP2(&options.WriterOptions) + if err != nil { + return err + } + + ostream := bufio.NewWriter(os.Stdout) + defer ostream.Flush() + + ioChannel := make(chan *list.List, 1) + errorChannel := make(chan error, 1) + doneWritingChannel := make(chan bool, 1) + + go recordReader.Read(ioChannel) + go ChannelWriter(ioChannel, recordWriter, doneWritingChannel, ostream) + + done := false + for !done { + select { + case err := <-errorChannel: + ////fmt.Fprintf(os.Stderr, "ECHAN READ\n") + fmt.Fprintln(os.Stderr, "mlr", ": ", err) + os.Exit(1) + case _ = <-doneWritingChannel: + ////fmt.Fprintf(os.Stderr, "ZCHAN READ\n") + done = true + break + } + } + + return nil +} + +// ================================================================ + +type RecordReaderDKVPNonPipelined struct { + readerOptions *cli.TReaderOptions + filename string + initialContext *types.Context +} + +func NewRecordReaderDKVPNonPipelined( + readerOptions *cli.TReaderOptions, + filename string, + initialContext *types.Context, +) (*RecordReaderDKVPNonPipelined, error) { + return &RecordReaderDKVPNonPipelined{ + readerOptions: readerOptions, + filename: filename, + initialContext: initialContext, + }, nil +} + +func (reader *RecordReaderDKVPNonPipelined) Read( + inputChannel chan<- *list.List, +) error { + handle, err := lib.OpenFileForRead( + reader.filename, + reader.readerOptions.Prepipe, + reader.readerOptions.PrepipeIsRaw, + reader.readerOptions.FileInputEncoding, + ) + if err != nil { + return err + } else { + reader.processHandle(handle, reader.filename, reader.initialContext, inputChannel) + handle.Close() + } + + return nil +} + +// ---------------------------------------------------------------- + +func (reader *RecordReaderDKVPNonPipelined) processHandle( + handle io.Reader, + filename string, + context *types.Context, + inputChannel chan<- *list.List, +) { + context.UpdateForStartOfFile(filename) + scanner := input.NewLineScanner(handle, reader.readerOptions.IRS) + + recordsAndContexts := list.New() + + m := getBatchSize() + i := 0 + for scanner.Scan() { + i += 1 + + line := scanner.Text() + + record := reader.recordFromDKVPLine(line) + context.UpdateForInputRecord() + recordAndContext := types.NewRecordAndContext(record, context) + recordsAndContexts.PushBack(recordAndContext) + + if i%m == 0 { + inputChannel <- recordsAndContexts + recordsAndContexts = list.New() + } + + } + if recordsAndContexts.Len() > 0 { + inputChannel <- recordsAndContexts + } + inputChannel <- nil // end-of-stream marker +} + +func (reader *RecordReaderDKVPNonPipelined) recordFromDKVPLine( + line string, +) *types.Mlrmap { + record := types.NewMlrmap() + + var pairs []string + if reader.readerOptions.IFSRegex == nil { // e.g. --no-ifs-regex + pairs = lib.SplitString(line, reader.readerOptions.IFS) + } else { + pairs = lib.RegexSplitString(reader.readerOptions.IFSRegex, line, -1) + } + + for i, pair := range pairs { + var kv []string + if reader.readerOptions.IPSRegex == nil { // e.g. --no-ips-regex + kv = strings.SplitN(line, reader.readerOptions.IPS, 2) + } else { + kv = lib.RegexSplitString(reader.readerOptions.IPSRegex, pair, 2) + } + + if len(kv) == 0 { + // Ignore. This is expected when splitting with repeated IFS. + } else if len(kv) == 1 { + // E.g the pair has no equals sign: "a" rather than "a=1" or + // "a=". Here we use the positional index as the key. This way + // DKVP is a generalization of NIDX. 
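This parse is what makes DKVP a generalization of NIDX: the line splits on IFS into pairs, each pair splits on IPS, and a bare pair falls back to its 1-up position as its key, so a=1,b=2,hello parses as a=1, b=2, 3=hello. A standalone sketch of the same logic, with a plain map standing in for *types.Mlrmap (which additionally preserves key order), and with the IPS split applied per pair rather than to the whole line:

    // Sketch: DKVP line parse with positional fallback for bare fields.
    func parseDKVP(line, ifs, ips string) map[string]string {
        record := make(map[string]string)
        for i, pair := range strings.Split(line, ifs) {
            kv := strings.SplitN(pair, ips, 2)
            if len(kv) == 1 {
                record[strconv.Itoa(i+1)] = kv[0] // 1-up positional key
            } else {
                record[kv[0]] = kv[1]
            }
        }
        return record
    }
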
+ key := strconv.Itoa(i + 1) // Miller userspace indices are 1-up + value := types.MlrvalFromInferredTypeForDataFiles(kv[0]) + record.PutReference(key, value) + } else { + key := kv[0] + value := types.MlrvalFromInferredTypeForDataFiles(kv[1]) + record.PutReference(key, value) + } + } + return record +} + +// ================================================================ + +type RecordReaderDKVPListPipelined struct { + readerOptions *cli.TReaderOptions + filename string + initialContext *types.Context +} + +func NewRecordReaderDKVPListPipelined( + readerOptions *cli.TReaderOptions, + filename string, + initialContext *types.Context, +) (*RecordReaderDKVPListPipelined, error) { + return &RecordReaderDKVPListPipelined{ + readerOptions: readerOptions, + filename: filename, + initialContext: initialContext, + }, nil +} + +func (reader *RecordReaderDKVPListPipelined) Read( + inputChannel chan<- *list.List, +) error { + handle, err := lib.OpenFileForRead( + reader.filename, + reader.readerOptions.Prepipe, + reader.readerOptions.PrepipeIsRaw, + reader.readerOptions.FileInputEncoding, + ) + if err != nil { + return err + } else { + reader.processHandle(handle, reader.filename, reader.initialContext, inputChannel) + handle.Close() + } + + eom := types.NewEndOfStreamMarker(reader.initialContext) + leom := list.New() + leom.PushBack(eom) + inputChannel <- leom + ////fmt.Fprintf(os.Stderr, "IOCHAN WRITE EOM\n") + return nil +} + +func lineProvider( + lineScanner *bufio.Scanner, + linesChannel chan<- *list.List, +) { + + lines := list.New() + + m := getBatchSize() + i := 0 + for lineScanner.Scan() { + i += 1 + line := lineScanner.Text() + lines.PushBack(line) + if i%m == 0 { + linesChannel <- lines + lines = list.New() + } + } + if lines.Len() > 0 { + linesChannel <- lines + } + linesChannel <- nil // end-of-stream marker +} + +func (reader *RecordReaderDKVPListPipelined) processHandle( + handle io.Reader, + filename string, + context *types.Context, + inputChannel chan<- *list.List, +) { + context.UpdateForStartOfFile(filename) + + lineScanner := input.NewLineScanner(handle, reader.readerOptions.IRS) + linesChannel := make(chan *list.List, 1) + + go lineProvider(lineScanner, linesChannel) + + for { + lines := <-linesChannel + if lines == nil { + break + } + recordsAndContexts := list.New() + for e := lines.Front(); e != nil; e = e.Next() { + line := e.Value.(string) + record := reader.recordFromDKVPLine(line) + context.UpdateForInputRecord() + recordAndContext := types.NewRecordAndContext(record, context) + recordsAndContexts.PushBack(recordAndContext) + } + inputChannel <- recordsAndContexts + } +} + +func (reader *RecordReaderDKVPListPipelined) recordFromDKVPLine( + line string, +) *types.Mlrmap { + record := types.NewMlrmap() + + var pairs []string + if reader.readerOptions.IFSRegex == nil { // e.g. --no-ifs-regex + pairs = lib.SplitString(line, reader.readerOptions.IFS) + } else { + pairs = lib.RegexSplitString(reader.readerOptions.IFSRegex, line, -1) + } + + for i, pair := range pairs { + var kv []string + if reader.readerOptions.IPSRegex == nil { // e.g. --no-ips-regex + kv = strings.SplitN(line, reader.readerOptions.IPS, 2) + } else { + kv = lib.RegexSplitString(reader.readerOptions.IPSRegex, pair, 2) + } + + if len(kv) == 0 { + // Ignore. This is expected when splitting with repeated IFS. + } else if len(kv) == 1 { + // E.g the pair has no equals sign: "a" rather than "a=1" or + // "a=". Here we use the positional index as the key. This way + // DKVP is a generalization of NIDX. 
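Note the end-of-stream convention in each pipelined Read above: rather than closing the records channel, the reader sends one final singleton batch holding an end-of-stream marker, so the closing context (filename, final record counts) travels downstream with it:

    // Sketch: terminate with a marker batch instead of close(inputChannel).
    eom := types.NewEndOfStreamMarker(reader.initialContext)
    leom := list.New()
    leom.PushBack(eom)
    inputChannel <- leom

On the consuming side, ChannelWriter treats the marker as its cue to let stateful record-writers drain and then to signal done.
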
+ key := strconv.Itoa(i + 1) // Miller userspace indices are 1-up + value := types.MlrvalFromInferredTypeForDataFiles(kv[0]) + record.PutReference(key, value) + } else { + key := kv[0] + value := types.MlrvalFromInferredTypeForDataFiles(kv[1]) + record.PutReference(key, value) + } + } + return record +} + +// ================================================================ + +type RecordReaderDKVPChanPipelined struct { + readerOptions *cli.TReaderOptions + filename string + initialContext *types.Context +} + +func NewRecordReaderDKVPChanPipelined( + readerOptions *cli.TReaderOptions, + filename string, + initialContext *types.Context, +) (*RecordReaderDKVPChanPipelined, error) { + return &RecordReaderDKVPChanPipelined{ + readerOptions: readerOptions, + filename: filename, + initialContext: initialContext, + }, nil +} + +func (reader *RecordReaderDKVPChanPipelined) Read( + inputChannel chan<- *list.List, +) error { + handle, err := lib.OpenFileForRead( + reader.filename, + reader.readerOptions.Prepipe, + reader.readerOptions.PrepipeIsRaw, + reader.readerOptions.FileInputEncoding, + ) + if err != nil { + return err + } else { + reader.processHandle(handle, reader.filename, reader.initialContext, inputChannel) + handle.Close() + } + + eom := types.NewEndOfStreamMarker(reader.initialContext) + leom := list.New() + leom.PushBack(eom) + inputChannel <- leom + ////fmt.Fprintf(os.Stderr, "IOCHAN WRITE EOM\n") + return nil +} + +func chanProvider( + lineScanner *bufio.Scanner, + linesChannel chan<- string, +) { + for lineScanner.Scan() { + linesChannel <- lineScanner.Text() + } + close(linesChannel) // end-of-stream marker +} + +// TODO: comment copiously we're trying to handle slow/fast/short/long +// reads: tail -f, smallfile, bigfile. +func (reader *RecordReaderDKVPChanPipelined) getRecordBatch( + linesChannel <-chan string, + maxBatchSize int, + context *types.Context, +) ( + recordsAndContexts *list.List, + eof bool, +) { + //fmt.Printf("GRB ENTER\n") + recordsAndContexts = list.New() + eof = false + + for i := 0; i < maxBatchSize; i++ { + //fmt.Fprintf(os.Stderr, "-- %d/%d %d/%d\n", i, maxBatchSize, len(linesChannel), cap(linesChannel)) + if len(linesChannel) == 0 && i > 0 { + //fmt.Println(" .. BREAK") + break + } + //fmt.Println(" .. B:BLOCK") + line, more := <-linesChannel + //fmt.Printf(" .. E:BLOCK <<%s>> %v\n", line, more) + if !more { + eof = true + break + } + + record := reader.recordFromDKVPLine(line) + context.UpdateForInputRecord() + recordAndContext := types.NewRecordAndContext(record, context) + recordsAndContexts.PushBack(recordAndContext) + } + + //fmt.Printf("GRB EXIT\n") + return recordsAndContexts, eof +} + +func (reader *RecordReaderDKVPChanPipelined) processHandle( + handle io.Reader, + filename string, + context *types.Context, + inputChannel chan<- *list.List, +) { + context.UpdateForStartOfFile(filename) + m := getBatchSize() + + lineScanner := input.NewLineScanner(handle, reader.readerOptions.IRS) + linesChannel := make(chan string, m) + go chanProvider(lineScanner, linesChannel) + + eof := false + for !eof { + var recordsAndContexts *list.List + recordsAndContexts, eof = reader.getRecordBatch(linesChannel, m, context) + //fmt.Fprintf(os.Stderr, "GOT RECORD BATCH OF LENGTH %d\n", recordsAndContexts.Len()) + inputChannel <- recordsAndContexts + } +} + +func (reader *RecordReaderDKVPChanPipelined) recordFromDKVPLine( + line string, +) *types.Mlrmap { + record := types.NewMlrmap() + + var pairs []string + if reader.readerOptions.IFSRegex == nil { // e.g. 
--no-ifs-regex + pairs = lib.SplitString(line, reader.readerOptions.IFS) + } else { + pairs = lib.RegexSplitString(reader.readerOptions.IFSRegex, line, -1) + } + + for i, pair := range pairs { + var kv []string + if reader.readerOptions.IPSRegex == nil { // e.g. --no-ips-regex + kv = strings.SplitN(line, reader.readerOptions.IPS, 2) + } else { + kv = lib.RegexSplitString(reader.readerOptions.IPSRegex, pair, 2) + } + + if len(kv) == 0 { + // Ignore. This is expected when splitting with repeated IFS. + } else if len(kv) == 1 { + // E.g the pair has no equals sign: "a" rather than "a=1" or + // "a=". Here we use the positional index as the key. This way + // DKVP is a generalization of NIDX. + key := strconv.Itoa(i + 1) // Miller userspace indices are 1-up + value := types.MlrvalFromInferredTypeForDataFiles(kv[0]) + record.PutReference(key, value) + } else { + key := kv[0] + value := types.MlrvalFromInferredTypeForDataFiles(kv[1]) + record.PutReference(key, value) + } + } + return record +} + +// ================================================================ +func ChannelWriter( + outputChannel <-chan *list.List, + recordWriter *RecordWriterDKVP2, + doneChannel chan<- bool, + ostream *bufio.Writer, +) { + for { + recordsAndContexts := <-outputChannel + if recordsAndContexts != nil { + //fmt.Fprintf(os.Stderr, "IOCHAN READ BATCH LEN %d\n", recordsAndContexts.Len()) + } + if recordsAndContexts == nil { + //fmt.Fprintf(os.Stderr, "IOCHAN READ EOS\n") + doneChannel <- true + break + } + + for e := recordsAndContexts.Front(); e != nil; e = e.Next() { + recordAndContext := e.Value.(*types.RecordAndContext) + + // Three things can come through: + // * End-of-stream marker + // * Non-nil records to be printed + // * Strings to be printed from put/filter DSL print/dump/etc + // statements. They are handled here rather than fmt.Println directly + // in the put/filter handlers since we want all print statements and + // record-output to be in the same goroutine, for deterministic + // output ordering. + if !recordAndContext.EndOfStream { + record := recordAndContext.Record + if record != nil { + recordWriter.Write(record, ostream) + } + + outputString := recordAndContext.OutputString + if outputString != "" { + fmt.Print(outputString) + } + + } else { + // Let the record-writers drain their output, if they have any + // queued up. For example, PPRINT needs to see all same-schema + // records before printing any, since it needs to compute max width + // down columns. 
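getRecordBatch above embodies the latency/throughput compromise these readers aim for: take up to maxBatchSize lines per batch, but never block waiting to fill one -- if the lines channel runs dry after the first line, ship a short batch. The heuristic in isolation, assuming the surrounding declarations from getRecordBatch (makeRecord is a hypothetical stand-in for the parse-and-contextualize steps):

    // Sketch: opportunistic batching -- cap the batch, don't stall to fill it.
    for i := 0; i < maxBatchSize; i++ {
        if len(linesChannel) == 0 && i > 0 {
            break // channel drained; emit what we have so far
        }
        line, more := <-linesChannel
        if !more {
            eof = true // producer closed the channel
            break
        }
        recordsAndContexts.PushBack(makeRecord(line))
    }

Big-file reads thus run near maxBatchSize per send, while tail -f-style trickles still flow through a line at a time.
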
+ recordWriter.Write(nil, ostream) + doneChannel <- true + ////fmt.Fprintf(os.Stderr, "ZCHAN WRITE\n") + return + } + } + } +} + +// ================================================================ + +type RecordWriterDKVP2 struct { + writerOptions *cli.TWriterOptions + buffer bool +} + +func NewRecordWriterDKVP2(writerOptions *cli.TWriterOptions) (*RecordWriterDKVP2, error) { + buffer := false + if os.Getenv("MPROF_BUFFER") != "" { + fmt.Fprintf(os.Stderr, "BUFFER ON\n") + buffer = true + } else { + fmt.Fprintf(os.Stderr, "BUFFER OFF\n") + } + return &RecordWriterDKVP2{ + writerOptions: writerOptions, + buffer: buffer, + }, nil +} + +func (writer *RecordWriterDKVP2) Write( + outrec *types.Mlrmap, + ostream *bufio.Writer, +) { + // End of record stream: nothing special for this output format + if outrec == nil { + return + } + + for pe := outrec.Head; pe != nil; pe = pe.Next { + ostream.WriteString(pe.Key) + ostream.WriteString(writer.writerOptions.OPS) + ostream.WriteString(pe.Value.String()) + if pe.Next != nil { + ostream.WriteString(writer.writerOptions.OFS) + } + } + ostream.WriteString(writer.writerOptions.ORS) + if !writer.buffer { + ostream.Flush() + } +} + +// The time.After adds too much overhead, even when there is data +// available very quickly and the timeout is never reached. :( +//select { +//case line, more = <-linesChannel: +// if !more { +// done = true +// break +// } +//case <-time.After(5 * time.Second): +// fmt.Println("WAIT") +// continue +//} diff --git a/cmd/mprof2/main.go b/cmd/mprof2/main.go new file mode 100644 index 0000000000..2b6b0f04ce --- /dev/null +++ b/cmd/mprof2/main.go @@ -0,0 +1,382 @@ +// Experiments in performance/profiling. +package main + +import ( + "bufio" + "container/list" + "fmt" + "io" + "os" + "runtime" + "runtime/debug" + "runtime/pprof" + "strconv" + "strings" + //"time" + + "github.com/pkg/profile" // for trace.out + + "github.com/johnkerl/miller/internal/pkg/cli" + "github.com/johnkerl/miller/internal/pkg/input" + "github.com/johnkerl/miller/internal/pkg/lib" + "github.com/johnkerl/miller/internal/pkg/types" +) + +func main() { + + // Respect env $GOMAXPROCS, if provided, else set default. + haveSetGoMaxProcs := false + goMaxProcsString := os.Getenv("GOMAXPROCS") + if goMaxProcsString != "" { + goMaxProcs, err := strconv.Atoi(goMaxProcsString) + if err != nil { + runtime.GOMAXPROCS(goMaxProcs) + haveSetGoMaxProcs = true + } + } + if !haveSetGoMaxProcs { + // As of Go 1.16 this is the default anyway. For 1.15 and below we need + // to explicitly set this. 
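Both harness mains open with this GOMAXPROCS stanza. Its intent: apply a $GOMAXPROCS override only when the value parses cleanly -- that is, the guard wants err == nil -- and otherwise fall back to runtime.NumCPU(). A minimal sketch of that intent:

    // Sketch: env-driven GOMAXPROCS override, applied only on successful parse.
    haveSetGoMaxProcs := false
    if s := os.Getenv("GOMAXPROCS"); s != "" {
        if n, err := strconv.Atoi(s); err == nil { // success, not failure
            runtime.GOMAXPROCS(n)
            haveSetGoMaxProcs = true
        }
    }
    if !haveSetGoMaxProcs {
        // Default anyway as of Go 1.16; explicit for 1.15 and below.
        runtime.GOMAXPROCS(runtime.NumCPU())
    }
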
+ runtime.GOMAXPROCS(runtime.NumCPU()) + } + + debug.SetGCPercent(500) // Empirical: See README-profiling.md + + if os.Getenv("MPROF_PPROF") != "" { + // profiling with cpu.pprof and go tool pprof -http=:8080 cpu.pprof + profFilename := "cpu.pprof" + handle, err := os.Create(profFilename) + if err != nil { + fmt.Fprintln(os.Stderr, os.Args[0], ": ", "Could not start CPU profile: ", err) + return + } + defer handle.Close() + + if err := pprof.StartCPUProfile(handle); err != nil { + fmt.Fprintln(os.Stderr, os.Args[0], ": ", "Could not start CPU profile: ", err) + return + } + defer pprof.StopCPUProfile() + + fmt.Fprintf(os.Stderr, "CPU profile started.\n") + fmt.Fprintf(os.Stderr, "go tool pprof -http=:8080 cpu.pprof\n") + defer fmt.Fprintf(os.Stderr, "CPU profile finished.\n") + } + + if os.Getenv("MPROF_TRACE") != "" { + // tracing with trace.out and go tool trace trace.out + fmt.Fprintf(os.Stderr, "go tool trace trace.out\n") + defer profile.Start(profile.TraceProfile, profile.ProfilePath(".")).Stop() + } + + options := cli.DefaultOptions() + types.SetInferrerStringOnly() + + filenames := os.Args[1:] + lib.InternalCodingErrorIf(len(filenames) != 1) + filename := filenames[0] + + err := Stream(filename, options, os.Stdout) + if err != nil { + fmt.Fprintf(os.Stderr, "mlr: %v.\n", err) + os.Exit(1) + } +} + +func getBatchSize() int { + return 1000 +} + +// ================================================================ +type IRecordReader interface { + Read(ioChannel chan<- *list.List) error +} + +func Stream( + filename string, + options *cli.TOptions, + outputStream io.WriteCloser, +) error { + initialContext := types.NewContext() + + // Instantiate the record-reader + recordReader, err := NewRecordReaderDKVPChanPipelined(&options.ReaderOptions, filename, initialContext) + if err != nil { + return err + } + + // Instantiate the record-writer + recordWriter, err := NewRecordWriterDKVP2(&options.WriterOptions) + if err != nil { + return err + } + + ostream := bufio.NewWriter(os.Stdout) + defer ostream.Flush() + + ioChannel := make(chan *list.List, 1) + errorChannel := make(chan error, 1) + doneWritingChannel := make(chan bool, 1) + + go recordReader.Read(ioChannel) + go ChannelWriter(ioChannel, recordWriter, doneWritingChannel, ostream) + + done := false + for !done { + select { + case err := <-errorChannel: + ////fmt.Fprintf(os.Stderr, "ECHAN READ\n") + fmt.Fprintln(os.Stderr, "mlr", ": ", err) + os.Exit(1) + case _ = <-doneWritingChannel: + ////fmt.Fprintf(os.Stderr, "ZCHAN READ\n") + done = true + break + } + } + + return nil +} + +// ================================================================ + +type RecordReaderDKVPChanPipelined struct { + readerOptions *cli.TReaderOptions + filename string + initialContext *types.Context +} + +func NewRecordReaderDKVPChanPipelined( + readerOptions *cli.TReaderOptions, + filename string, + initialContext *types.Context, +) (*RecordReaderDKVPChanPipelined, error) { + return &RecordReaderDKVPChanPipelined{ + readerOptions: readerOptions, + filename: filename, + initialContext: initialContext, + }, nil +} + +func (reader *RecordReaderDKVPChanPipelined) Read( + inputChannel chan<- *list.List, +) error { + handle, err := lib.OpenFileForRead( + reader.filename, + reader.readerOptions.Prepipe, + reader.readerOptions.PrepipeIsRaw, + reader.readerOptions.FileInputEncoding, + ) + if err != nil { + return err + } else { + reader.processHandle(handle, reader.filename, reader.initialContext, inputChannel) + handle.Close() + } + + eom := 
types.NewEndOfStreamMarker(reader.initialContext)
+	leom := list.New()
+	leom.PushBack(eom)
+	inputChannel <- leom
+	////fmt.Fprintf(os.Stderr, "IOCHAN WRITE EOM\n")
+	return nil
+}
+
+func chanProvider(
+	lineScanner *bufio.Scanner,
+	linesChannel chan<- string,
+) {
+	for lineScanner.Scan() {
+		linesChannel <- lineScanner.Text()
+	}
+	close(linesChannel) // end-of-stream marker
+}
+
+// TODO: comment copiously we're trying to handle slow/fast/short/long
+// reads: tail -f, smallfile, bigfile.
+func (reader *RecordReaderDKVPChanPipelined) getRecordBatch(
+	linesChannel <-chan string,
+	maxBatchSize int,
+	context *types.Context,
+) (
+	recordsAndContexts *list.List,
+	eof bool,
+) {
+	//fmt.Printf("GRB ENTER\n")
+	recordsAndContexts = list.New()
+	eof = false
+
+	for i := 0; i < maxBatchSize; i++ {
+		//fmt.Fprintf(os.Stderr, "-- %d/%d %d/%d\n", i, maxBatchSize, len(linesChannel), cap(linesChannel))
+		if len(linesChannel) == 0 && i > 0 {
+			//fmt.Println(" .. BREAK")
+			break
+		}
+		//fmt.Println(" .. B:BLOCK")
+		line, more := <-linesChannel
+		//fmt.Printf(" .. E:BLOCK <<%s>> %v\n", line, more)
+		if !more {
+			eof = true
+			break
+		}
+
+		record := reader.recordFromDKVPLine(line)
+		context.UpdateForInputRecord()
+		recordAndContext := types.NewRecordAndContext(record, context)
+		recordsAndContexts.PushBack(recordAndContext)
+	}
+
+	//fmt.Printf("GRB EXIT\n")
+	return recordsAndContexts, eof
+}
+
+func (reader *RecordReaderDKVPChanPipelined) processHandle(
+	handle io.Reader,
+	filename string,
+	context *types.Context,
+	inputChannel chan<- *list.List,
+) {
+	context.UpdateForStartOfFile(filename)
+	m := getBatchSize()
+
+	lineScanner := input.NewLineScanner(handle, reader.readerOptions.IRS)
+	linesChannel := make(chan string, m)
+	go chanProvider(lineScanner, linesChannel)
+
+	eof := false
+	for !eof {
+		var recordsAndContexts *list.List
+		recordsAndContexts, eof = reader.getRecordBatch(linesChannel, m, context)
+		//fmt.Fprintf(os.Stderr, "GOT RECORD BATCH OF LENGTH %d\n", recordsAndContexts.Len())
+		inputChannel <- recordsAndContexts
+	}
+}
+
+func (reader *RecordReaderDKVPChanPipelined) recordFromDKVPLine(
+	line string,
+) *types.Mlrmap {
+	record := types.NewMlrmap()
+
+	var pairs []string
+	if reader.readerOptions.IFSRegex == nil { // e.g. --no-ifs-regex
+		pairs = lib.SplitString(line, reader.readerOptions.IFS)
+	} else {
+		pairs = lib.RegexSplitString(reader.readerOptions.IFSRegex, line, -1)
+	}
+
+	for i, pair := range pairs {
+		var kv []string
+		if reader.readerOptions.IPSRegex == nil { // e.g. --no-ips-regex
+			kv = strings.SplitN(pair, reader.readerOptions.IPS, 2)
+		} else {
+			kv = lib.RegexSplitString(reader.readerOptions.IPSRegex, pair, 2)
+		}
+
+		if len(kv) == 0 {
+			// Ignore. This is expected when splitting with repeated IFS.
+		} else if len(kv) == 1 {
+			// E.g. the pair has no equals sign: "a" rather than "a=1" or
+			// "a=". Here we use the positional index as the key. This way
+			// DKVP is a generalization of NIDX.
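+			// For example, with IFS "," and IPS "=", the line "a=1,b,c=3"
+			// becomes the record a=1, 2=b, c=3.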
+ key := strconv.Itoa(i + 1) // Miller userspace indices are 1-up + value := types.MlrvalFromInferredTypeForDataFiles(kv[0]) + record.PutReference(key, value) + } else { + key := kv[0] + value := types.MlrvalFromInferredTypeForDataFiles(kv[1]) + record.PutReference(key, value) + } + } + return record +} + +// ================================================================ +func ChannelWriter( + outputChannel <-chan *list.List, + recordWriter *RecordWriterDKVP2, + doneChannel chan<- bool, + ostream *bufio.Writer, +) { + for { + recordsAndContexts := <-outputChannel + if recordsAndContexts != nil { + //fmt.Fprintf(os.Stderr, "IOCHAN READ BATCH LEN %d\n", recordsAndContexts.Len()) + } + if recordsAndContexts == nil { + //fmt.Fprintf(os.Stderr, "IOCHAN READ EOS\n") + doneChannel <- true + break + } + + for e := recordsAndContexts.Front(); e != nil; e = e.Next() { + recordAndContext := e.Value.(*types.RecordAndContext) + + // Three things can come through: + // * End-of-stream marker + // * Non-nil records to be printed + // * Strings to be printed from put/filter DSL print/dump/etc + // statements. They are handled here rather than fmt.Println directly + // in the put/filter handlers since we want all print statements and + // record-output to be in the same goroutine, for deterministic + // output ordering. + if !recordAndContext.EndOfStream { + record := recordAndContext.Record + if record != nil { + recordWriter.Write(record, ostream) + } + + outputString := recordAndContext.OutputString + if outputString != "" { + fmt.Print(outputString) + } + + } else { + // Let the record-writers drain their output, if they have any + // queued up. For example, PPRINT needs to see all same-schema + // records before printing any, since it needs to compute max width + // down columns. 
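+				// A nil record is the writer-side end-of-stream convention;
+				// the DKVP writer below treats it as a no-op.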
+ recordWriter.Write(nil, ostream) + doneChannel <- true + ////fmt.Fprintf(os.Stderr, "ZCHAN WRITE\n") + return + } + } + } +} + +// ================================================================ + +type RecordWriterDKVP2 struct { + writerOptions *cli.TWriterOptions + buffer bool +} + +func NewRecordWriterDKVP2(writerOptions *cli.TWriterOptions) (*RecordWriterDKVP2, error) { + return &RecordWriterDKVP2{ + writerOptions: writerOptions, + buffer: true, + }, nil +} + +func (writer *RecordWriterDKVP2) Write( + outrec *types.Mlrmap, + ostream *bufio.Writer, +) { + // End of record stream: nothing special for this output format + if outrec == nil { + return + } + + for pe := outrec.Head; pe != nil; pe = pe.Next { + ostream.WriteString(pe.Key) + ostream.WriteString(writer.writerOptions.OPS) + ostream.WriteString(pe.Value.String()) + if pe.Next != nil { + ostream.WriteString(writer.writerOptions.OFS) + } + } + ostream.WriteString(writer.writerOptions.ORS) + if !writer.buffer { + ostream.Flush() + } +} From 6ad475bfc0d2b4b7592f2ca8c5e8ee1e1c703569 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Wed, 8 Dec 2021 21:13:36 -0500 Subject: [PATCH 11/28] cmd/mprof3 and cmd/mprof4 --- .vimrc | 1 + Makefile | 7 +- cmd/mprof2/main.go | 47 +--- cmd/mprof3/main.go | 306 +++++++++++++++++++++++ cmd/mprof4/main.go | 130 ++++++++++ internal/pkg/input/record_reader_dkvp.go | 17 +- todo.txt | 7 + 7 files changed, 466 insertions(+), 49 deletions(-) create mode 100644 cmd/mprof3/main.go create mode 100644 cmd/mprof4/main.go diff --git a/.vimrc b/.vimrc index 3bb1e3df89..ceb2dc6f50 100644 --- a/.vimrc +++ b/.vimrc @@ -1 +1,2 @@ map \d :w:!clear;echo Building ...; echo; make build +map \f :w:!clear;echo Building ...; echo; make mall diff --git a/Makefile b/Makefile index 0362f95b4b..f2864ec59d 100644 --- a/Makefile +++ b/Makefile @@ -77,6 +77,11 @@ mprof: go build github.com/johnkerl/miller/cmd/mprof mprof2: go build github.com/johnkerl/miller/cmd/mprof2 +mprof3: + go build github.com/johnkerl/miller/cmd/mprof3 +mprof4: + go build github.com/johnkerl/miller/cmd/mprof4 +mall: mprof4 mprof3 mprof2 mprof mlr # Please see comments in ./create-release-tarball as well as # https://miller.readthedocs.io/en/latest/build/#creating-a-new-release-for-developers @@ -84,4 +89,4 @@ release_tarball: build check ./create-release-tarball # Go does its own dependency management, outside of make. 
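+# Example profiling loop (a sketch; "data.dkvp" stands in for any local DKVP
+# input file):
+#   make mall && MPROF_PPROF=1 ./mprof2 data.dkvp > /dev/null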
-.PHONY: build mlr mprof mprof2 check unit_test regression_test fmt dev +.PHONY: build mlr mprof mprof2 mprof3 mprof4 check unit_test regression_test fmt dev diff --git a/cmd/mprof2/main.go b/cmd/mprof2/main.go index 2b6b0f04ce..1619bb2e83 100644 --- a/cmd/mprof2/main.go +++ b/cmd/mprof2/main.go @@ -20,6 +20,7 @@ import ( "github.com/johnkerl/miller/internal/pkg/input" "github.com/johnkerl/miller/internal/pkg/lib" "github.com/johnkerl/miller/internal/pkg/types" + "github.com/johnkerl/miller/internal/pkg/output" ) func main() { @@ -106,7 +107,7 @@ func Stream( } // Instantiate the record-writer - recordWriter, err := NewRecordWriterDKVP2(&options.WriterOptions) + recordWriter, err := output.NewRecordWriterDKVP(&options.WriterOptions) if err != nil { return err } @@ -293,10 +294,11 @@ func (reader *RecordReaderDKVPChanPipelined) recordFromDKVPLine( // ================================================================ func ChannelWriter( outputChannel <-chan *list.List, - recordWriter *RecordWriterDKVP2, + recordWriter output.IRecordWriter, doneChannel chan<- bool, ostream *bufio.Writer, ) { + outputIsStdout := true for { recordsAndContexts := <-outputChannel if recordsAndContexts != nil { @@ -322,7 +324,7 @@ func ChannelWriter( if !recordAndContext.EndOfStream { record := recordAndContext.Record if record != nil { - recordWriter.Write(record, ostream) + recordWriter.Write(record, ostream, outputIsStdout) } outputString := recordAndContext.OutputString @@ -335,7 +337,7 @@ func ChannelWriter( // queued up. For example, PPRINT needs to see all same-schema // records before printing any, since it needs to compute max width // down columns. - recordWriter.Write(nil, ostream) + recordWriter.Write(nil, ostream, outputIsStdout) doneChannel <- true ////fmt.Fprintf(os.Stderr, "ZCHAN WRITE\n") return @@ -343,40 +345,3 @@ func ChannelWriter( } } } - -// ================================================================ - -type RecordWriterDKVP2 struct { - writerOptions *cli.TWriterOptions - buffer bool -} - -func NewRecordWriterDKVP2(writerOptions *cli.TWriterOptions) (*RecordWriterDKVP2, error) { - return &RecordWriterDKVP2{ - writerOptions: writerOptions, - buffer: true, - }, nil -} - -func (writer *RecordWriterDKVP2) Write( - outrec *types.Mlrmap, - ostream *bufio.Writer, -) { - // End of record stream: nothing special for this output format - if outrec == nil { - return - } - - for pe := outrec.Head; pe != nil; pe = pe.Next { - ostream.WriteString(pe.Key) - ostream.WriteString(writer.writerOptions.OPS) - ostream.WriteString(pe.Value.String()) - if pe.Next != nil { - ostream.WriteString(writer.writerOptions.OFS) - } - } - ostream.WriteString(writer.writerOptions.ORS) - if !writer.buffer { - ostream.Flush() - } -} diff --git a/cmd/mprof3/main.go b/cmd/mprof3/main.go new file mode 100644 index 0000000000..fe30e8c527 --- /dev/null +++ b/cmd/mprof3/main.go @@ -0,0 +1,306 @@ +// Experiments in performance/profiling. +package main + +import ( + "bufio" + "container/list" + "fmt" + "io" + "os" + "runtime" + "runtime/debug" + "runtime/pprof" + "strconv" + "strings" + + "github.com/pkg/profile" // for trace.out + + "github.com/johnkerl/miller/internal/pkg/cli" + "github.com/johnkerl/miller/internal/pkg/input" + "github.com/johnkerl/miller/internal/pkg/lib" + "github.com/johnkerl/miller/internal/pkg/output" + "github.com/johnkerl/miller/internal/pkg/types" +) + +func main() { + + // Respect env $GOMAXPROCS, if provided, else set default. 
+	haveSetGoMaxProcs := false
+	goMaxProcsString := os.Getenv("GOMAXPROCS")
+	if goMaxProcsString != "" {
+		goMaxProcs, err := strconv.Atoi(goMaxProcsString)
+		if err == nil {
+			runtime.GOMAXPROCS(goMaxProcs)
+			haveSetGoMaxProcs = true
+		}
+	}
+	if !haveSetGoMaxProcs {
+		// As of Go 1.16 this is the default anyway. For 1.15 and below we need
+		// to explicitly set this.
+		runtime.GOMAXPROCS(runtime.NumCPU())
+	}
+
+	debug.SetGCPercent(500) // Empirical: See README-profiling.md
+
+	if os.Getenv("MPROF_PPROF") != "" {
+		// profiling with cpu.pprof and go tool pprof -http=:8080 cpu.pprof
+		profFilename := "cpu.pprof"
+		handle, err := os.Create(profFilename)
+		if err != nil {
+			fmt.Fprintln(os.Stderr, os.Args[0], ": ", "Could not start CPU profile: ", err)
+			return
+		}
+		defer handle.Close()
+
+		if err := pprof.StartCPUProfile(handle); err != nil {
+			fmt.Fprintln(os.Stderr, os.Args[0], ": ", "Could not start CPU profile: ", err)
+			return
+		}
+		defer pprof.StopCPUProfile()
+
+		fmt.Fprintf(os.Stderr, "CPU profile started.\n")
+		fmt.Fprintf(os.Stderr, "go tool pprof -http=:8080 cpu.pprof\n")
+		defer fmt.Fprintf(os.Stderr, "CPU profile finished.\n")
+	}
+
+	if os.Getenv("MPROF_TRACE") != "" {
+		// tracing with trace.out and go tool trace trace.out
+		fmt.Fprintf(os.Stderr, "go tool trace trace.out\n")
+		defer profile.Start(profile.TraceProfile, profile.ProfilePath(".")).Stop()
+	}
+
+	options := cli.DefaultOptions()
+	types.SetInferrerStringOnly()
+
+	filenames := os.Args[1:]
+	lib.InternalCodingErrorIf(len(filenames) != 1)
+	filename := filenames[0]
+
+	err := Stream(filename, options, os.Stdout)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "mlr: %v.\n", err)
+		os.Exit(1)
+	}
+}
+
+func getBatchSize() int {
+	return 1000
+}
+
+// ================================================================
+type IRecordReader interface {
+	Read(ioChannel chan<- *list.List) error
+}
+
+func Stream(
+	filename string,
+	options *cli.TOptions,
+	outputStream io.WriteCloser,
+) error {
+	initialContext := types.NewContext()
+
+	// Instantiate the record-reader
+	recordReader, err := NewRecordReaderDKVPChanPipelined(&options.ReaderOptions, filename, initialContext)
+	if err != nil {
+		return err
+	}
+
+	// Instantiate the record-writer
+	recordWriter, err := output.NewRecordWriterDKVP(&options.WriterOptions)
+	if err != nil {
+		return err
+	}
+
+	bufferedOutputStream := bufio.NewWriter(os.Stdout)
+	defer bufferedOutputStream.Flush()
+
+	ioChannel := make(chan *list.List, 1)
+	errorChannel := make(chan error, 1)
+	doneWritingChannel := make(chan bool, 1)
+
+	go recordReader.Read(ioChannel)
+	go output.ChannelWriter(ioChannel, recordWriter, &options.WriterOptions, doneWritingChannel,
+		bufferedOutputStream, true)
+
+	done := false
+	for !done {
+		select {
+		case err := <-errorChannel:
+			////fmt.Fprintf(os.Stderr, "ECHAN READ\n")
+			fmt.Fprintln(os.Stderr, "mlr", ": ", err)
+			os.Exit(1)
+		case _ = <-doneWritingChannel:
+			////fmt.Fprintf(os.Stderr, "ZCHAN READ\n")
+			done = true
+			break
+		}
+	}
+
+	return nil
+}
+
+// ================================================================
+
+type RecordReaderDKVPChanPipelined struct {
+	readerOptions  *cli.TReaderOptions
+	filename       string
+	initialContext *types.Context
+}
+
+func NewRecordReaderDKVPChanPipelined(
+	readerOptions *cli.TReaderOptions,
+	filename string,
+	initialContext *types.Context,
+) (*RecordReaderDKVPChanPipelined, error) {
+	return &RecordReaderDKVPChanPipelined{
+		readerOptions:  readerOptions,
+		filename:       filename,
+		initialContext: initialContext,
+	}, nil
+} + +func (reader *RecordReaderDKVPChanPipelined) Read( + readerChannel chan<- *list.List, +) error { + handle, err := lib.OpenFileForRead( + reader.filename, + reader.readerOptions.Prepipe, + reader.readerOptions.PrepipeIsRaw, + reader.readerOptions.FileInputEncoding, + ) + if err != nil { + return err + } else { + reader.processHandle(handle, reader.filename, reader.initialContext, readerChannel) + handle.Close() + } + + eom := types.NewEndOfStreamMarker(reader.initialContext) + leom := list.New() + leom.PushBack(eom) + readerChannel <- leom + ////fmt.Fprintf(os.Stderr, "IOCHAN WRITE EOM\n") + return nil +} + +func provideChannelizedLines( + lineScanner *bufio.Scanner, + linesChannel chan<- string, +) { + for lineScanner.Scan() { + linesChannel <- lineScanner.Text() + } + close(linesChannel) // end-of-stream marker +} + +func (reader *RecordReaderDKVPChanPipelined) processHandle( + handle io.Reader, + filename string, + context *types.Context, + readerChannel chan<- *list.List, +) { + context.UpdateForStartOfFile(filename) + m := getBatchSize() + + lineScanner := input.NewLineScanner(handle, reader.readerOptions.IRS) + linesChannel := make(chan string, m) + go provideChannelizedLines(lineScanner, linesChannel) + + eof := false + for !eof { + var recordsAndContexts *list.List + recordsAndContexts, eof = reader.getRecordBatch(linesChannel, m, context) + //fmt.Fprintf(os.Stderr, "GOT RECORD BATCH OF LENGTH %d\n", recordsAndContexts.Len()) + readerChannel <- recordsAndContexts + } +} + +// TODO: comment copiously we're trying to handle slow/fast/short/long +// reads: tail -f, smallfile, bigfile. +func (reader *RecordReaderDKVPChanPipelined) getRecordBatch( + linesChannel <-chan string, + maxBatchSize int, + context *types.Context, +) ( + recordsAndContexts *list.List, + eof bool, +) { + //fmt.Printf("GRB ENTER\n") + recordsAndContexts = list.New() + eof = false + + for i := 0; i < maxBatchSize; i++ { + //fmt.Fprintf(os.Stderr, "-- %d/%d %d/%d\n", i, maxBatchSize, len(linesChannel), cap(linesChannel)) + if len(linesChannel) == 0 && i > 0 { + //fmt.Println(" .. BREAK") + break + } + //fmt.Println(" .. B:BLOCK") + line, more := <-linesChannel + //fmt.Printf(" .. E:BLOCK <<%s>> %v\n", line, more) + if !more { + eof = true + break + } + + // Check for comments-in-data feature + // TODO: funcptr this away + if reader.readerOptions.CommentHandling != cli.CommentsAreData { + if strings.HasPrefix(line, reader.readerOptions.CommentString) { + if reader.readerOptions.CommentHandling == cli.PassComments { + recordsAndContexts.PushBack(types.NewOutputStringList(line+"\n", context)) + continue + } else if reader.readerOptions.CommentHandling == cli.SkipComments { + continue + } + // else comments are data + } + } + + record := reader.recordFromDKVPLine(line) + context.UpdateForInputRecord() + recordAndContext := types.NewRecordAndContext(record, context) + recordsAndContexts.PushBack(recordAndContext) + } + + //fmt.Printf("GRB EXIT\n") + return recordsAndContexts, eof +} + +func (reader *RecordReaderDKVPChanPipelined) recordFromDKVPLine( + line string, +) *types.Mlrmap { + record := types.NewMlrmapAsRecord() + + var pairs []string + if reader.readerOptions.IFSRegex == nil { // e.g. --no-ifs-regex + pairs = lib.SplitString(line, reader.readerOptions.IFS) + } else { + pairs = lib.RegexSplitString(reader.readerOptions.IFSRegex, line, -1) + } + + for i, pair := range pairs { + var kv []string + if reader.readerOptions.IPSRegex == nil { // e.g. 
--no-ips-regex
+			kv = strings.SplitN(pair, reader.readerOptions.IPS, 2)
+		} else {
+			kv = lib.RegexSplitString(reader.readerOptions.IPSRegex, pair, 2)
+		}
+
+		if len(kv) == 0 {
+			// Ignore. This is expected when splitting with repeated IFS.
+		} else if len(kv) == 1 {
+			// E.g. the pair has no equals sign: "a" rather than "a=1" or
+			// "a=". Here we use the positional index as the key. This way
+			// DKVP is a generalization of NIDX.
+			key := strconv.Itoa(i + 1) // Miller userspace indices are 1-up
+			value := types.MlrvalFromInferredTypeForDataFiles(kv[0])
+			record.PutReference(key, value)
+		} else {
+			key := kv[0]
+			value := types.MlrvalFromInferredTypeForDataFiles(kv[1])
+			record.PutReference(key, value)
+		}
+	}
+	return record
+}
diff --git a/cmd/mprof4/main.go b/cmd/mprof4/main.go
new file mode 100644
index 0000000000..bf0be3651d
--- /dev/null
+++ b/cmd/mprof4/main.go
@@ -0,0 +1,130 @@
+// Experiments in performance/profiling.
+package main
+
+import (
+	"bufio"
+	"container/list"
+	"fmt"
+	"io"
+	"os"
+	"runtime"
+	"runtime/debug"
+	"runtime/pprof"
+	"strconv"
+
+	"github.com/pkg/profile" // for trace.out
+
+	"github.com/johnkerl/miller/internal/pkg/cli"
+	"github.com/johnkerl/miller/internal/pkg/input"
+	"github.com/johnkerl/miller/internal/pkg/output"
+	"github.com/johnkerl/miller/internal/pkg/types"
+)
+
+func main() {
+
+	// Respect env $GOMAXPROCS, if provided, else set default.
+	haveSetGoMaxProcs := false
+	goMaxProcsString := os.Getenv("GOMAXPROCS")
+	if goMaxProcsString != "" {
+		goMaxProcs, err := strconv.Atoi(goMaxProcsString)
+		if err == nil {
+			runtime.GOMAXPROCS(goMaxProcs)
+			haveSetGoMaxProcs = true
+		}
+	}
+	if !haveSetGoMaxProcs {
+		// As of Go 1.16 this is the default anyway. For 1.15 and below we need
+		// to explicitly set this.
+ runtime.GOMAXPROCS(runtime.NumCPU()) + } + + debug.SetGCPercent(500) // Empirical: See README-profiling.md + + if os.Getenv("MPROF_PPROF") != "" { + // profiling with cpu.pprof and go tool pprof -http=:8080 cpu.pprof + profFilename := "cpu.pprof" + handle, err := os.Create(profFilename) + if err != nil { + fmt.Fprintln(os.Stderr, os.Args[0], ": ", "Could not start CPU profile: ", err) + return + } + defer handle.Close() + + if err := pprof.StartCPUProfile(handle); err != nil { + fmt.Fprintln(os.Stderr, os.Args[0], ": ", "Could not start CPU profile: ", err) + return + } + defer pprof.StopCPUProfile() + + fmt.Fprintf(os.Stderr, "CPU profile started.\n") + fmt.Fprintf(os.Stderr, "go tool pprof -http=:8080 cpu.pprof\n") + defer fmt.Fprintf(os.Stderr, "CPU profile finished.\n") + } + + if os.Getenv("MPROF_TRACE") != "" { + // tracing with trace.out and go tool trace trace.out + fmt.Fprintf(os.Stderr, "go tool trace trace.out\n") + defer profile.Start(profile.TraceProfile, profile.ProfilePath(".")).Stop() + } + + options := cli.DefaultOptions() + types.SetInferrerStringOnly() + + filenames := os.Args[1:] + + err := Stream(filenames, options, os.Stdout) + if err != nil { + fmt.Fprintf(os.Stderr, "mlr: %v.\n", err) + os.Exit(1) + } +} + +func getBatchSize() int { + return 1000 +} + +func Stream( + filenames []string, + options *cli.TOptions, + outputStream io.WriteCloser, +) error { + initialContext := types.NewContext() + + // Instantiate the record-reader + recordReader, err := input.NewRecordReaderDKVP(&options.ReaderOptions, getBatchSize()) + if err != nil { + return err + } + + // Instantiate the record-writer + recordWriter, err := output.NewRecordWriterDKVP(&options.WriterOptions) + if err != nil { + return err + } + + bufferedOutputStream := bufio.NewWriter(os.Stdout) + defer bufferedOutputStream.Flush() + + ioChannel := make(chan *list.List, 1) + downstreamDoneChannel := make(chan bool, 0) + errorChannel := make(chan error, 1) + doneWritingChannel := make(chan bool, 1) + + go recordReader.Read(filenames, *initialContext, ioChannel, errorChannel, downstreamDoneChannel) + go output.ChannelWriter(ioChannel, recordWriter, &options.WriterOptions, doneWritingChannel, + bufferedOutputStream, true) + + done := false + for !done { + select { + case err := <-errorChannel: + fmt.Fprintln(os.Stderr, "mlr", ": ", err) + os.Exit(1) + case _ = <-doneWritingChannel: + done = true + break + } + } + + return nil +} diff --git a/internal/pkg/input/record_reader_dkvp.go b/internal/pkg/input/record_reader_dkvp.go index 7c05d3d96e..b73df0b2b5 100644 --- a/internal/pkg/input/record_reader_dkvp.go +++ b/internal/pkg/input/record_reader_dkvp.go @@ -236,14 +236,17 @@ func (reader *RecordReaderDKVP) getRecordBatch( } // Check for comments-in-data feature - if strings.HasPrefix(line, reader.readerOptions.CommentString) { - if reader.readerOptions.CommentHandling == cli.PassComments { - recordsAndContexts.PushBack(types.NewOutputStringList(line+"\n", context)) - continue - } else if reader.readerOptions.CommentHandling == cli.SkipComments { - continue + // TODO: funcptr this away + if reader.readerOptions.CommentHandling != cli.CommentsAreData { + if strings.HasPrefix(line, reader.readerOptions.CommentString) { + if reader.readerOptions.CommentHandling == cli.PassComments { + recordsAndContexts.PushBack(types.NewOutputStringList(line+"\n", context)) + continue + } else if reader.readerOptions.CommentHandling == cli.SkipComments { + continue + } + // else comments are data } - // else comments are data } record := 
reader.recordFromDKVPLine(line) diff --git a/todo.txt b/todo.txt index 8ed729f574..b72d843103 100644 --- a/todo.txt +++ b/todo.txt @@ -1,6 +1,10 @@ ================================================================ PUNCHDOWN LIST +uu 1.5514 +uu2 1.8186 +uu3 2.2992 + * perf wup @ rgp.md * perf: o go tool pprof -http=:8080 cpu.pprof @@ -67,6 +71,9 @@ PUNCHDOWN LIST > how to structure this for the transformers? > do that after hiding ! have an extra eye on CSV-reader perf + ? why mprof,mprof2 record-writer 1.5486, 1.9036 + ? double-check flush-handling + - funcptr around comment-handling stuff - mprof split-reader getenv something - flags for mlr.pprof & trace.out to CLI - hide app-level scan/format under sys-level read/write: also batched From b6ab5d982f7b75b7097535e77d70b0c5cbdc2f19 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Wed, 8 Dec 2021 23:20:46 -0500 Subject: [PATCH 12/28] narrowed in on regexp-splitting on IFS/IPS as perf-hit --- Makefile | 6 ++++-- cmd/mprof4/main.go | 6 +++--- internal/pkg/input/record_reader_dkvp.go | 1 + internal/pkg/stream/stream.go | 8 ++++---- todo.txt | 5 ++--- 5 files changed, 14 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index f2864ec59d..276dcae214 100644 --- a/Makefile +++ b/Makefile @@ -81,7 +81,9 @@ mprof3: go build github.com/johnkerl/miller/cmd/mprof3 mprof4: go build github.com/johnkerl/miller/cmd/mprof4 -mall: mprof4 mprof3 mprof2 mprof mlr +mprof5: + go build github.com/johnkerl/miller/cmd/mprof5 +mall: mprof5 mprof4 mprof3 mprof2 mprof mlr # Please see comments in ./create-release-tarball as well as # https://miller.readthedocs.io/en/latest/build/#creating-a-new-release-for-developers @@ -89,4 +91,4 @@ release_tarball: build check ./create-release-tarball # Go does its own dependency management, outside of make. -.PHONY: build mlr mprof mprof2 mprof3 mprof4 check unit_test regression_test fmt dev +.PHONY: build mlr mprof mprof2 mprof3 mprof4 mprof5 check unit_test regression_test fmt dev diff --git a/cmd/mprof4/main.go b/cmd/mprof4/main.go index bf0be3651d..88828b510b 100644 --- a/cmd/mprof4/main.go +++ b/cmd/mprof4/main.go @@ -80,7 +80,7 @@ func main() { } func getBatchSize() int { - return 1000 + return 500 } func Stream( @@ -106,11 +106,11 @@ func Stream( defer bufferedOutputStream.Flush() ioChannel := make(chan *list.List, 1) - downstreamDoneChannel := make(chan bool, 0) + readerDownstreamDoneChannel := make(chan bool, 0) errorChannel := make(chan error, 1) doneWritingChannel := make(chan bool, 1) - go recordReader.Read(filenames, *initialContext, ioChannel, errorChannel, downstreamDoneChannel) + go recordReader.Read(filenames, *initialContext, ioChannel, errorChannel, readerDownstreamDoneChannel) go output.ChannelWriter(ioChannel, recordWriter, &options.WriterOptions, doneWritingChannel, bufferedOutputStream, true) diff --git a/internal/pkg/input/record_reader_dkvp.go b/internal/pkg/input/record_reader_dkvp.go index b73df0b2b5..5665b0c018 100644 --- a/internal/pkg/input/record_reader_dkvp.go +++ b/internal/pkg/input/record_reader_dkvp.go @@ -265,6 +265,7 @@ func (reader *RecordReaderDKVP) recordFromDKVPLine( record := types.NewMlrmapAsRecord() var pairs []string + // TODO: func-pointer this away if reader.readerOptions.IFSRegex == nil { // e.g. 
--no-ifs-regex pairs = lib.SplitString(line, reader.readerOptions.IFS) } else { diff --git a/internal/pkg/stream/stream.go b/internal/pkg/stream/stream.go index 93e4b168df..fcf40d63c9 100644 --- a/internal/pkg/stream/stream.go +++ b/internal/pkg/stream/stream.go @@ -63,14 +63,14 @@ func Stream( } // Set up the reader-to-transformer and transformer-to-writer channels. - readerChannel := make(chan *list.List, 2) // list of *types.RecordAndContext - writerChannel := make(chan *list.List, 2) // list of *types.RecordAndContext + readerChannel := make(chan *list.List, 1) // list of *types.RecordAndContext + writerChannel := make(chan *list.List, 1) // list of *types.RecordAndContext // We're done when a fatal error is registered on input (file not found, // etc) or when the record-writer has written all its output. We use // channels to communicate both of these conditions. - errorChannel := make(chan error, 0) - doneWritingChannel := make(chan bool, 0) + errorChannel := make(chan error, 1) + doneWritingChannel := make(chan bool, 1) // For mlr head, so a transformer can communicate it will disregard all // further input. It writes this back upstream, and that is passed back to diff --git a/todo.txt b/todo.txt index b72d843103..9ac01a4cb0 100644 --- a/todo.txt +++ b/todo.txt @@ -1,9 +1,7 @@ ================================================================ PUNCHDOWN LIST -uu 1.5514 -uu2 1.8186 -uu3 2.2992 +! mlr --csv sort -f shape $mlds/example.csv * perf wup @ rgp.md * perf: @@ -45,6 +43,7 @@ uu3 2.2992 ? readerChannel length 1 or 2 ? ? experiment again with hashed/unhashed -- mlr sort etc ? coalesce errchan & done-writing w/ Err to RAC, and close-chan *and* EOSMarker -- ? + - funcptr away the ifs/ifsregex check in record-readers - checklist channelize all record-reader types - do and maybe keep? 
record-reader return (raclist, err) & refactor repl accordingly > needs factor for-loop to stateful so maybe not From 3faf66af215dbc6069f87609c972952b94bc1a23 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Thu, 9 Dec 2021 00:14:26 -0500 Subject: [PATCH 13/28] neaten --- .vimrc | 2 +- cmd/mlr/main.go | 4 +- cmd/mprof2/main.go | 2 +- internal/pkg/input/record_reader_dkvp.go | 183 +++++++++--------- internal/pkg/stream/stream.go | 22 +-- .../pkg/transformers/aaa_chain_transformer.go | 18 +- internal/pkg/transformers/head.go | 1 + 7 files changed, 115 insertions(+), 117 deletions(-) diff --git a/.vimrc b/.vimrc index ceb2dc6f50..18c0048c39 100644 --- a/.vimrc +++ b/.vimrc @@ -1,2 +1,2 @@ -map \d :w:!clear;echo Building ...; echo; make build +map \d :w:!clear;echo Building ...; echo; make mlr map \f :w:!clear;echo Building ...; echo; make mall diff --git a/cmd/mlr/main.go b/cmd/mlr/main.go index 87505f1b61..0e8c9f3e4d 100644 --- a/cmd/mlr/main.go +++ b/cmd/mlr/main.go @@ -9,8 +9,8 @@ import ( "runtime/pprof" "strconv" - "github.com/pkg/profile" // for trace.out "github.com/johnkerl/miller/internal/pkg/entrypoint" + "github.com/pkg/profile" // for trace.out ) func main() { @@ -60,7 +60,7 @@ func main() { defer pprof.StopCPUProfile() fmt.Fprintf(os.Stderr, "CPU profile started.\n") - defer fmt.Fprintf(os.Stderr, "CPU profile finished: go tool pprof -http=:8080 %s\n", profFilename) + defer fmt.Fprintf(os.Stderr, "CPU profile finished.\ngo tool pprof -http=:8080 %s\n", profFilename) } if len(os.Args) >= 3 && os.Args[1] == "--traceprofile" { diff --git a/cmd/mprof2/main.go b/cmd/mprof2/main.go index 1619bb2e83..b680e48a90 100644 --- a/cmd/mprof2/main.go +++ b/cmd/mprof2/main.go @@ -19,8 +19,8 @@ import ( "github.com/johnkerl/miller/internal/pkg/cli" "github.com/johnkerl/miller/internal/pkg/input" "github.com/johnkerl/miller/internal/pkg/lib" - "github.com/johnkerl/miller/internal/pkg/types" "github.com/johnkerl/miller/internal/pkg/output" + "github.com/johnkerl/miller/internal/pkg/types" ) func main() { diff --git a/internal/pkg/input/record_reader_dkvp.go b/internal/pkg/input/record_reader_dkvp.go index 5665b0c018..bca942c70e 100644 --- a/internal/pkg/input/record_reader_dkvp.go +++ b/internal/pkg/input/record_reader_dkvp.go @@ -77,69 +77,30 @@ func (reader *RecordReaderDKVP) processHandle( recordsPerBatch := reader.readerOptions.RecordsPerBatch lineScanner := NewLineScanner(handle, reader.readerOptions.IRS) - linesChannel := make(chan string, recordsPerBatch) + ////linesChannel := make(chan string, recordsPerBatch) + linesChannel := make(chan *list.List, recordsPerBatch) go provideChannelizedLines(lineScanner, linesChannel, downstreamDoneChannel, recordsPerBatch) eof := false for !eof { var recordsAndContexts *list.List recordsAndContexts, eof = reader.getRecordBatch(linesChannel, recordsPerBatch, context) - //fmt.Fprintf(os.Stderr, "GOT RECORD BATCH OF LENGTH %d\n", recordsAndContexts.Len()) readerChannel <- recordsAndContexts } } -// TODO: comment -func provideChannelizedLines( - lineScanner *bufio.Scanner, - linesChannel chan<- string, - downstreamDoneChannel <-chan bool, // for mlr head - recordsPerBatch int, -) { - i := 0 - done := false - for !done && lineScanner.Scan() { - i++ - - // See if downstream processors will be ignoring further data (e.g. mlr - // head). If so, stop reading. This makes 'mlr head hugefile' exit - // quickly, as it should. 
- if i%recordsPerBatch == 0 { - select { - case _ = <-downstreamDoneChannel: - done = true - break - default: - break - } - if done { - break - } - } - - linesChannel <- lineScanner.Text() - } - close(linesChannel) // end-of-stream marker -} - -//// TODO: productionalize this for the case no-head -- if profiling shows it to be worthwhile //// TODO: comment //func provideChannelizedLines( // lineScanner *bufio.Scanner, -// linesChannel chan<- *list.List, +// linesChannel chan<- string, // downstreamDoneChannel <-chan bool, // for mlr head // recordsPerBatch int, //) { // i := 0 // done := false -// -// lines := list.New() -// // for !done && lineScanner.Scan() { // i++ // -// lines.PushBack(lineScanner.Text()) -// // // See if downstream processors will be ignoring further data (e.g. mlr // // head). If so, stop reading. This makes 'mlr head hugefile' exit // // quickly, as it should. @@ -154,21 +115,17 @@ func provideChannelizedLines( // if done { // break // } -// linesChannel <- lines -// lines = list.New() // } // -// //linesChannel <- lineScanner.Text() +// linesChannel <- lineScanner.Text() // } -// linesChannel <- lines // close(linesChannel) // end-of-stream marker //} -//// TODO: productionalize this for the case no-head -- if profiling shows it to be worthwhile //// TODO: comment copiously we're trying to handle slow/fast/short/long //// reads: tail -f, smallfile, bigfile. //func (reader *RecordReaderDKVP) getRecordBatch( -// linesChannel <-chan *list.List, +// linesChannel <-chan string, // maxBatchSize int, // context *types.Context, //) ( @@ -177,24 +134,34 @@ func provideChannelizedLines( //) { // //fmt.Printf("GRB ENTER\n") // recordsAndContexts = list.New() +// eof = false // -// lines, more := <-linesChannel -// if !more { -// return recordsAndContexts, true -// } -// -// for e := lines.Front(); e != nil; e = e.Next() { -// line := e.Value.(string) +// for i := 0; i < maxBatchSize; i++ { +// //fmt.Fprintf(os.Stderr, "-- %d/%d %d/%d\n", i, maxBatchSize, len(linesChannel), cap(linesChannel)) +// if len(linesChannel) == 0 && i > 0 { +// //fmt.Println(" .. BREAK") +// break +// } +// //fmt.Println(" .. B:BLOCK") +// line, more := <-linesChannel +// //fmt.Printf(" .. 
E:BLOCK <<%s>> %v\n", line, more) +// if !more { +// eof = true +// break +// } // // // Check for comments-in-data feature -// if strings.HasPrefix(line, reader.readerOptions.CommentString) { -// if reader.readerOptions.CommentHandling == cli.PassComments { -// recordsAndContexts.PushBack(types.NewOutputStringList(line+"\n", context)) -// continue -// } else if reader.readerOptions.CommentHandling == cli.SkipComments { -// continue +// // TODO: funcptr this away +// if reader.readerOptions.CommentHandling != cli.CommentsAreData { +// if strings.HasPrefix(line, reader.readerOptions.CommentString) { +// if reader.readerOptions.CommentHandling == cli.PassComments { +// recordsAndContexts.PushBack(types.NewOutputStringList(line+"\n", context)) +// continue +// } else if reader.readerOptions.CommentHandling == cli.SkipComments { +// continue +// } +// // else comments are data // } -// // else comments are data // } // // record := reader.recordFromDKVPLine(line) @@ -204,49 +171,84 @@ func provideChannelizedLines( // } // // //fmt.Printf("GRB EXIT\n") -// return recordsAndContexts, false +// return recordsAndContexts, eof //} +// TODO: productionalize this for the case no-head -- if profiling shows it to be worthwhile +// TODO: comment +func provideChannelizedLines( + lineScanner *bufio.Scanner, + linesChannel chan<- *list.List, + downstreamDoneChannel <-chan bool, // for mlr head + recordsPerBatch int, +) { + i := 0 + done := false + + lines := list.New() + + for lineScanner.Scan() { + i++ + + lines.PushBack(lineScanner.Text()) + + // See if downstream processors will be ignoring further data (e.g. mlr + // head). If so, stop reading. This makes 'mlr head hugefile' exit + // quickly, as it should. + if i%recordsPerBatch == 0 { + select { + case _ = <-downstreamDoneChannel: + done = true + break + default: + break + } + if done { + break + } + linesChannel <- lines + lines = list.New() + } + + //linesChannel <- lineScanner.Text() + if done { + break + } + } + linesChannel <- lines + close(linesChannel) // end-of-stream marker +} + +// TODO: productionalize this for the case no-head -- if profiling shows it to be worthwhile // TODO: comment copiously we're trying to handle slow/fast/short/long // reads: tail -f, smallfile, bigfile. func (reader *RecordReaderDKVP) getRecordBatch( - linesChannel <-chan string, + linesChannel <-chan *list.List, maxBatchSize int, context *types.Context, ) ( recordsAndContexts *list.List, eof bool, ) { - //fmt.Printf("GRB ENTER\n") recordsAndContexts = list.New() - eof = false - for i := 0; i < maxBatchSize; i++ { - //fmt.Fprintf(os.Stderr, "-- %d/%d %d/%d\n", i, maxBatchSize, len(linesChannel), cap(linesChannel)) - if len(linesChannel) == 0 && i > 0 { - //fmt.Println(" .. BREAK") - break - } - //fmt.Println(" .. B:BLOCK") - line, more := <-linesChannel - //fmt.Printf(" .. 
E:BLOCK <<%s>> %v\n", line, more) - if !more { - eof = true - break - } + lines, more := <-linesChannel + if !more { + return recordsAndContexts, true + } + + for e := lines.Front(); e != nil; e = e.Next() { + line := e.Value.(string) // Check for comments-in-data feature - // TODO: funcptr this away - if reader.readerOptions.CommentHandling != cli.CommentsAreData { - if strings.HasPrefix(line, reader.readerOptions.CommentString) { - if reader.readerOptions.CommentHandling == cli.PassComments { - recordsAndContexts.PushBack(types.NewOutputStringList(line+"\n", context)) - continue - } else if reader.readerOptions.CommentHandling == cli.SkipComments { - continue - } - // else comments are data + if strings.HasPrefix(line, reader.readerOptions.CommentString) { + if reader.readerOptions.CommentHandling == cli.PassComments { + recordsAndContexts.PushBack(types.NewOutputString(line+"\n", context)) + continue + } else if reader.readerOptions.CommentHandling == cli.SkipComments { + continue } + // else comments are data } record := reader.recordFromDKVPLine(line) @@ -255,8 +257,7 @@ func (reader *RecordReaderDKVP) getRecordBatch( recordsAndContexts.PushBack(recordAndContext) } - //fmt.Printf("GRB EXIT\n") - return recordsAndContexts, eof + return recordsAndContexts, false } func (reader *RecordReaderDKVP) recordFromDKVPLine( diff --git a/internal/pkg/stream/stream.go b/internal/pkg/stream/stream.go index fcf40d63c9..054d667fdc 100644 --- a/internal/pkg/stream/stream.go +++ b/internal/pkg/stream/stream.go @@ -63,7 +63,7 @@ func Stream( } // Set up the reader-to-transformer and transformer-to-writer channels. - readerChannel := make(chan *list.List, 1) // list of *types.RecordAndContext + readerChannel := make(chan *list.List, 2) // list of *types.RecordAndContext writerChannel := make(chan *list.List, 1) // list of *types.RecordAndContext // We're done when a fatal error is registered on input (file not found, @@ -77,25 +77,17 @@ func Stream( // the record-reader which then stops reading input. This is necessary to // get quick response from, for example, mlr head -n 10 on input files with // millions or billions of records. - readerDownstreamDoneChannel := make(chan bool, 0) + readerDownstreamDoneChannel := make(chan bool, 1) // Start the reader, transformer, and writer. Let them run until fatal input // error or end-of-processing happens. 
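	// The topology is: record-reader goroutine, to readerChannel, to the
	// transformer-chain goroutine, to writerChannel, to the channel-writer
	// goroutine; this function then blocks on the error and done channels.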
bufferedOutputStream := bufio.NewWriter(outputStream) - if os.Getenv("MLR_BYPASS_CHAIN") == "true" { - // TODO: comment: for profiling - fmt.Fprintln(os.Stderr, "EXPERIMENTAL CHAIN BYPASS") - go recordReader.Read(fileNames, *initialContext, readerChannel, errorChannel, readerDownstreamDoneChannel) - go output.ChannelWriter(readerChannel, recordWriter, &options.WriterOptions, doneWritingChannel, - bufferedOutputStream, outputIsStdout) - } else { - go recordReader.Read(fileNames, *initialContext, readerChannel, errorChannel, readerDownstreamDoneChannel) - go transformers.ChainTransformer(readerChannel, readerDownstreamDoneChannel, recordTransformers, - writerChannel, options) - go output.ChannelWriter(writerChannel, recordWriter, &options.WriterOptions, doneWritingChannel, - bufferedOutputStream, outputIsStdout) - } + go recordReader.Read(fileNames, *initialContext, readerChannel, errorChannel, readerDownstreamDoneChannel) + go transformers.ChainTransformer(readerChannel, readerDownstreamDoneChannel, recordTransformers, + writerChannel, options) + go output.ChannelWriter(writerChannel, recordWriter, &options.WriterOptions, doneWritingChannel, + bufferedOutputStream, outputIsStdout) done := false for !done { diff --git a/internal/pkg/transformers/aaa_chain_transformer.go b/internal/pkg/transformers/aaa_chain_transformer.go index 947c5eeeb1..81e954e509 100644 --- a/internal/pkg/transformers/aaa_chain_transformer.go +++ b/internal/pkg/transformers/aaa_chain_transformer.go @@ -236,15 +236,15 @@ func runSingleTransformerBatch( done := false for e := inputRecordsAndContexts.Front(); e != nil; e = e.Next() { - recordAndContext := e.Value.(*types.RecordAndContext) + inputRecordAndContext := e.Value.(*types.RecordAndContext) // --nr-progress-mod // TODO: function-pointer this away to reduce instruction count in the // normal case which it isn't used at all. No need to test if {static thing} != 0 // on every record. if options.NRProgressMod != 0 { - if isFirstInChain && recordAndContext.Record != nil { - context := &recordAndContext.Context + if isFirstInChain && inputRecordAndContext.Record != nil { + context := &inputRecordAndContext.Context if context.NR%options.NRProgressMod == 0 { fmt.Fprintf(os.Stderr, "NR=%d FNR=%d FILENAME=%s\n", context.NR, context.FNR, context.FILENAME) } @@ -265,9 +265,9 @@ func runSingleTransformerBatch( // the output channel without involving the record-transformer, since // there is no record to be transformed. - if recordAndContext.EndOfStream == true || recordAndContext.Record != nil { + if inputRecordAndContext.EndOfStream == true || inputRecordAndContext.Record != nil { recordTransformer.Transform( - recordAndContext, + inputRecordAndContext, outputRecordsAndContexts, // TODO: maybe refactor these out of each transformer. // And/or maybe poll them once per batch not once per record. 
@@ -275,13 +275,17 @@ func runSingleTransformerBatch( outputDownstreamDoneChannel, ) } else { - outputRecordsAndContexts.PushBack(recordAndContext) + outputRecordsAndContexts.PushBack(inputRecordAndContext) } - if recordAndContext.EndOfStream { + if inputRecordAndContext.EndOfStream { done = true break } + + if done { + break + } } outputRecordChannel <- outputRecordsAndContexts diff --git a/internal/pkg/transformers/head.go b/internal/pkg/transformers/head.go index 05235f8b70..3b0526fb22 100644 --- a/internal/pkg/transformers/head.go +++ b/internal/pkg/transformers/head.go @@ -159,6 +159,7 @@ func (tr *TransformerHead) transformUnkeyed( // Signify to data producers upstream that we'll ignore further // data, so as far as we're concerned they can stop sending it. See // ChainTransformer. + //TODO: maybe remove: outputRecordsAndContexts.PushBack(types.NewEndOfStreamMarker(&inrecAndContext.Context)) outputDownstreamDoneChannel <- true tr.wroteDownstreamDone = true } From 5666b599a7ddc38561b52cd807527eab6ecc9c60 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Thu, 9 Dec 2021 00:29:28 -0500 Subject: [PATCH 14/28] channelize nidx --- internal/pkg/input/record_reader.go | 43 ++++++ internal/pkg/input/record_reader_dkvp.go | 160 +++-------------------- internal/pkg/input/record_reader_nidx.go | 67 ++++++---- todo.txt | 2 +- 4 files changed, 100 insertions(+), 172 deletions(-) diff --git a/internal/pkg/input/record_reader.go b/internal/pkg/input/record_reader.go index f0a58c4086..8fbe42b537 100644 --- a/internal/pkg/input/record_reader.go +++ b/internal/pkg/input/record_reader.go @@ -77,3 +77,46 @@ func NewLineScanner(handle io.Reader, irs string) *bufio.Scanner { return scanner } + +// TODO: comment +func channelizedLineScanner( + lineScanner *bufio.Scanner, + linesChannel chan<- *list.List, + downstreamDoneChannel <-chan bool, // for mlr head + recordsPerBatch int, +) { + i := 0 + done := false + + lines := list.New() + + for lineScanner.Scan() { + i++ + + lines.PushBack(lineScanner.Text()) + + // See if downstream processors will be ignoring further data (e.g. mlr + // head). If so, stop reading. This makes 'mlr head hugefile' exit + // quickly, as it should. 
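+		// The poll below is non-blocking, via the select's default case, and
+		// it runs once per batch rather than once per line, to keep the
+		// per-line cost down.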
+ if i%recordsPerBatch == 0 { + select { + case _ = <-downstreamDoneChannel: + done = true + break + default: + break + } + if done { + break + } + linesChannel <- lines + lines = list.New() + } + + if done { + break + } + } + linesChannel <- lines + close(linesChannel) // end-of-stream marker +} diff --git a/internal/pkg/input/record_reader_dkvp.go b/internal/pkg/input/record_reader_dkvp.go index bca942c70e..255834cbad 100644 --- a/internal/pkg/input/record_reader_dkvp.go +++ b/internal/pkg/input/record_reader_dkvp.go @@ -1,7 +1,6 @@ package input import ( - "bufio" "container/list" "io" "strconv" @@ -77,151 +76,19 @@ func (reader *RecordReaderDKVP) processHandle( recordsPerBatch := reader.readerOptions.RecordsPerBatch lineScanner := NewLineScanner(handle, reader.readerOptions.IRS) - ////linesChannel := make(chan string, recordsPerBatch) linesChannel := make(chan *list.List, recordsPerBatch) - go provideChannelizedLines(lineScanner, linesChannel, downstreamDoneChannel, recordsPerBatch) + go channelizedLineScanner(lineScanner, linesChannel, downstreamDoneChannel, recordsPerBatch) - eof := false - for !eof { - var recordsAndContexts *list.List - recordsAndContexts, eof = reader.getRecordBatch(linesChannel, recordsPerBatch, context) + for { + recordsAndContexts, eof := reader.getRecordBatch(linesChannel, recordsPerBatch, context) readerChannel <- recordsAndContexts - } -} - -//// TODO: comment -//func provideChannelizedLines( -// lineScanner *bufio.Scanner, -// linesChannel chan<- string, -// downstreamDoneChannel <-chan bool, // for mlr head -// recordsPerBatch int, -//) { -// i := 0 -// done := false -// for !done && lineScanner.Scan() { -// i++ -// -// // See if downstream processors will be ignoring further data (e.g. mlr -// // head). If so, stop reading. This makes 'mlr head hugefile' exit -// // quickly, as it should. -// if i%recordsPerBatch == 0 { -// select { -// case _ = <-downstreamDoneChannel: -// done = true -// break -// default: -// break -// } -// if done { -// break -// } -// } -// -// linesChannel <- lineScanner.Text() -// } -// close(linesChannel) // end-of-stream marker -//} - -//// TODO: comment copiously we're trying to handle slow/fast/short/long -//// reads: tail -f, smallfile, bigfile. -//func (reader *RecordReaderDKVP) getRecordBatch( -// linesChannel <-chan string, -// maxBatchSize int, -// context *types.Context, -//) ( -// recordsAndContexts *list.List, -// eof bool, -//) { -// //fmt.Printf("GRB ENTER\n") -// recordsAndContexts = list.New() -// eof = false -// -// for i := 0; i < maxBatchSize; i++ { -// //fmt.Fprintf(os.Stderr, "-- %d/%d %d/%d\n", i, maxBatchSize, len(linesChannel), cap(linesChannel)) -// if len(linesChannel) == 0 && i > 0 { -// //fmt.Println(" .. BREAK") -// break -// } -// //fmt.Println(" .. B:BLOCK") -// line, more := <-linesChannel -// //fmt.Printf(" .. 
E:BLOCK <<%s>> %v\n", line, more) -// if !more { -// eof = true -// break -// } -// -// // Check for comments-in-data feature -// // TODO: funcptr this away -// if reader.readerOptions.CommentHandling != cli.CommentsAreData { -// if strings.HasPrefix(line, reader.readerOptions.CommentString) { -// if reader.readerOptions.CommentHandling == cli.PassComments { -// recordsAndContexts.PushBack(types.NewOutputStringList(line+"\n", context)) -// continue -// } else if reader.readerOptions.CommentHandling == cli.SkipComments { -// continue -// } -// // else comments are data -// } -// } -// -// record := reader.recordFromDKVPLine(line) -// context.UpdateForInputRecord() -// recordAndContext := types.NewRecordAndContext(record, context) -// recordsAndContexts.PushBack(recordAndContext) -// } -// -// //fmt.Printf("GRB EXIT\n") -// return recordsAndContexts, eof -//} - -// TODO: productionalize this for the case no-head -- if profiling shows it to be worthwhile -// TODO: comment -func provideChannelizedLines( - lineScanner *bufio.Scanner, - linesChannel chan<- *list.List, - downstreamDoneChannel <-chan bool, // for mlr head - recordsPerBatch int, -) { - i := 0 - done := false - - lines := list.New() - - for lineScanner.Scan() { - i++ - - lines.PushBack(lineScanner.Text()) - - // See if downstream processors will be ignoring further data (e.g. mlr - // head). If so, stop reading. This makes 'mlr head hugefile' exit - // quickly, as it should. - if i%recordsPerBatch == 0 { - select { - case _ = <-downstreamDoneChannel: - done = true - break - default: - break - } - if done { - break - } - linesChannel <- lines - lines = list.New() - } - - //linesChannel <- lineScanner.Text() - if done { + if eof { break } } - linesChannel <- lines - close(linesChannel) // end-of-stream marker } -// TODO: productionalize this for the case no-head -- if profiling shows it to be worthwhile -// TODO: comment copiously we're trying to handle slow/fast/short/long -// reads: tail -f, smallfile, bigfile. +// TODO: comment copiously we're trying to handle slow/fast/short/long reads: tail -f, smallfile, bigfile. 
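+// Specifically: each call does one blocking receive of a single list of up
+// to recordsPerBatch lines from the scanner goroutine, then converts that
+// whole list to records, so channel traffic is amortized over the batch. A
+// closed lines-channel signals end of stream.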
func (reader *RecordReaderDKVP) getRecordBatch( linesChannel <-chan *list.List, maxBatchSize int, @@ -241,14 +108,17 @@ func (reader *RecordReaderDKVP) getRecordBatch( line := e.Value.(string) // Check for comments-in-data feature - if strings.HasPrefix(line, reader.readerOptions.CommentString) { - if reader.readerOptions.CommentHandling == cli.PassComments { - recordsAndContexts.PushBack(types.NewOutputString(line+"\n", context)) - continue - } else if reader.readerOptions.CommentHandling == cli.SkipComments { - continue + // TODO: function-pointer this away + if reader.readerOptions.CommentHandling != cli.CommentsAreData { + if strings.HasPrefix(line, reader.readerOptions.CommentString) { + if reader.readerOptions.CommentHandling == cli.PassComments { + recordsAndContexts.PushBack(types.NewOutputString(line+"\n", context)) + continue + } else if reader.readerOptions.CommentHandling == cli.SkipComments { + continue + } + // else comments are data } - // else comments are data } record := reader.recordFromDKVPLine(line) diff --git a/internal/pkg/input/record_reader_nidx.go b/internal/pkg/input/record_reader_nidx.go index 695d443907..b773c5c845 100644 --- a/internal/pkg/input/record_reader_nidx.go +++ b/internal/pkg/input/record_reader_nidx.go @@ -73,47 +73,62 @@ func (reader *RecordReaderNIDX) processHandle( downstreamDoneChannel <-chan bool, // for mlr head ) { context.UpdateForStartOfFile(filename) + recordsPerBatch := reader.readerOptions.RecordsPerBatch - scanner := NewLineScanner(handle, reader.readerOptions.IRS) + lineScanner := NewLineScanner(handle, reader.readerOptions.IRS) + linesChannel := make(chan *list.List, recordsPerBatch) + go channelizedLineScanner(lineScanner, linesChannel, downstreamDoneChannel, recordsPerBatch) - for scanner.Scan() { - - // See if downstream processors will be ignoring further data (e.g. mlr - // head). If so, stop reading. This makes 'mlr head hugefile' exit - // quickly, as it should. - eof := false - select { - case _ = <-downstreamDoneChannel: - eof = true - break - default: - break - } + for { + recordsAndContexts, eof := reader.getRecordBatch(linesChannel, recordsPerBatch, context) + readerChannel <- recordsAndContexts if eof { break } + } +} - line := scanner.Text() +// TODO: comment copiously we're trying to handle slow/fast/short/long reads: tail -f, smallfile, bigfile. 
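+// Same contract as the DKVP batch-getter: one blocking receive per call,
+// with a closed lines-channel as the end-of-stream signal.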
+func (reader *RecordReaderNIDX) getRecordBatch(
+	linesChannel <-chan *list.List,
+	maxBatchSize int,
+	context *types.Context,
+) (
+	recordsAndContexts *list.List,
+	eof bool,
+) {
+	recordsAndContexts = list.New()
+
+	lines, more := <-linesChannel
+	if !more {
+		return recordsAndContexts, true
+	}
+
+	for e := lines.Front(); e != nil; e = e.Next() {
+		line := e.Value.(string)
 
 		// Check for comments-in-data feature
-		if strings.HasPrefix(line, reader.readerOptions.CommentString) {
-			if reader.readerOptions.CommentHandling == cli.PassComments {
-				readerChannel <- types.NewOutputStringList(line+"\n", context)
-				continue
-			} else if reader.readerOptions.CommentHandling == cli.SkipComments {
-				continue
+		// TODO: function-pointer this away
+		if reader.readerOptions.CommentHandling != cli.CommentsAreData {
+			if strings.HasPrefix(line, reader.readerOptions.CommentString) {
+				if reader.readerOptions.CommentHandling == cli.PassComments {
+					recordsAndContexts.PushBack(types.NewOutputString(line+"\n", context))
+					continue
+				} else if reader.readerOptions.CommentHandling == cli.SkipComments {
+					continue
+				}
+				// else comments are data
 			}
-			// else comments are data
 		}
 
 		record := reader.recordFromNIDXLine(line)
 		context.UpdateForInputRecord()
-		readerChannel <- types.NewRecordAndContextList(
-			record,
-			context,
-		)
+		recordAndContext := types.NewRecordAndContext(record, context)
+		recordsAndContexts.PushBack(recordAndContext)
 	}
+
+	return recordsAndContexts, false
 }
 
 // ----------------------------------------------------------------
diff --git a/todo.txt b/todo.txt
index 9ac01a4cb0..f144c4258e 100644
--- a/todo.txt
+++ b/todo.txt
@@ -38,7 +38,7 @@ PUNCHDOWN LIST
     - reads
   > everything else - reads and writes
-  ! fix mlr head -n 1
+  a remove NewOutputStringList and NewRecordAndContextList
   ? outputChannel -> *list.List at each transformer -- ? profile first
   ? readerChannel length 1 or 2 ?
   ? experiment again with hashed/unhashed -- mlr sort etc
From 79bc7fdc20943b862ea29f863aaffb3b3d3e6b6a Mon Sep 17 00:00:00 2001
From: John Kerl
Date: Thu, 9 Dec 2021 00:33:59 -0500
Subject: [PATCH 15/28] cmd/mprof5

---
 cmd/mprof5/main.go | 84 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 cmd/mprof5/main.go

diff --git a/cmd/mprof5/main.go b/cmd/mprof5/main.go
new file mode 100644
index 0000000000..5a6dba8b7a
--- /dev/null
+++ b/cmd/mprof5/main.go
@@ -0,0 +1,84 @@
+// Experiments in performance/profiling.
+package main
+
+import (
+	"fmt"
+	"os"
+	"runtime"
+	"runtime/debug"
+	"runtime/pprof"
+	"strconv"
+
+	"github.com/pkg/profile" // for trace.out
+
+	"github.com/johnkerl/miller/internal/pkg/cli"
+	"github.com/johnkerl/miller/internal/pkg/stream"
+	"github.com/johnkerl/miller/internal/pkg/transformers"
+	"github.com/johnkerl/miller/internal/pkg/types"
+)
+
+func main() {
+
+	// Respect env $GOMAXPROCS, if provided, else set default.
+	haveSetGoMaxProcs := false
+	goMaxProcsString := os.Getenv("GOMAXPROCS")
+	if goMaxProcsString != "" {
+		goMaxProcs, err := strconv.Atoi(goMaxProcsString)
+		if err == nil {
+			runtime.GOMAXPROCS(goMaxProcs)
+			haveSetGoMaxProcs = true
+		}
+	}
+	if !haveSetGoMaxProcs {
+		// As of Go 1.16 this is the default anyway. For 1.15 and below we need
+		// to explicitly set this.
+ runtime.GOMAXPROCS(runtime.NumCPU()) + } + + debug.SetGCPercent(500) // Empirical: See README-profiling.md + + if os.Getenv("MPROF_PPROF") != "" { + // profiling with cpu.pprof and go tool pprof -http=:8080 cpu.pprof + profFilename := "cpu.pprof" + handle, err := os.Create(profFilename) + if err != nil { + fmt.Fprintln(os.Stderr, os.Args[0], ": ", "Could not start CPU profile: ", err) + return + } + defer handle.Close() + + if err := pprof.StartCPUProfile(handle); err != nil { + fmt.Fprintln(os.Stderr, os.Args[0], ": ", "Could not start CPU profile: ", err) + return + } + defer pprof.StopCPUProfile() + + fmt.Fprintf(os.Stderr, "CPU profile started.\n") + fmt.Fprintf(os.Stderr, "go tool pprof -http=:8080 cpu.pprof\n") + defer fmt.Fprintf(os.Stderr, "CPU profile finished.\n") + } + + if os.Getenv("MPROF_TRACE") != "" { + // tracing with trace.out and go tool trace trace.out + fmt.Fprintf(os.Stderr, "go tool trace trace.out\n") + defer profile.Start(profile.TraceProfile, profile.ProfilePath(".")).Stop() + } + + options := cli.DefaultOptions() + types.SetInferrerStringOnly() + + cat, err := transformers.NewTransformerCat(false, "", nil) + if err != nil { + fmt.Fprintf(os.Stderr, "mprof5: %v\n", err) + os.Exit(1) + } + xforms := []transformers.IRecordTransformer{cat} + + filenames := os.Args[1:] + + err = stream.Stream(filenames, options, xforms, os.Stdout, true) + if err != nil { + fmt.Fprintf(os.Stderr, "mlr: %v.\n", err) + os.Exit(1) + } +} From d183d1191c9aa1cd73d720496bd38c957707ad0f Mon Sep 17 00:00:00 2001 From: John Kerl Date: Thu, 9 Dec 2021 01:36:41 -0500 Subject: [PATCH 16/28] channelize CSV reader --- internal/pkg/input/record_reader_csv.go | 162 ++++++++++++++++-------- 1 file changed, 108 insertions(+), 54 deletions(-) diff --git a/internal/pkg/input/record_reader_csv.go b/internal/pkg/input/record_reader_csv.go index 43cbb96885..aee3ef74ac 100644 --- a/internal/pkg/input/record_reader_csv.go +++ b/internal/pkg/input/record_reader_csv.go @@ -20,6 +20,11 @@ type RecordReaderCSV struct { readerOptions *cli.TReaderOptions recordsPerBatch int ifs0 byte // Go's CSV library only lets its 'Comma' be a single character + + filename string + rowNumber int + needHeader bool + header []string } // ---------------------------------------------------------------- @@ -79,7 +84,6 @@ func (reader *RecordReaderCSV) Read( readerChannel <- types.NewEndOfStreamMarkerList(&context) } -// ---------------------------------------------------------------- func (reader *RecordReaderCSV) processHandle( handle io.Reader, filename string, @@ -89,53 +93,44 @@ func (reader *RecordReaderCSV) processHandle( downstreamDoneChannel <-chan bool, // for mlr head ) { context.UpdateForStartOfFile(filename) - needHeader := !reader.readerOptions.UseImplicitCSVHeader - var header []string = nil - var rowNumber int = 0 + recordsPerBatch := reader.readerOptions.RecordsPerBatch + + // Reset state for start of next input file + reader.filename = filename + reader.rowNumber = 0 + reader.needHeader = !reader.readerOptions.UseImplicitCSVHeader + reader.header = nil csvReader := csv.NewReader(NewBOMStrippingReader(handle)) csvReader.Comma = rune(reader.ifs0) + csvRecordsChannel := make(chan *list.List, recordsPerBatch) + go channelizedCSVRecordScanner(csvReader, csvRecordsChannel, downstreamDoneChannel, errorChannel, + recordsPerBatch) - eof := false for { - - // See if downstream processors will be ignoring further data (e.g. mlr - // head). If so, stop reading. This makes 'mlr head hugefile' exit - // quickly, as it should. 
- select { - case _ = <-downstreamDoneChannel: - eof = true - break - default: - break - } + recordsAndContexts, eof := reader.getRecordBatch(csvRecordsChannel, errorChannel, recordsPerBatch, context) + readerChannel <- recordsAndContexts if eof { break } + } +} - if needHeader { - // TODO: make this a helper function - csvRecord, err := csvReader.Read() - if lib.IsEOF(err) { - break - } - if err != nil && csvRecord == nil { - // See https://golang.org/pkg/encoding/csv. - // We handle field-count ourselves. - errorChannel <- err - return - } - - isData := reader.maybeConsumeComment(csvRecord, context, readerChannel) - if !isData { - continue - } +// TODO: comment +func channelizedCSVRecordScanner( + csvReader *csv.Reader, + csvRecordsChannel chan<- *list.List, + downstreamDoneChannel <-chan bool, // for mlr head + errorChannel chan error, + recordsPerBatch int, +) { + i := 0 + done := false - header = csvRecord - rowNumber++ + csvRecords := list.New() - needHeader = false - } + for { + i++ csvRecord, err := csvReader.Read() if lib.IsEOF(err) { @@ -145,31 +140,91 @@ func (reader *RecordReaderCSV) processHandle( // See https://golang.org/pkg/encoding/csv. // We handle field-count ourselves. errorChannel <- err - return + break + } + + csvRecords.PushBack(csvRecord) + + // See if downstream processors will be ignoring further data (e.g. mlr + // head). If so, stop reading. This makes 'mlr head hugefile' exit + // quickly, as it should. + if i%recordsPerBatch == 0 { + select { + case _ = <-downstreamDoneChannel: + done = true + break + default: + break + } + if done { + break + } + csvRecordsChannel <- csvRecords + csvRecords = list.New() + } + + if done { + break + } + } + csvRecordsChannel <- csvRecords + close(csvRecordsChannel) // end-of-stream marker +} + +// TODO: comment copiously we're trying to handle slow/fast/short/long reads: tail -f, smallfile, bigfile. 
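+//
+// A minimal sketch of the intended call pattern -- names as in processHandle
+// above, not an exported API. One batch is returned per call, with the final
+// (possibly empty) batch flagged eof=true once the scanner goroutine has
+// closed csvRecordsChannel:
+//
+//	for {
+//		recordsAndContexts, eof := reader.getRecordBatch(csvRecordsChannel, errorChannel, recordsPerBatch, context)
+//		readerChannel <- recordsAndContexts
+//		if eof {
+//			break
+//		}
+//	}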
+func (reader *RecordReaderCSV) getRecordBatch( + csvRecordsChannel <-chan *list.List, + errorChannel chan error, + maxBatchSize int, + context *types.Context, +) ( + recordsAndContexts *list.List, + eof bool, +) { + recordsAndContexts = list.New() + + csvRecords, more := <-csvRecordsChannel + if !more { + return recordsAndContexts, true + } + + for e := csvRecords.Front(); e != nil; e = e.Next() { + csvRecord := e.Value.([]string) + + if reader.needHeader { + isData := reader.maybeConsumeComment(csvRecord, context, recordsAndContexts) + if !isData { + continue + } + + reader.header = csvRecord + reader.rowNumber++ + reader.needHeader = false + continue } - rowNumber++ - isData := reader.maybeConsumeComment(csvRecord, context, readerChannel) + isData := reader.maybeConsumeComment(csvRecord, context, recordsAndContexts) if !isData { continue } + reader.rowNumber++ - if header == nil { // implicit CSV header + if reader.header == nil { // implicit CSV header n := len(csvRecord) - header = make([]string, n) + reader.header = make([]string, n) for i := 0; i < n; i++ { - header[i] = strconv.Itoa(i + 1) + reader.header[i] = strconv.Itoa(i + 1) } } record := types.NewMlrmapAsRecord() - nh := len(header) + nh := len(reader.header) nd := len(csvRecord) if nh == nd { for i := 0; i < nh; i++ { - key := header[i] + key := reader.header[i] value := types.MlrvalFromInferredTypeForDataFiles(csvRecord[i]) record.PutReference(key, value) } @@ -180,7 +235,7 @@ func (reader *RecordReaderCSV) processHandle( fmt.Sprintf( "mlr: CSV header/data length mismatch %d != %d "+ "at filename %s row %d.\n", - nh, nd, filename, rowNumber, + nh, nd, reader.filename, reader.rowNumber, ), ) errorChannel <- err @@ -189,7 +244,7 @@ func (reader *RecordReaderCSV) processHandle( i := 0 n := lib.IntMin2(nh, nd) for i = 0; i < n; i++ { - key := header[i] + key := reader.header[i] value := types.MlrvalFromInferredTypeForDataFiles(csvRecord[i]) record.PutReference(key, value) } @@ -202,7 +257,7 @@ func (reader *RecordReaderCSV) processHandle( if nh > nd { // if header longer than data: use "" values for i = nd; i < nh; i++ { - record.PutCopy(header[i], types.MLRVAL_VOID) + record.PutCopy(reader.header[i], types.MLRVAL_VOID) } } } @@ -210,11 +265,10 @@ func (reader *RecordReaderCSV) processHandle( context.UpdateForInputRecord() - readerChannel <- types.NewRecordAndContextList( - record, - context, - ) + recordsAndContexts.PushBack(types.NewRecordAndContext(record, context)) } + + return recordsAndContexts, false } // maybeConsumeComment returns true if the CSV record should be processed as @@ -222,7 +276,7 @@ func (reader *RecordReaderCSV) processHandle( func (reader *RecordReaderCSV) maybeConsumeComment( csvRecord []string, context *types.Context, - readerChannel chan<- *list.List, // list of *types.RecordAndContext + recordsAndContexts *list.List, // list of *types.RecordAndContext ) bool { if reader.readerOptions.CommentHandling == cli.CommentsAreData { // Nothing is to be construed as a comment @@ -255,7 +309,7 @@ func (reader *RecordReaderCSV) maybeConsumeComment( csvWriter.Comma = rune(reader.ifs0) csvWriter.Write(csvRecord) csvWriter.Flush() - readerChannel <- types.NewOutputStringList(buffer.String(), context) + recordsAndContexts.PushBack(types.NewOutputString(buffer.String(), context)) } else /* reader.readerOptions.CommentHandling == cli.SkipComments */ { // discard entirely } From 2618d63e6a1a51b796cc13fb70b9f1e7a4971ca7 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Fri, 10 Dec 2021 21:43:42 -0500 Subject: [PATCH 17/28] 
channelize NIDX reader --- internal/pkg/input/record_reader_nidx.go | 7 +++---- internal/pkg/input/record_reader_xtab.go | 6 +++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/internal/pkg/input/record_reader_nidx.go b/internal/pkg/input/record_reader_nidx.go index b773c5c845..3aade91943 100644 --- a/internal/pkg/input/record_reader_nidx.go +++ b/internal/pkg/input/record_reader_nidx.go @@ -68,8 +68,8 @@ func (reader *RecordReaderNIDX) processHandle( handle io.Reader, filename string, context *types.Context, - readerChannel chan<- *list.List, // list of *types.RecordAndContext - errorChannel chan error, + readerChannel chan<- *list.List, + errorChannel chan<- error, downstreamDoneChannel <-chan bool, // for mlr head ) { context.UpdateForStartOfFile(filename) @@ -122,7 +122,6 @@ func (reader *RecordReaderNIDX) getRecordBatch( } record := reader.recordFromNIDXLine(line) - context.UpdateForInputRecord() recordAndContext := types.NewRecordAndContext(record, context) recordsAndContexts.PushBack(recordAndContext) @@ -131,7 +130,7 @@ func (reader *RecordReaderNIDX) getRecordBatch( return recordsAndContexts, false } -// ---------------------------------------------------------------- + func (reader *RecordReaderNIDX) recordFromNIDXLine( line string, ) *types.Mlrmap { diff --git a/internal/pkg/input/record_reader_xtab.go b/internal/pkg/input/record_reader_xtab.go index 66c1d7a43c..6b0f629d65 100644 --- a/internal/pkg/input/record_reader_xtab.go +++ b/internal/pkg/input/record_reader_xtab.go @@ -77,7 +77,7 @@ func (reader *RecordReaderXTAB) processHandle( ) { context.UpdateForStartOfFile(filename) - scanner := NewLineScanner(handle, reader.readerOptions.IFS) + lineScanner := NewLineScanner(handle, reader.readerOptions.IFS) linesForRecord := list.New() @@ -98,7 +98,7 @@ func (reader *RecordReaderXTAB) processHandle( break } - if !scanner.Scan() { + if !lineScanner.Scan() { if linesForRecord.Len() > 0 { record, err := reader.recordFromXTABLines(linesForRecord) @@ -114,7 +114,7 @@ func (reader *RecordReaderXTAB) processHandle( break } - line := scanner.Text() + line := lineScanner.Text() // Check for comments-in-data feature if strings.HasPrefix(line, reader.readerOptions.CommentString) { From 4df477f73480937c552844dee052fdecf81981e6 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Fri, 10 Dec 2021 22:02:20 -0500 Subject: [PATCH 18/28] Dedupe DKVP-reader and NIDX-reader source files --- internal/pkg/input/record_reader_csvlite.go | 19 +-- ...der_dkvp.go => record_reader_dkvp_nidx.go} | 62 +++++-- internal/pkg/input/record_reader_nidx.go | 157 ------------------ 3 files changed, 58 insertions(+), 180 deletions(-) rename internal/pkg/input/{record_reader_dkvp.go => record_reader_dkvp_nidx.go} (72%) delete mode 100644 internal/pkg/input/record_reader_nidx.go diff --git a/internal/pkg/input/record_reader_csvlite.go b/internal/pkg/input/record_reader_csvlite.go index e75966948b..b8150aee2f 100644 --- a/internal/pkg/input/record_reader_csvlite.go +++ b/internal/pkg/input/record_reader_csvlite.go @@ -31,13 +31,11 @@ import ( "github.com/johnkerl/miller/internal/pkg/types" ) -// ---------------------------------------------------------------- type RecordReaderCSVLite struct { readerOptions *cli.TReaderOptions recordsPerBatch int } -// ---------------------------------------------------------------- func NewRecordReaderCSVLite( readerOptions *cli.TReaderOptions, recordsPerBatch int, @@ -48,7 +46,6 @@ func NewRecordReaderCSVLite( }, nil } -// 
---------------------------------------------------------------- func NewRecordReaderPPRINT( readerOptions *cli.TReaderOptions, recordsPerBatch int, @@ -59,7 +56,6 @@ func NewRecordReaderPPRINT( }, nil } -// ---------------------------------------------------------------- func (reader *RecordReaderCSVLite) Read( filenames []string, context types.Context, @@ -134,7 +130,6 @@ func (reader *RecordReaderCSVLite) Read( readerChannel <- types.NewEndOfStreamMarkerList(&context) } -// ---------------------------------------------------------------- func (reader *RecordReaderCSVLite) processHandleExplicitCSVHeader( handle io.Reader, filename string, @@ -148,8 +143,8 @@ func (reader *RecordReaderCSVLite) processHandleExplicitCSVHeader( context.UpdateForStartOfFile(filename) - scanner := NewLineScanner(handle, reader.readerOptions.IRS) - for scanner.Scan() { + lineScanner := NewLineScanner(handle, reader.readerOptions.IRS) + for lineScanner.Scan() { // See if downstream processors will be ignoring further data (e.g. mlr // head). If so, stop reading. This makes 'mlr head hugefile' exit @@ -166,7 +161,7 @@ func (reader *RecordReaderCSVLite) processHandleExplicitCSVHeader( break } - line := scanner.Text() + line := lineScanner.Text() inputLineNumber++ @@ -261,7 +256,6 @@ func (reader *RecordReaderCSVLite) processHandleExplicitCSVHeader( } } -// ---------------------------------------------------------------- func (reader *RecordReaderCSVLite) processHandleImplicitCSVHeader( handle io.Reader, filename string, @@ -275,8 +269,8 @@ func (reader *RecordReaderCSVLite) processHandleImplicitCSVHeader( context.UpdateForStartOfFile(filename) - scanner := NewLineScanner(handle, reader.readerOptions.IRS) - for scanner.Scan() { + lineScanner := NewLineScanner(handle, reader.readerOptions.IRS) + for lineScanner.Scan() { // See if downstream processors will be ignoring further data (e.g. mlr // head). If so, stop reading. This makes 'mlr head hugefile' exit @@ -295,8 +289,7 @@ func (reader *RecordReaderCSVLite) processHandleImplicitCSVHeader( break } - // TODO: IRS - line := scanner.Text() + line := lineScanner.Text() inputLineNumber++ diff --git a/internal/pkg/input/record_reader_dkvp.go b/internal/pkg/input/record_reader_dkvp_nidx.go similarity index 72% rename from internal/pkg/input/record_reader_dkvp.go rename to internal/pkg/input/record_reader_dkvp_nidx.go index 255834cbad..3672dedbd2 100644 --- a/internal/pkg/input/record_reader_dkvp.go +++ b/internal/pkg/input/record_reader_dkvp_nidx.go @@ -1,3 +1,5 @@ +// This is mostly-identical code for the DKVP and NIDX record-readers. + package input import ( @@ -11,22 +13,39 @@ import ( "github.com/johnkerl/miller/internal/pkg/types" ) -type RecordReaderDKVP struct { +// splitter_DKVP_NIDX is a function type for the one bit of code differing +// between the DKVP reader and the NIDX reader, namely, how it splits lines. 
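+//
+// A minimal sketch of the two behaviors this type abstracts over, assuming
+// default separators (IFS "," and IPS "=" for DKVP; IFS " " for NIDX):
+//
+//	recordFromDKVPLine(reader, "a=1,b=2") // record with fields a=1, b=2
+//	recordFromNIDXLine(reader, "10 20")   // record with fields 1=10, 2=20
+//
+// NIDX keys are positional 1-up indices, as recordFromNIDXLine below shows.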
+type splitter_DKVP_NIDX func (reader *RecordReaderDKVPNIDX, line string) *types.Mlrmap + +type RecordReaderDKVPNIDX struct { readerOptions *cli.TReaderOptions recordsPerBatch int + splitter splitter_DKVP_NIDX } func NewRecordReaderDKVP( readerOptions *cli.TReaderOptions, recordsPerBatch int, -) (*RecordReaderDKVP, error) { - return &RecordReaderDKVP{ +) (*RecordReaderDKVPNIDX, error) { + return &RecordReaderDKVPNIDX{ readerOptions: readerOptions, recordsPerBatch: recordsPerBatch, + splitter: recordFromDKVPLine, }, nil } -func (reader *RecordReaderDKVP) Read( +func NewRecordReaderNIDX( + readerOptions *cli.TReaderOptions, + recordsPerBatch int, +) (*RecordReaderDKVPNIDX, error) { + return &RecordReaderDKVPNIDX{ + readerOptions: readerOptions, + recordsPerBatch: recordsPerBatch, + splitter: recordFromNIDXLine, + }, nil +} + +func (reader *RecordReaderDKVPNIDX) Read( filenames []string, context types.Context, readerChannel chan<- *list.List, // list of *types.RecordAndContext @@ -64,7 +83,7 @@ func (reader *RecordReaderDKVP) Read( readerChannel <- types.NewEndOfStreamMarkerList(&context) } -func (reader *RecordReaderDKVP) processHandle( +func (reader *RecordReaderDKVPNIDX) processHandle( handle io.Reader, filename string, context *types.Context, @@ -89,7 +108,7 @@ func (reader *RecordReaderDKVP) processHandle( } // TODO: comment copiously we're trying to handle slow/fast/short/long reads: tail -f, smallfile, bigfile. -func (reader *RecordReaderDKVP) getRecordBatch( +func (reader *RecordReaderDKVPNIDX) getRecordBatch( linesChannel <-chan *list.List, maxBatchSize int, context *types.Context, @@ -121,7 +140,7 @@ func (reader *RecordReaderDKVP) getRecordBatch( } } - record := reader.recordFromDKVPLine(line) + record := reader.splitter(reader, line) context.UpdateForInputRecord() recordAndContext := types.NewRecordAndContext(record, context) recordsAndContexts.PushBack(recordAndContext) @@ -130,9 +149,7 @@ func (reader *RecordReaderDKVP) getRecordBatch( return recordsAndContexts, false } -func (reader *RecordReaderDKVP) recordFromDKVPLine( - line string, -) *types.Mlrmap { +func recordFromDKVPLine(reader *RecordReaderDKVPNIDX, line string) *types.Mlrmap { record := types.NewMlrmapAsRecord() var pairs []string @@ -168,3 +185,28 @@ func (reader *RecordReaderDKVP) recordFromDKVPLine( } return record } + +func recordFromNIDXLine(reader *RecordReaderDKVPNIDX, line string) *types.Mlrmap { + record := types.NewMlrmapAsRecord() + + var values []string + // TODO: func-pointer this away + if reader.readerOptions.IFSRegex == nil { // e.g. 
--no-ifs-regex + values = lib.SplitString(line, reader.readerOptions.IFS) + } else { + values = lib.RegexSplitString(reader.readerOptions.IFSRegex, line, -1) + } + + if reader.readerOptions.AllowRepeatIFS { + values = lib.StripEmpties(values) // left/right trim + } + + var i int = 0 + for _, value := range values { + i++ + key := strconv.Itoa(i) + mval := types.MlrvalFromInferredTypeForDataFiles(value) + record.PutReference(key, mval) + } + return record +} diff --git a/internal/pkg/input/record_reader_nidx.go b/internal/pkg/input/record_reader_nidx.go deleted file mode 100644 index 3aade91943..0000000000 --- a/internal/pkg/input/record_reader_nidx.go +++ /dev/null @@ -1,157 +0,0 @@ -package input - -import ( - "container/list" - "io" - "strconv" - "strings" - - "github.com/johnkerl/miller/internal/pkg/cli" - "github.com/johnkerl/miller/internal/pkg/lib" - "github.com/johnkerl/miller/internal/pkg/types" -) - -type RecordReaderNIDX struct { - readerOptions *cli.TReaderOptions - recordsPerBatch int -} - -func NewRecordReaderNIDX( - readerOptions *cli.TReaderOptions, - recordsPerBatch int, -) (*RecordReaderNIDX, error) { - return &RecordReaderNIDX{ - readerOptions: readerOptions, - recordsPerBatch: recordsPerBatch, - }, nil -} - -func (reader *RecordReaderNIDX) Read( - filenames []string, - context types.Context, - readerChannel chan<- *list.List, // list of *types.RecordAndContext - errorChannel chan error, - downstreamDoneChannel <-chan bool, // for mlr head -) { - if filenames != nil { // nil for mlr -n - if len(filenames) == 0 { // read from stdin - handle, err := lib.OpenStdin( - reader.readerOptions.Prepipe, - reader.readerOptions.PrepipeIsRaw, - reader.readerOptions.FileInputEncoding, - ) - if err != nil { - errorChannel <- err - } - reader.processHandle(handle, "(stdin)", &context, readerChannel, errorChannel, downstreamDoneChannel) - } else { - for _, filename := range filenames { - handle, err := lib.OpenFileForRead( - filename, - reader.readerOptions.Prepipe, - reader.readerOptions.PrepipeIsRaw, - reader.readerOptions.FileInputEncoding, - ) - if err != nil { - errorChannel <- err - } else { - reader.processHandle(handle, filename, &context, readerChannel, errorChannel, downstreamDoneChannel) - handle.Close() - } - } - } - } - readerChannel <- types.NewEndOfStreamMarkerList(&context) -} - -func (reader *RecordReaderNIDX) processHandle( - handle io.Reader, - filename string, - context *types.Context, - readerChannel chan<- *list.List, - errorChannel chan<- error, - downstreamDoneChannel <-chan bool, // for mlr head -) { - context.UpdateForStartOfFile(filename) - recordsPerBatch := reader.readerOptions.RecordsPerBatch - - lineScanner := NewLineScanner(handle, reader.readerOptions.IRS) - linesChannel := make(chan *list.List, recordsPerBatch) - go channelizedLineScanner(lineScanner, linesChannel, downstreamDoneChannel, recordsPerBatch) - - for { - recordsAndContexts, eof := reader.getRecordBatch(linesChannel, recordsPerBatch, context) - readerChannel <- recordsAndContexts - if eof { - break - } - } -} - -// TODO: comment copiously we're trying to handle slow/fast/short/long reads: tail -f, smallfile, bigfile. 
-func (reader *RecordReaderNIDX) getRecordBatch( - linesChannel <-chan *list.List, - maxBatchSize int, - context *types.Context, -) ( - recordsAndContexts *list.List, - eof bool, -) { - recordsAndContexts = list.New() - - lines, more := <-linesChannel - if !more { - return recordsAndContexts, true - } - - for e := lines.Front(); e != nil; e = e.Next() { - line := e.Value.(string) - - // Check for comments-in-data feature - // TODO: function-pointer this away - if reader.readerOptions.CommentHandling != cli.CommentsAreData { - if strings.HasPrefix(line, reader.readerOptions.CommentString) { - if reader.readerOptions.CommentHandling == cli.PassComments { - recordsAndContexts.PushBack(types.NewOutputString(line+"\n", context)) - continue - } else if reader.readerOptions.CommentHandling == cli.SkipComments { - continue - } - // else comments are data - } - } - - record := reader.recordFromNIDXLine(line) - context.UpdateForInputRecord() - recordAndContext := types.NewRecordAndContext(record, context) - recordsAndContexts.PushBack(recordAndContext) - } - - return recordsAndContexts, false -} - - -func (reader *RecordReaderNIDX) recordFromNIDXLine( - line string, -) *types.Mlrmap { - record := types.NewMlrmapAsRecord() - - var values []string - if reader.readerOptions.IFSRegex == nil { // e.g. --no-ifs-regex - values = lib.SplitString(line, reader.readerOptions.IFS) - } else { - values = lib.RegexSplitString(reader.readerOptions.IFSRegex, line, -1) - } - if reader.readerOptions.AllowRepeatIFS { - values = lib.StripEmpties(values) // left/right trim - } - - var i int = 0 - for _, value := range values { - i++ - key := strconv.Itoa(i) - mval := types.MlrvalFromInferredTypeForDataFiles(value) - record.PutReference(key, mval) - } - return record -} From be3286f6064376f5087f6b496c8b67a3966a69b7 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Fri, 10 Dec 2021 22:45:32 -0500 Subject: [PATCH 19/28] channelize CSV-lite reader --- internal/pkg/input/record_reader_csv.go | 3 +- internal/pkg/input/record_reader_csvlite.go | 274 +++++++++--------- internal/pkg/input/record_reader_dkvp_nidx.go | 3 +- todo.txt | 1 + 4 files changed, 142 insertions(+), 139 deletions(-) diff --git a/internal/pkg/input/record_reader_csv.go b/internal/pkg/input/record_reader_csv.go index aee3ef74ac..1c4bbf5d40 100644 --- a/internal/pkg/input/record_reader_csv.go +++ b/internal/pkg/input/record_reader_csv.go @@ -108,7 +108,7 @@ func (reader *RecordReaderCSV) processHandle( recordsPerBatch) for { - recordsAndContexts, eof := reader.getRecordBatch(csvRecordsChannel, errorChannel, recordsPerBatch, context) + recordsAndContexts, eof := reader.getRecordBatch(csvRecordsChannel, errorChannel, context) readerChannel <- recordsAndContexts if eof { break @@ -175,7 +175,6 @@ func channelizedCSVRecordScanner( func (reader *RecordReaderCSV) getRecordBatch( csvRecordsChannel <-chan *list.List, errorChannel chan error, - maxBatchSize int, context *types.Context, ) ( recordsAndContexts *list.List, diff --git a/internal/pkg/input/record_reader_csvlite.go b/internal/pkg/input/record_reader_csvlite.go index b8150aee2f..b32e591614 100644 --- a/internal/pkg/input/record_reader_csvlite.go +++ b/internal/pkg/input/record_reader_csvlite.go @@ -31,19 +31,44 @@ import ( "github.com/johnkerl/miller/internal/pkg/types" ) +// recordBatchGetterCSV points to either an explicit-CSV-header or +// implicit-CSV-header record-batch getter. 
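+// The binding is chosen once, in the constructor, from
+// readerOptions.UseImplicitCSVHeader -- see NewRecordReaderCSVLite below --
+// so the per-batch read path doesn't have to re-branch on header mode.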
+type recordBatchGetterCSV func( + reader *RecordReaderCSVLite, + linesChannel <-chan *list.List, + filename string, + context *types.Context, + errorChannel chan error, +) ( + recordsAndContexts *list.List, + eof bool, +) + type RecordReaderCSVLite struct { readerOptions *cli.TReaderOptions recordsPerBatch int + + recordBatchGetter recordBatchGetterCSV + + inputLineNumber int + headerStrings []string } func NewRecordReaderCSVLite( readerOptions *cli.TReaderOptions, recordsPerBatch int, ) (*RecordReaderCSVLite, error) { - return &RecordReaderCSVLite{ + reader := &RecordReaderCSVLite{ readerOptions: readerOptions, recordsPerBatch: recordsPerBatch, - }, nil + } + if reader.readerOptions.UseImplicitCSVHeader { + reader.recordBatchGetter = getRecordBatchImplicitCSVHeader + } else { + reader.recordBatchGetter = getRecordBatchExplicitCSVHeader + } + + return reader, nil } func NewRecordReaderPPRINT( @@ -72,26 +97,16 @@ func (reader *RecordReaderCSVLite) Read( ) if err != nil { errorChannel <- err + return } - if reader.readerOptions.UseImplicitCSVHeader { - reader.processHandleImplicitCSVHeader( - handle, - "(stdin)", - &context, - readerChannel, - errorChannel, - downstreamDoneChannel, - ) - } else { - reader.processHandleExplicitCSVHeader( - handle, - "(stdin)", - &context, - readerChannel, - errorChannel, - downstreamDoneChannel, - ) - } + reader.processHandle( + handle, + "(stdin)", + &context, + readerChannel, + errorChannel, + downstreamDoneChannel, + ) } else { for _, filename := range filenames { handle, err := lib.OpenFileForRead( @@ -102,35 +117,24 @@ func (reader *RecordReaderCSVLite) Read( ) if err != nil { errorChannel <- err - } else { - if reader.readerOptions.UseImplicitCSVHeader { - reader.processHandleImplicitCSVHeader( - handle, - filename, - &context, - readerChannel, - errorChannel, - downstreamDoneChannel, - ) - } else { - reader.processHandleExplicitCSVHeader( - handle, - filename, - &context, - readerChannel, - errorChannel, - downstreamDoneChannel, - ) - } - handle.Close() + return } + reader.processHandle( + handle, + filename, + &context, + readerChannel, + errorChannel, + downstreamDoneChannel, + ) + handle.Close() } } } readerChannel <- types.NewEndOfStreamMarkerList(&context) } -func (reader *RecordReaderCSVLite) processHandleExplicitCSVHeader( +func (reader *RecordReaderCSVLite) processHandle( handle io.Reader, filename string, context *types.Context, @@ -138,54 +142,70 @@ func (reader *RecordReaderCSVLite) processHandleExplicitCSVHeader( errorChannel chan error, downstreamDoneChannel <-chan bool, // for mlr head ) { - var inputLineNumber int = 0 - var headerStrings []string = nil - context.UpdateForStartOfFile(filename) + reader.inputLineNumber = 0 + reader.headerStrings = nil + recordsPerBatch := reader.readerOptions.RecordsPerBatch lineScanner := NewLineScanner(handle, reader.readerOptions.IRS) - for lineScanner.Scan() { - - // See if downstream processors will be ignoring further data (e.g. mlr - // head). If so, stop reading. This makes 'mlr head hugefile' exit - // quickly, as it should. 
- eof := false - select { - case _ = <-downstreamDoneChannel: - eof = true - break - default: - break - } + linesChannel := make(chan *list.List, recordsPerBatch) + go channelizedLineScanner(lineScanner, linesChannel, downstreamDoneChannel, recordsPerBatch) + + for { + recordsAndContexts, eof := reader.recordBatchGetter(reader, linesChannel, filename, context, errorChannel) + readerChannel <- recordsAndContexts if eof { break } + } +} + +func getRecordBatchExplicitCSVHeader( + reader *RecordReaderCSVLite, + linesChannel <-chan *list.List, + filename string, + context *types.Context, + errorChannel chan error, +) ( + recordsAndContexts *list.List, + eof bool, +) { + recordsAndContexts = list.New() + + lines, more := <-linesChannel + if !more { + return recordsAndContexts, true + } - line := lineScanner.Text() + for e := lines.Front(); e != nil; e = e.Next() { + line := e.Value.(string) - inputLineNumber++ + reader.inputLineNumber++ // Strip CSV BOM - if inputLineNumber == 1 { + if reader.inputLineNumber == 1 { if strings.HasPrefix(line, CSV_BOM) { line = strings.Replace(line, CSV_BOM, "", 1) } } // Check for comments-in-data feature - if strings.HasPrefix(line, reader.readerOptions.CommentString) { - if reader.readerOptions.CommentHandling == cli.PassComments { - readerChannel <- types.NewOutputStringList(line+"\n", context) - continue - } else if reader.readerOptions.CommentHandling == cli.SkipComments { - continue + // TODO: function-pointer this away + if reader.readerOptions.CommentHandling != cli.CommentsAreData { + if strings.HasPrefix(line, reader.readerOptions.CommentString) { + if reader.readerOptions.CommentHandling == cli.PassComments { + recordsAndContexts.PushBack(types.NewOutputString(line+"\n", context)) + continue + } else if reader.readerOptions.CommentHandling == cli.SkipComments { + continue + } + // else comments are data } - // else comments are data } if line == "" { // Reset to new schema - headerStrings = nil + reader.headerStrings = nil continue } @@ -199,16 +219,16 @@ func (reader *RecordReaderCSVLite) processHandleExplicitCSVHeader( fields = lib.StripEmpties(fields) // left/right trim } - if headerStrings == nil { - headerStrings = fields + if reader.headerStrings == nil { + reader.headerStrings = fields // Get data lines on subsequent loop iterations } else { - if !reader.readerOptions.AllowRaggedCSVInput && len(headerStrings) != len(fields) { + if !reader.readerOptions.AllowRaggedCSVInput && len(reader.headerStrings) != len(fields) { err := errors.New( fmt.Sprintf( "mlr: CSV header/data length mismatch %d != %d "+ "at filename %s line %d.\n", - len(headerStrings), len(fields), filename, inputLineNumber, + len(reader.headerStrings), len(fields), filename, reader.inputLineNumber, ), ) errorChannel <- err @@ -219,16 +239,16 @@ func (reader *RecordReaderCSVLite) processHandleExplicitCSVHeader( if !reader.readerOptions.AllowRaggedCSVInput { for i, field := range fields { value := types.MlrvalFromInferredTypeForDataFiles(field) - record.PutCopy(headerStrings[i], value) + record.PutCopy(reader.headerStrings[i], value) } } else { - nh := len(headerStrings) + nh := len(reader.headerStrings) nd := len(fields) n := lib.IntMin2(nh, nd) var i int for i = 0; i < n; i++ { value := types.MlrvalFromInferredTypeForDataFiles(fields[i]) - record.PutCopy(headerStrings[i], value) + record.PutCopy(reader.headerStrings[i], value) } if nh < nd { // if header shorter than data: use 1-up itoa keys @@ -241,82 +261,68 @@ func (reader *RecordReaderCSVLite) processHandleExplicitCSVHeader( if 
nh > nd { // if header longer than data: use "" values for i = nd; i < nh; i++ { - record.PutCopy(headerStrings[i], types.MLRVAL_VOID) + record.PutCopy(reader.headerStrings[i], types.MLRVAL_VOID) } } } context.UpdateForInputRecord() - readerChannel <- types.NewRecordAndContextList( - record, - context, - ) + recordsAndContexts.PushBack(types.NewRecordAndContext(record, context)) } - } + + return recordsAndContexts, false } -func (reader *RecordReaderCSVLite) processHandleImplicitCSVHeader( - handle io.Reader, +func getRecordBatchImplicitCSVHeader( + reader *RecordReaderCSVLite, + linesChannel <-chan *list.List, filename string, context *types.Context, - readerChannel chan<- *list.List, // list of *types.RecordAndContext errorChannel chan error, - downstreamDoneChannel <-chan bool, // for mlr head +) ( + recordsAndContexts *list.List, + eof bool, ) { - var inputLineNumber int = 0 - var headerStrings []string = nil - - context.UpdateForStartOfFile(filename) + recordsAndContexts = list.New() - lineScanner := NewLineScanner(handle, reader.readerOptions.IRS) - for lineScanner.Scan() { - - // See if downstream processors will be ignoring further data (e.g. mlr - // head). If so, stop reading. This makes 'mlr head hugefile' exit - // quickly, as it should. - - // TODO: extract a helper function - eof := false - select { - case _ = <-downstreamDoneChannel: - eof = true - break - default: - break - } - if eof { - break - } + lines, more := <-linesChannel + if !more { + return recordsAndContexts, true + } - line := lineScanner.Text() + for e := lines.Front(); e != nil; e = e.Next() { + line := e.Value.(string) - inputLineNumber++ + reader.inputLineNumber++ // Check for comments-in-data feature - if strings.HasPrefix(line, reader.readerOptions.CommentString) { - if reader.readerOptions.CommentHandling == cli.PassComments { - readerChannel <- types.NewOutputStringList(line+"\n", context) - continue - } else if reader.readerOptions.CommentHandling == cli.SkipComments { - continue + // TODO: function-pointer this away + if reader.readerOptions.CommentHandling != cli.CommentsAreData { + if strings.HasPrefix(line, reader.readerOptions.CommentString) { + if reader.readerOptions.CommentHandling == cli.PassComments { + recordsAndContexts.PushBack(types.NewOutputString(line+"\n", context)) + continue + } else if reader.readerOptions.CommentHandling == cli.SkipComments { + continue + } + // else comments are data } - // else comments are data } // This is how to do a chomp: line = strings.TrimRight(line, reader.readerOptions.IRS) - // xxx temp pending autodetect, and pending more windows-port work line = strings.TrimRight(line, "\r") if line == "" { // Reset to new schema - headerStrings = nil + reader.headerStrings = nil continue } var fields []string + // TODO: function-pointer this if reader.readerOptions.IFSRegex == nil { // e.g. 
--no-ifs-regex fields = lib.SplitString(line, reader.readerOptions.IFS) } else { @@ -326,19 +332,19 @@ func (reader *RecordReaderCSVLite) processHandleImplicitCSVHeader( fields = lib.StripEmpties(fields) // left/right trim } - if headerStrings == nil { + if reader.headerStrings == nil { n := len(fields) - headerStrings = make([]string, n) + reader.headerStrings = make([]string, n) for i := 0; i < n; i++ { - headerStrings[i] = strconv.Itoa(i + 1) + reader.headerStrings[i] = strconv.Itoa(i + 1) } } else { - if !reader.readerOptions.AllowRaggedCSVInput && len(headerStrings) != len(fields) { + if !reader.readerOptions.AllowRaggedCSVInput && len(reader.headerStrings) != len(fields) { err := errors.New( fmt.Sprintf( "mlr: CSV header/data length mismatch %d != %d "+ "at filename %s line %d.\n", - len(headerStrings), len(fields), filename, inputLineNumber, + len(reader.headerStrings), len(fields), filename, reader.inputLineNumber, ), ) errorChannel <- err @@ -350,16 +356,16 @@ func (reader *RecordReaderCSVLite) processHandleImplicitCSVHeader( if !reader.readerOptions.AllowRaggedCSVInput { for i, field := range fields { value := types.MlrvalFromInferredTypeForDataFiles(field) - record.PutCopy(headerStrings[i], value) + record.PutCopy(reader.headerStrings[i], value) } } else { - nh := len(headerStrings) + nh := len(reader.headerStrings) nd := len(fields) n := lib.IntMin2(nh, nd) var i int for i = 0; i < n; i++ { value := types.MlrvalFromInferredTypeForDataFiles(fields[i]) - record.PutCopy(headerStrings[i], value) + record.PutCopy(reader.headerStrings[i], value) } if nh < nd { // if header shorter than data: use 1-up itoa keys @@ -370,16 +376,14 @@ func (reader *RecordReaderCSVLite) processHandleImplicitCSVHeader( if nh > nd { // if header longer than data: use "" values for i = nd; i < nh; i++ { - record.PutCopy(headerStrings[i], types.MLRVAL_VOID) + record.PutCopy(reader.headerStrings[i], types.MLRVAL_VOID) } } } context.UpdateForInputRecord() - readerChannel <- types.NewRecordAndContextList( - record, - context, - ) - + recordsAndContexts.PushBack(types.NewRecordAndContextList(record, context)) } + + return recordsAndContexts, false } diff --git a/internal/pkg/input/record_reader_dkvp_nidx.go b/internal/pkg/input/record_reader_dkvp_nidx.go index 3672dedbd2..000b687534 100644 --- a/internal/pkg/input/record_reader_dkvp_nidx.go +++ b/internal/pkg/input/record_reader_dkvp_nidx.go @@ -99,7 +99,7 @@ func (reader *RecordReaderDKVPNIDX) processHandle( go channelizedLineScanner(lineScanner, linesChannel, downstreamDoneChannel, recordsPerBatch) for { - recordsAndContexts, eof := reader.getRecordBatch(linesChannel, recordsPerBatch, context) + recordsAndContexts, eof := reader.getRecordBatch(linesChannel, context) readerChannel <- recordsAndContexts if eof { break @@ -110,7 +110,6 @@ func (reader *RecordReaderDKVPNIDX) processHandle( // TODO: comment copiously we're trying to handle slow/fast/short/long reads: tail -f, smallfile, bigfile. func (reader *RecordReaderDKVPNIDX) getRecordBatch( linesChannel <-chan *list.List, - maxBatchSize int, context *types.Context, ) ( recordsAndContexts *list.List, diff --git a/todo.txt b/todo.txt index f144c4258e..cc0de9fc9b 100644 --- a/todo.txt +++ b/todo.txt @@ -43,6 +43,7 @@ PUNCHDOWN LIST ? readerChannel length 1 or 2 ? ? experiment again with hashed/unhashed -- mlr sort etc ? coalesce errchan & done-writing w/ Err to RAC, and close-chan *and* EOSMarker -- ? + - docnote JSON (a) no channelization opportunity (unless missed); (b) no JIT type-inference. therefore slower. 
- funcptr away the ifs/ifsregex check in record-readers - checklist channelize all record-reader types - do and maybe keep? record-reader return (raclist, err) & refactor repl accordingly From 84035491491d68ad9ccef2b6e07b54b9b4713411 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Fri, 10 Dec 2021 23:25:14 -0500 Subject: [PATCH 20/28] channelize XTAB reader --- internal/pkg/input/record_reader.go | 5 +- internal/pkg/input/record_reader_xtab.go | 184 ++++++++++++++++------- todo.txt | 4 + 3 files changed, 138 insertions(+), 55 deletions(-) diff --git a/internal/pkg/input/record_reader.go b/internal/pkg/input/record_reader.go index 8fbe42b537..f8a140ae1c 100644 --- a/internal/pkg/input/record_reader.go +++ b/internal/pkg/input/record_reader.go @@ -78,7 +78,10 @@ func NewLineScanner(handle io.Reader, irs string) *bufio.Scanner { return scanner } -// TODO: comment +// TODO: comment copiously +// +// Lines are written to the channel with their trailing newline (or whatever +// IRS) stripped off. So, callers get "a=1,b=2" rather than "a=1,b=2\n". func channelizedLineScanner( lineScanner *bufio.Scanner, linesChannel chan<- *list.List, diff --git a/internal/pkg/input/record_reader_xtab.go b/internal/pkg/input/record_reader_xtab.go index 6b0f629d65..e24fd27659 100644 --- a/internal/pkg/input/record_reader_xtab.go +++ b/internal/pkg/input/record_reader_xtab.go @@ -1,6 +1,7 @@ package input import ( + "bufio" "container/list" "errors" "io" @@ -17,7 +18,6 @@ type RecordReaderXTAB struct { // Note: XTAB uses two consecutive IFS in place of an IRS; IRS is ignored } -// ---------------------------------------------------------------- func NewRecordReaderXTAB( readerOptions *cli.TReaderOptions, recordsPerBatch int, @@ -28,7 +28,6 @@ func NewRecordReaderXTAB( }, nil } -// ---------------------------------------------------------------- func (reader *RecordReaderXTAB) Read( filenames []string, context types.Context, @@ -76,83 +75,160 @@ func (reader *RecordReaderXTAB) processHandle( downstreamDoneChannel <-chan bool, // for mlr head ) { context.UpdateForStartOfFile(filename) + recordsPerBatch := reader.readerOptions.RecordsPerBatch + // XTAB uses repeated IFS, rather than IRS, to delimit records lineScanner := NewLineScanner(handle, reader.readerOptions.IFS) - linesForRecord := list.New() + stanzasChannel := make(chan *list.List, recordsPerBatch) + go channelizedStanzaScanner(lineScanner, stanzasChannel, downstreamDoneChannel, recordsPerBatch) - eof := false - for !eof { - - // See if downstream processors will be ignoring further data (e.g. mlr - // head). If so, stop reading. This makes 'mlr head hugefile' exit - // quickly, as it should. - select { - case _ = <-downstreamDoneChannel: - eof = true - break - default: - break - } + for { + recordsAndContexts, eof := reader.getRecordBatch(stanzasChannel, context, errorChannel) + readerChannel <- recordsAndContexts if eof { break } + } +} + +// Given input like +// +// a 1 +// b 2 +// c 3 +// +// a 4 +// b 5 +// c 6 +// +// this function reads the input stream a line at a time, then produces +// string-lists one per stanza where a stanza is delimited by blank line, or +// start or end of file. A single stanza, once parsed, will become a single +// record. 
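+//
+// For the example above, the first stanza parses to the record a=1,b=2,c=3
+// and the second to a=4,b=5,c=6.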
+func channelizedStanzaScanner( + lineScanner *bufio.Scanner, + stanzasChannel chan<- *list.List, // list of list of string + downstreamDoneChannel <-chan bool, // for mlr head + recordsPerBatch int, +) { + numStanzasSeen := 0 + inStanza := false + done := false - if !lineScanner.Scan() { + stanzas := list.New() + stanza := list.New() - if linesForRecord.Len() > 0 { - record, err := reader.recordFromXTABLines(linesForRecord) - if err != nil { - errorChannel <- err - return - } - context.UpdateForInputRecord() - readerChannel <- types.NewRecordAndContextList(record, context) - linesForRecord = list.New() + for lineScanner.Scan() { + line := lineScanner.Text() + if line == "" { + // Empty-line handling: + // 1. First empty line(s) in the stream are ignored. + // 2. After that, one or more empty lines separate records. + // 3. At end of file, multiple empty lines are ignored. + if inStanza { + inStanza = false + stanzas.PushBack(stanza) + numStanzasSeen++ + stanza = list.New() + } else { + continue } + } else { + if !inStanza { + inStanza = true + } + stanza.PushBack(line) + } + // See if downstream processors will be ignoring further data (e.g. mlr + // head). If so, stop reading. This makes 'mlr head hugefile' exit + // quickly, as it should. + if numStanzasSeen%recordsPerBatch == 0 { + select { + case _ = <-downstreamDoneChannel: + done = true + break + default: + break + } + if done { + break + } + stanzasChannel <- stanzas + stanzas = list.New() + } + + if done { break } + } - line := lineScanner.Text() + // The last stanza may not have a trailing newline after it. Any lines in the stanza + // at this point will form the final record in the stream. + if stanza.Len() > 0 { + stanzas.PushBack(stanza) + } - // Check for comments-in-data feature - if strings.HasPrefix(line, reader.readerOptions.CommentString) { - if reader.readerOptions.CommentHandling == cli.PassComments { - readerChannel <- types.NewOutputStringList(line+reader.readerOptions.IFS, context) - continue - } else if reader.readerOptions.CommentHandling == cli.SkipComments { - continue - } - // else comments are data - } + stanzasChannel <- stanzas + close(stanzasChannel) // end-of-stream marker +} - if line != "" { - linesForRecord.PushBack(line) +// TODO: comment copiously we're trying to handle slow/fast/short/long reads: tail -f, smallfile, bigfile. 
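+//
+// In particular: eof comes back true exactly once, when
+// channelizedStanzaScanner above has closed stanzasChannel. The batch
+// returned alongside it may be empty; processHandle still forwards it
+// downstream before breaking out of its read loop.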
+func (reader *RecordReaderXTAB) getRecordBatch( + stanzasChannel <-chan *list.List, + context *types.Context, + errorChannel chan error, +) ( + recordsAndContexts *list.List, + eof bool, +) { + recordsAndContexts = list.New() - } else { - if linesForRecord.Len() > 0 { - record, err := reader.recordFromXTABLines(linesForRecord) - if err != nil { - errorChannel <- err - return - } - context.UpdateForInputRecord() - readerChannel <- types.NewRecordAndContextList(record, context) - linesForRecord = list.New() - } + stanzas, more := <-stanzasChannel + if !more { + return recordsAndContexts, true + } + + for e := stanzas.Front(); e != nil; e = e.Next() { + stanza := e.Value.(*list.List) + + // // TODO: move + // // Check for comments-in-data feature + // // TODO: function-pointer this away + // if reader.readerOptions.CommentHandling != cli.CommentsAreData { + // if strings.HasPrefix(line, reader.readerOptions.CommentString) { + // if reader.readerOptions.CommentHandling == cli.PassComments { + // recordsAndContexts.PushBack(types.NewOutputString(line+reader.readerOptions.IFS, context)) + // continue + // } else if reader.readerOptions.CommentHandling == cli.SkipComments { + // continue + // } + // // else comments are data + // } + // } + + lib.InternalCodingErrorIf(stanza.Len() == 0) + + record, err := reader.recordFromXTABLines(stanza) + if err != nil { + errorChannel <- err + return } + context.UpdateForInputRecord() + recordsAndContexts.PushBack(types.NewRecordAndContext(record, context)) } + + return recordsAndContexts, false } -// ---------------------------------------------------------------- func (reader *RecordReaderXTAB) recordFromXTABLines( - lines *list.List, + stanza *list.List, ) (*types.Mlrmap, error) { record := types.NewMlrmapAsRecord() - for entry := lines.Front(); entry != nil; entry = entry.Next() { - line := entry.Value.(string) + for e := stanza.Front(); e != nil; e = e.Next() { + line := e.Value.(string) var kv []string if reader.readerOptions.IPSRegex == nil { // e.g. --no-ips-regex diff --git a/todo.txt b/todo.txt index cc0de9fc9b..1cb7936e02 100644 --- a/todo.txt +++ b/todo.txt @@ -189,6 +189,10 @@ PUNCHDOWN LIST ================================================================ NON-BLOCKERS +* try simpler-than-regex-split-string for repeated-single -- especially for XTAB reader + +* UT-per-se of XTAB channelizedStanzaScanner + * main-level (verb-level?) 
flag for "," -> X in verbs -- in case commas in field names * golinter From f47b2dd2f06301a2b446e6d11c38ca0b745f3e68 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Fri, 10 Dec 2021 23:55:33 -0500 Subject: [PATCH 21/28] batchify JSON reader --- internal/pkg/input/record_reader_json.go | 54 +++++++++++++++--------- 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/internal/pkg/input/record_reader_json.go b/internal/pkg/input/record_reader_json.go index e805a9dbf9..5341b12107 100644 --- a/internal/pkg/input/record_reader_json.go +++ b/internal/pkg/input/record_reader_json.go @@ -77,27 +77,34 @@ func (reader *RecordReaderJSON) processHandle( downstreamDoneChannel <-chan bool, // for mlr head ) { context.UpdateForStartOfFile(filename) + // TODO: comment + recordsPerBatch := reader.readerOptions.RecordsPerBatch if reader.readerOptions.CommentHandling != cli.CommentsAreData { handle = NewJSONCommentEnabledReader(handle, reader.readerOptions, readerChannel) } decoder := json.NewDecoder(handle) + recordsAndContexts := list.New() eof := false + i := 0 for { - // See if downstream processors will be ignoring further data (e.g. mlr // head). If so, stop reading. This makes 'mlr head hugefile' exit - // quickly, as it should. - select { - case _ = <-downstreamDoneChannel: - eof = true - break - default: - break - } - if eof { - break + // quickly, as it should. Do this channel-check every so often to avoid + // scheduler overhead. + i++ + if i%recordsPerBatch == 0 { + select { + case _ = <-downstreamDoneChannel: + eof = true + break + default: + break + } + if eof { + break + } } mlrval, eof, err := types.MlrvalDecodeFromJSON(decoder) @@ -122,16 +129,20 @@ func (reader *RecordReaderJSON) processHandle( return } context.UpdateForInputRecord() - readerChannel <- types.NewRecordAndContextList( - record, - context, - ) + recordsAndContexts.PushBack(types.NewRecordAndContext(record, context)) + + if recordsAndContexts.Len() >= recordsPerBatch { + readerChannel <- recordsAndContexts + recordsAndContexts = list.New() + } + } else if mlrval.IsArray() { records := mlrval.GetArray() if records == nil { errorChannel <- errors.New("Internal coding error detected in JSON record-reader") return } + for _, mlrval := range records { if !mlrval.IsMap() { // TODO: more context @@ -149,11 +160,12 @@ func (reader *RecordReaderJSON) processHandle( return } context.UpdateForInputRecord() - readerChannel <- types.NewRecordAndContextList( - record, - context, - ) + recordsAndContexts.PushBack(types.NewRecordAndContext(record, context)) + if recordsAndContexts.Len() >= recordsPerBatch { + readerChannel <- recordsAndContexts + recordsAndContexts = list.New() + } } } else { @@ -166,6 +178,10 @@ func (reader *RecordReaderJSON) processHandle( return } } + + if recordsAndContexts.Len() > 0 { + readerChannel <- recordsAndContexts + } } // ================================================================ From e24a36da6334d0dd07d55878b7eadf328143ab3d Mon Sep 17 00:00:00 2001 From: John Kerl Date: Sat, 11 Dec 2021 00:13:35 -0500 Subject: [PATCH 22/28] channelize GEN pseudo-reader --- internal/pkg/input/pseudo_reader_gen.go | 50 +++++++++++++-------- internal/pkg/input/record_reader_csvlite.go | 13 ++++-- internal/pkg/input/record_reader_json.go | 4 +- internal/pkg/output/file-output-handlers.go | 9 ++-- internal/pkg/types/context.go | 20 --------- todo.txt | 3 ++ 6 files changed, 50 insertions(+), 49 deletions(-) diff --git a/internal/pkg/input/pseudo_reader_gen.go b/internal/pkg/input/pseudo_reader_gen.go index 
e7150df841..9dc2a14ca3 100644 --- a/internal/pkg/input/pseudo_reader_gen.go +++ b/internal/pkg/input/pseudo_reader_gen.go @@ -42,6 +42,7 @@ func (reader *PseudoReaderGen) process( downstreamDoneChannel <-chan bool, // for mlr head ) { context.UpdateForStartOfFile("(gen-pseudo-reader)") + recordsPerBatch := reader.readerOptions.RecordsPerBatch start, err := reader.tryParse("start", reader.readerOptions.GeneratorOptions.StartAsString) if err != nil { @@ -69,24 +70,11 @@ func (reader *PseudoReaderGen) process( key := reader.readerOptions.GeneratorOptions.FieldName value := start.Copy() + recordsAndContexts := list.New() + eof := false for !eof { - // See if downstream processors will be ignoring further data (e.g. mlr - // head). If so, stop reading. This makes 'mlr head hugefile' exit - // quickly, as it should. - eof := false - select { - case _ = <-downstreamDoneChannel: - eof = true - break - default: - break - } - if eof { - break - } - mdone := doneComparator(value, stop) done, _ := mdone.GetBoolValue() if done { @@ -97,13 +85,37 @@ func (reader *PseudoReaderGen) process( record.PutCopy(key, value) context.UpdateForInputRecord() - readerChannel <- types.NewRecordAndContextList( - record, - context, - ) + recordsAndContexts.PushBack(types.NewRecordAndContext(record, context)) + + if recordsAndContexts.Len() >= recordsPerBatch { + readerChannel <- recordsAndContexts + recordsAndContexts = list.New() + + // See if downstream processors will be ignoring further data (e.g. + // mlr head). If so, stop reading. This makes 'mlr head hugefile' + // exit quickly, as it should. Check this only every so often to + // avoid goroutine-scheduler thrash. + eof := false + select { + case _ = <-downstreamDoneChannel: + eof = true + break + default: + break + } + if eof { + break + } + + } value = types.BIF_plus_binary(value, step) } + + if recordsAndContexts.Len() > 0 { + readerChannel <- recordsAndContexts + recordsAndContexts = list.New() + } } func (reader *PseudoReaderGen) tryParse( diff --git a/internal/pkg/input/record_reader_csvlite.go b/internal/pkg/input/record_reader_csvlite.go index b32e591614..ccf5aafc7c 100644 --- a/internal/pkg/input/record_reader_csvlite.go +++ b/internal/pkg/input/record_reader_csvlite.go @@ -67,7 +67,6 @@ func NewRecordReaderCSVLite( } else { reader.recordBatchGetter = getRecordBatchExplicitCSVHeader } - return reader, nil } @@ -75,10 +74,16 @@ func NewRecordReaderPPRINT( readerOptions *cli.TReaderOptions, recordsPerBatch int, ) (*RecordReaderCSVLite, error) { - return &RecordReaderCSVLite{ + reader := &RecordReaderCSVLite{ readerOptions: readerOptions, recordsPerBatch: recordsPerBatch, - }, nil + } + if reader.readerOptions.UseImplicitCSVHeader { + reader.recordBatchGetter = getRecordBatchImplicitCSVHeader + } else { + reader.recordBatchGetter = getRecordBatchExplicitCSVHeader + } + return reader, nil } func (reader *RecordReaderCSVLite) Read( @@ -382,7 +387,7 @@ func getRecordBatchImplicitCSVHeader( } context.UpdateForInputRecord() - recordsAndContexts.PushBack(types.NewRecordAndContextList(record, context)) + recordsAndContexts.PushBack(types.NewRecordAndContext(record, context)) } return recordsAndContexts, false diff --git a/internal/pkg/input/record_reader_json.go b/internal/pkg/input/record_reader_json.go index 5341b12107..ce4eaab090 100644 --- a/internal/pkg/input/record_reader_json.go +++ b/internal/pkg/input/record_reader_json.go @@ -256,7 +256,9 @@ func (bsr *JSONCommentEnabledReader) Read(p []byte) (n int, err error) { if bsr.readerOptions.CommentHandling == 
cli.PassComments {
 		// Insert the string into the record-output stream, so that goroutine can
 		// print it, resulting in deterministic output-ordering.
-		bsr.readerChannel <- types.NewOutputStringList(line+"\n", bsr.context)
+		ell := list.New()
+		ell.PushBack(types.NewOutputString(line+"\n", bsr.context))
+		bsr.readerChannel <- ell
 		}
 	}
 }
diff --git a/internal/pkg/output/file-output-handlers.go b/internal/pkg/output/file-output-handlers.go
index 3991726c7f..d187c9985e 100644
--- a/internal/pkg/output/file-output-handlers.go
+++ b/internal/pkg/output/file-output-handlers.go
@@ -340,11 +340,10 @@ func (handler *FileOutputHandler) WriteRecordAndContext(
 		}
 	}
 
-	// TODO: mahbe refactor to batch better
-	handler.recordOutputChannel <- types.NewRecordAndContextList(
-		outrecAndContext.Record,
-		&outrecAndContext.Context,
-	)
+	// TODO: maybe refactor to batch better
+	ell := list.New()
+	ell.PushBack(outrecAndContext)
+	handler.recordOutputChannel <- ell
 
 	return nil
 }
diff --git a/internal/pkg/types/context.go b/internal/pkg/types/context.go
index 3a9ac59706..1aff938114 100644
--- a/internal/pkg/types/context.go
+++ b/internal/pkg/types/context.go
@@ -38,16 +38,6 @@ func NewRecordAndContext(
 	}
 }
 
-// TODO: temp for batch-reader refactor
-func NewRecordAndContextList(
-	record *Mlrmap,
-	context *Context,
-) *list.List {
-	ell := list.New()
-	ell.PushBack(NewRecordAndContext(record, context))
-	return ell
-}
-
 // For the record-readers to update their initial context as each new record is read.
 func (rac *RecordAndContext) Copy() *RecordAndContext {
 	if rac == nil {
@@ -80,16 +70,6 @@ func NewOutputString(
 	}
 }
 
-// TODO: temp for batch-reader refactor
-func NewOutputStringList(
-	outputString string,
-	context *Context,
-) *list.List {
-	ell := list.New()
-	ell.PushBack(NewOutputString(outputString, context))
-	return ell
-}
-
 // For the record-readers to update their initial context as each new record is read.
 func NewEndOfStreamMarker(context *Context) *RecordAndContext {
 	return &RecordAndContext{
diff --git a/todo.txt b/todo.txt
index 1cb7936e02..b1b09a227d 100644
--- a/todo.txt
+++ b/todo.txt
@@ -3,6 +3,8 @@ PUNCHDOWN LIST
 
 ! mlr --csv sort -f shape $mlds/example.csv
 
+* big-picture item @ Rmd; also webdoc intro page
+
 * perf wup @ rgp.md
 * perf:
   o go tool pprof -http=:8080 cpu.pprof
@@ -39,6 +41,7 @@ PUNCHDOWN LIST
   > everything else
     - reads and writes
   a remove NewOutputStringList and NewRecordAndContextList
+  - JSON reader pass raclist to comment-scanner for orderly injection
   ? outputChannel -> *list.List at each transformer -- ? profile first
   ? readerChannel length 1 or 2 ?
   ?
experiment again with hashed/unhashed -- mlr sort etc From 34627e72a5deea7b2c4fea01a8929681ab1bb5c6 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Sun, 12 Dec 2021 12:00:16 -0500 Subject: [PATCH 23/28] scripts for perf-testing on larger files --- internal/pkg/input/record_reader_dkvp_nidx.go | 8 +-- scripts/chain-1.mlr | 2 + scripts/chain-cmps.sh | 41 +++++++++++++++ scripts/chain-lengths.sh | 50 +++++++++++++++++++ scripts/goslow.sh | 6 +++ scripts/goslower.sh | 6 +++ scripts/make-big-files | 23 +++++++++ scripts/make-data-stream | 32 ++++++++++++ scripts/time-big-file | 18 +++++++ scripts/time-big-files | 26 ++++++++++ test/cases/dsl-array-map-indexing/0081/expout | 22 ++++---- todo.txt | 10 ++-- 12 files changed, 226 insertions(+), 18 deletions(-) create mode 100644 scripts/chain-1.mlr create mode 100755 scripts/chain-cmps.sh create mode 100755 scripts/chain-lengths.sh create mode 100755 scripts/goslow.sh create mode 100755 scripts/goslower.sh create mode 100755 scripts/make-big-files create mode 100755 scripts/make-data-stream create mode 100755 scripts/time-big-file create mode 100755 scripts/time-big-files diff --git a/internal/pkg/input/record_reader_dkvp_nidx.go b/internal/pkg/input/record_reader_dkvp_nidx.go index 000b687534..c88d95ae01 100644 --- a/internal/pkg/input/record_reader_dkvp_nidx.go +++ b/internal/pkg/input/record_reader_dkvp_nidx.go @@ -15,12 +15,12 @@ import ( // splitter_DKVP_NIDX is a function type for the one bit of code differing // between the DKVP reader and the NIDX reader, namely, how it splits lines. -type splitter_DKVP_NIDX func (reader *RecordReaderDKVPNIDX, line string) *types.Mlrmap +type splitter_DKVP_NIDX func(reader *RecordReaderDKVPNIDX, line string) *types.Mlrmap type RecordReaderDKVPNIDX struct { readerOptions *cli.TReaderOptions recordsPerBatch int - splitter splitter_DKVP_NIDX + splitter splitter_DKVP_NIDX } func NewRecordReaderDKVP( @@ -30,7 +30,7 @@ func NewRecordReaderDKVP( return &RecordReaderDKVPNIDX{ readerOptions: readerOptions, recordsPerBatch: recordsPerBatch, - splitter: recordFromDKVPLine, + splitter: recordFromDKVPLine, }, nil } @@ -41,7 +41,7 @@ func NewRecordReaderNIDX( return &RecordReaderDKVPNIDX{ readerOptions: readerOptions, recordsPerBatch: recordsPerBatch, - splitter: recordFromNIDXLine, + splitter: recordFromNIDXLine, }, nil } diff --git a/scripts/chain-1.mlr b/scripts/chain-1.mlr new file mode 100644 index 0000000000..c2279799bc --- /dev/null +++ b/scripts/chain-1.mlr @@ -0,0 +1,2 @@ +$color_shape = $color . 
$shape; +$y = int($k) + int($index) **3 + log10(float($quantity)/float($rate)); diff --git a/scripts/chain-cmps.sh b/scripts/chain-cmps.sh new file mode 100755 index 0000000000..d818c88974 --- /dev/null +++ b/scripts/chain-cmps.sh @@ -0,0 +1,41 @@ +echo; for m in mlr5 "./mlr -S"; do + justtime $m --csv --from ~/tmp/big.csv \ + check \ + | md5sum +done + +echo; for m in mlr5 "./mlr -S"; do + justtime $m --csv --from ~/tmp/big.csv \ + cat \ + | md5sum; +done + +echo; for m in mlr5 "./mlr -S"; do + justtime $m --csv --from ~/tmp/big.csv \ + head \ + | md5sum; +done + +echo; for m in mlr5 "./mlr -S"; do + justtime $m --csv --from ~/tmp/big.csv \ + tail \ + | md5sum; +done + +echo; for m in mlr5 "./mlr -S"; do + justtime $m --csv --from ~/tmp/big.csv \ + tac \ + | md5sum; +done + +echo; for m in mlr5 "./mlr -S"; do + justtime $m --csv --from ~/tmp/big.csv \ + sort -f shape \ + | md5sum; +done + +echo; for m in mlr5 "./mlr -S"; do + justtime $m --csv --from ~/tmp/big.csv \ + sort -n quantity \ + | md5sum; +done diff --git a/scripts/chain-lengths.sh b/scripts/chain-lengths.sh new file mode 100755 index 0000000000..b6d2b61227 --- /dev/null +++ b/scripts/chain-lengths.sh @@ -0,0 +1,50 @@ +echo; for m in mlr5 "./mlr -S"; do + justtime $m --csv --from ~/tmp/big.csv \ + then put -f scripts/chain-1.mlr \ + | md5sum; +done + +echo; for m in mlr5 "./mlr -S"; do + justtime $m --csv --from ~/tmp/big.csv \ + then put -f scripts/chain-1.mlr \ + then put -f scripts/chain-1.mlr \ + | md5sum; +done + +echo; for m in mlr5 "./mlr -S"; do + justtime $m --csv --from ~/tmp/big.csv \ + then put -f scripts/chain-1.mlr \ + then put -f scripts/chain-1.mlr \ + then put -f scripts/chain-1.mlr \ + | md5sum; +done + +echo; for m in mlr5 "./mlr -S"; do + justtime $m --csv --from ~/tmp/big.csv \ + then put -f scripts/chain-1.mlr \ + then put -f scripts/chain-1.mlr \ + then put -f scripts/chain-1.mlr \ + then put -f scripts/chain-1.mlr \ + | md5sum; +done + +echo; for m in mlr5 "./mlr -S"; do + justtime $m --csv --from ~/tmp/big.csv \ + then put -f scripts/chain-1.mlr \ + then put -f scripts/chain-1.mlr \ + then put -f scripts/chain-1.mlr \ + then put -f scripts/chain-1.mlr \ + then put -f scripts/chain-1.mlr \ + | md5sum; +done + +echo; for m in mlr5 "./mlr -S"; do + justtime $m --csv --from ~/tmp/big.csv \ + then put -f scripts/chain-1.mlr \ + then put -f scripts/chain-1.mlr \ + then put -f scripts/chain-1.mlr \ + then put -f scripts/chain-1.mlr \ + then put -f scripts/chain-1.mlr \ + then put -f scripts/chain-1.mlr \ + | md5sum; +done diff --git a/scripts/goslow.sh b/scripts/goslow.sh new file mode 100755 index 0000000000..dfa63f552d --- /dev/null +++ b/scripts/goslow.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +cat "$@" | while read line; do + millisleep 100 + echo $line +done diff --git a/scripts/goslower.sh b/scripts/goslower.sh new file mode 100755 index 0000000000..37f2300478 --- /dev/null +++ b/scripts/goslower.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +cat "$@" | while read line; do + sleep 1 + echo $line +done diff --git a/scripts/make-big-files b/scripts/make-big-files new file mode 100755 index 0000000000..e23d2fcb37 --- /dev/null +++ b/scripts/make-big-files @@ -0,0 +1,23 @@ +#!/bin/bash + +set -x + +mkdir ~/tmp/ + +mlr --csv \ + repeat -n 100000 \ + then shuffle \ + then put ' + begin{@index=1} + $k = NR; + @index += urandint(2,10); + $index=@index; + $quantity=fmtnum(urandrange(50,100),"%.4f"); + $rate=fmtnum(urandrange(1,10),"%.4f"); + ' \ +docs/src/example.csv > ~/tmp/big.csv + +mlr --c2d cat ~/tmp/big.csv > ~/tmp/big.dkvp 
+mlr --c2j cat ~/tmp/big.csv > ~/tmp/big.json +mlr --c2n cat ~/tmp/big.csv > ~/tmp/big.nidx +mlr --c2x cat ~/tmp/big.csv > ~/tmp/big.xtab diff --git a/scripts/make-data-stream b/scripts/make-data-stream new file mode 100755 index 0000000000..d29a37b8c3 --- /dev/null +++ b/scripts/make-data-stream @@ -0,0 +1,32 @@ +stop=1000000000 +profile="" + +#stop=1000000 +#profile="cpuprofile cpu.pprof" + +mlr \ + $profile \ + --ocsv \ + --igen --gen-stop $stop \ + put ' + begin { + @colors=["red","purple","yellow","green","blue","orange"]; + @shapes=["triangle","square","circle","pentagon","hexagon"]; + @index = 1; + } + + $color = @colors[urandint(1,length(@colors))]; + $shape = @shapes[urandint(1,length(@shapes))]; + $flag = (urand32() < 0.6) ? "true" : "false"; + $index = @index; + $quantity=fmtnum(urandrange(50,100),"%.4f"); + $rate=fmtnum(urandrange(1,10),"%.4f"); + + @index += urandint(2,10); + ' \ + then filter '$quantity > 60.0' \ + then put '$y = $k + $index**2 + log10($rate/$quantity)' \ + then rename i,k \ + then cut -xf index \ + then filter '$rate > 2' + diff --git a/scripts/time-big-file b/scripts/time-big-file new file mode 100755 index 0000000000..5da24aa787 --- /dev/null +++ b/scripts/time-big-file @@ -0,0 +1,18 @@ +#!/bin/bash + +mlr="mlr -S" +suffix="dkvp" + +iflag="" +if [ $# -ge 1 ]; then + iflag="--$1" + if [ "$iflag" = "--csvlite" ]; then + suffix="csv" + else + suffix=$1 + fi +fi +if [ $# -eq 2 ]; then + mlr="$2" +fi +justtime $mlr $iflag cat ~/tmp/big.$suffix | md5sum - diff --git a/scripts/time-big-files b/scripts/time-big-files new file mode 100755 index 0000000000..3de09bf1c2 --- /dev/null +++ b/scripts/time-big-files @@ -0,0 +1,26 @@ +#!/bin/bash + +ourdir=$(dirname $0) + +$ourdir/time-big-file dkvp mlr5 +$ourdir/time-big-file dkvp 'mlr -S' +echo + +$ourdir/time-big-file nidx mlr5 +$ourdir/time-big-file nidx 'mlr -S' +echo + +$ourdir/time-big-file xtab mlr5 +$ourdir/time-big-file xtab 'mlr -S' +echo + +$ourdir/time-big-file csv mlr5 +$ourdir/time-big-file csv 'mlr -S' +echo + +$ourdir/time-big-file csvlite mlr5 +$ourdir/time-big-file csvlite 'mlr -S' +echo + +$ourdir/time-big-file json mlr5 +$ourdir/time-big-file json 'mlr -S' diff --git a/test/cases/dsl-array-map-indexing/0081/expout b/test/cases/dsl-array-map-indexing/0081/expout index efdc88ddf0..adb570d8ac 100644 --- a/test/cases/dsl-array-map-indexing/0081/expout +++ b/test/cases/dsl-array-map-indexing/0081/expout @@ -1,11 +1,11 @@ -i x y -1 0.3467901443380824 0.7268028627434533 -2 0.7586799647899636 0.5221511083334797 -3 0.20460330576630303 0.33831852551664776 -4 0.38139939387114097 0.13418874328430463 -5 0.5732889198020006 0.8636244699032729 -6 0.5271261600918548 0.49322128674835697 -7 0.6117840605678454 0.1878849191181694 -8 0.5985540091064224 0.976181385699006 -9 0.03144187646093577 0.7495507603507059 -10 0.5026260055412137 0.9526183602969864 +b i x y +pan 1 0.3467901443380824 0.7268028627434533 +eks 2 0.7586799647899636 0.5221511083334797 +wye 3 0.20460330576630303 0.33831852551664776 +eks 4 0.38139939387114097 0.13418874328430463 +wye 5 0.5732889198020006 0.8636244699032729 +zee 6 0.5271261600918548 0.49322128674835697 +eks 7 0.6117840605678454 0.1878849191181694 +zee 8 0.5985540091064224 0.976181385699006 +hat 9 0.03144187646093577 0.7495507603507059 +pan 10 0.5026260055412137 0.9526183602969864 diff --git a/todo.txt b/todo.txt index b1b09a227d..f4f8a610fa 100644 --- a/todo.txt +++ b/todo.txt @@ -1,10 +1,15 @@ ================================================================ PUNCHDOWN LIST -! 
mlr --csv sort -f shape $mlds/example.csv * big-picture item @ Rmd; also webdoc intro page +* mr -S + test/cases/io-skip-pass-comments + test/cases/repl + test/cases/verb-join + test/cases/verb-join-mixed-format + * perf wup @ rgp.md * perf: o go tool pprof -http=:8080 cpu.pprof @@ -30,6 +35,7 @@ PUNCHDOWN LIST - make a 2nd/3rd cmd main w/ simple model & tweak that o dkvp-reader factor-out ... o mods: + - randsel for arrays - downstreamDone batchify > head - writes @@ -40,9 +46,7 @@ PUNCHDOWN LIST - reads > everything else - reads and writes - a remove NewOutputStringList and NewRecordAndContextList - JSON reader pass raclist to comment-scanner for orderly injection - ? outputChannel -> *list.List at each transformer -- ? profile first ? readerChannel length 1 or 2 ? ? experiment again with hashed/unhashed -- mlr sort etc ? coalesce errchan & done-writing w/ Err to RAC, and close-chan *and* EOSMarker -- ? From 4aab213ecdc1f73f93e3e0871997ef8cc0bc5d76 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Sun, 12 Dec 2021 23:19:48 -0500 Subject: [PATCH 24/28] merge with main for #776 --- internal/pkg/input/record_reader_dkvp_nidx.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/internal/pkg/input/record_reader_dkvp_nidx.go b/internal/pkg/input/record_reader_dkvp_nidx.go index c88d95ae01..73c34e7d51 100644 --- a/internal/pkg/input/record_reader_dkvp_nidx.go +++ b/internal/pkg/input/record_reader_dkvp_nidx.go @@ -158,6 +158,9 @@ func recordFromDKVPLine(reader *RecordReaderDKVPNIDX, line string) *types.Mlrmap } else { pairs = lib.RegexSplitString(reader.readerOptions.IFSRegex, line, -1) } + if reader.readerOptions.AllowRepeatIFS { + pairs = lib.StripEmpties(pairs) // left/right trim + } for i, pair := range pairs { var kv []string @@ -195,7 +198,6 @@ func recordFromNIDXLine(reader *RecordReaderDKVPNIDX, line string) *types.Mlrmap } else { values = lib.RegexSplitString(reader.readerOptions.IFSRegex, line, -1) } - if reader.readerOptions.AllowRepeatIFS { values = lib.StripEmpties(values) // left/right trim } From 056b8e973f122fdc07087047fc5410bad4284a01 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Mon, 13 Dec 2021 00:10:35 -0500 Subject: [PATCH 25/28] Fix record-batching for join and repl --- internal/pkg/input/pseudo_reader_gen.go | 4 +- internal/pkg/input/record_reader_csv.go | 8 ++-- internal/pkg/input/record_reader_csvlite.go | 8 ++-- internal/pkg/input/record_reader_dkvp_nidx.go | 8 ++-- internal/pkg/input/record_reader_json.go | 4 +- internal/pkg/input/record_reader_xtab.go | 40 ++++++++++--------- .../transformers/utils/join-bucket-keeper.go | 2 +- test/cases/io-skip-pass-comments/0022/expout | 2 +- test/cases/io-skip-pass-comments/0024/expout | 2 +- todo.txt | 4 +- 10 files changed, 45 insertions(+), 37 deletions(-) diff --git a/internal/pkg/input/pseudo_reader_gen.go b/internal/pkg/input/pseudo_reader_gen.go index 9dc2a14ca3..143b2f5e92 100644 --- a/internal/pkg/input/pseudo_reader_gen.go +++ b/internal/pkg/input/pseudo_reader_gen.go @@ -11,7 +11,7 @@ import ( type PseudoReaderGen struct { readerOptions *cli.TReaderOptions - recordsPerBatch int + recordsPerBatch int // distinct from readerOptions.RecordsPerBatch for join/repl } func NewPseudoReaderGen( @@ -42,7 +42,7 @@ func (reader *PseudoReaderGen) process( downstreamDoneChannel <-chan bool, // for mlr head ) { context.UpdateForStartOfFile("(gen-pseudo-reader)") - recordsPerBatch := reader.readerOptions.RecordsPerBatch + recordsPerBatch := reader.recordsPerBatch start, err := reader.tryParse("start", 
reader.readerOptions.GeneratorOptions.StartAsString) if err != nil { diff --git a/internal/pkg/input/record_reader_csv.go b/internal/pkg/input/record_reader_csv.go index 1c4bbf5d40..a961e057c0 100644 --- a/internal/pkg/input/record_reader_csv.go +++ b/internal/pkg/input/record_reader_csv.go @@ -18,7 +18,7 @@ import ( // ---------------------------------------------------------------- type RecordReaderCSV struct { readerOptions *cli.TReaderOptions - recordsPerBatch int + recordsPerBatch int // distinct from readerOptions.RecordsPerBatch for join/repl ifs0 byte // Go's CSV library only lets its 'Comma' be a single character filename string @@ -93,7 +93,7 @@ func (reader *RecordReaderCSV) processHandle( downstreamDoneChannel <-chan bool, // for mlr head ) { context.UpdateForStartOfFile(filename) - recordsPerBatch := reader.readerOptions.RecordsPerBatch + recordsPerBatch := reader.recordsPerBatch // Reset state for start of next input file reader.filename = filename @@ -109,7 +109,9 @@ func (reader *RecordReaderCSV) processHandle( for { recordsAndContexts, eof := reader.getRecordBatch(csvRecordsChannel, errorChannel, context) - readerChannel <- recordsAndContexts + if recordsAndContexts.Len() > 0 { + readerChannel <- recordsAndContexts + } if eof { break } diff --git a/internal/pkg/input/record_reader_csvlite.go b/internal/pkg/input/record_reader_csvlite.go index ccf5aafc7c..b9f1c65d2f 100644 --- a/internal/pkg/input/record_reader_csvlite.go +++ b/internal/pkg/input/record_reader_csvlite.go @@ -46,7 +46,7 @@ type recordBatchGetterCSV func( type RecordReaderCSVLite struct { readerOptions *cli.TReaderOptions - recordsPerBatch int + recordsPerBatch int // distinct from readerOptions.RecordsPerBatch for join/repl recordBatchGetter recordBatchGetterCSV @@ -151,14 +151,16 @@ func (reader *RecordReaderCSVLite) processHandle( reader.inputLineNumber = 0 reader.headerStrings = nil - recordsPerBatch := reader.readerOptions.RecordsPerBatch + recordsPerBatch := reader.recordsPerBatch lineScanner := NewLineScanner(handle, reader.readerOptions.IRS) linesChannel := make(chan *list.List, recordsPerBatch) go channelizedLineScanner(lineScanner, linesChannel, downstreamDoneChannel, recordsPerBatch) for { recordsAndContexts, eof := reader.recordBatchGetter(reader, linesChannel, filename, context, errorChannel) - readerChannel <- recordsAndContexts + if recordsAndContexts.Len() > 0 { + readerChannel <- recordsAndContexts + } if eof { break } diff --git a/internal/pkg/input/record_reader_dkvp_nidx.go b/internal/pkg/input/record_reader_dkvp_nidx.go index 73c34e7d51..c2e3140ede 100644 --- a/internal/pkg/input/record_reader_dkvp_nidx.go +++ b/internal/pkg/input/record_reader_dkvp_nidx.go @@ -19,7 +19,7 @@ type splitter_DKVP_NIDX func(reader *RecordReaderDKVPNIDX, line string) *types.M type RecordReaderDKVPNIDX struct { readerOptions *cli.TReaderOptions - recordsPerBatch int + recordsPerBatch int // distinct from readerOptions.RecordsPerBatch for join/repl splitter splitter_DKVP_NIDX } @@ -92,7 +92,7 @@ func (reader *RecordReaderDKVPNIDX) processHandle( downstreamDoneChannel <-chan bool, // for mlr head ) { context.UpdateForStartOfFile(filename) - recordsPerBatch := reader.readerOptions.RecordsPerBatch + recordsPerBatch := reader.recordsPerBatch lineScanner := NewLineScanner(handle, reader.readerOptions.IRS) linesChannel := make(chan *list.List, recordsPerBatch) @@ -100,7 +100,9 @@ func (reader *RecordReaderDKVPNIDX) processHandle( for { recordsAndContexts, eof := reader.getRecordBatch(linesChannel, context) - 
readerChannel <- recordsAndContexts + if recordsAndContexts.Len() > 0 { + readerChannel <- recordsAndContexts + } if eof { break } diff --git a/internal/pkg/input/record_reader_json.go b/internal/pkg/input/record_reader_json.go index ce4eaab090..13f76222e9 100644 --- a/internal/pkg/input/record_reader_json.go +++ b/internal/pkg/input/record_reader_json.go @@ -17,7 +17,7 @@ import ( type RecordReaderJSON struct { readerOptions *cli.TReaderOptions - recordsPerBatch int + recordsPerBatch int // distinct from readerOptions.RecordsPerBatch for join/repl } func NewRecordReaderJSON( @@ -78,7 +78,7 @@ func (reader *RecordReaderJSON) processHandle( ) { context.UpdateForStartOfFile(filename) // TODO: comment - recordsPerBatch := reader.readerOptions.RecordsPerBatch + recordsPerBatch := reader.recordsPerBatch if reader.readerOptions.CommentHandling != cli.CommentsAreData { handle = NewJSONCommentEnabledReader(handle, reader.readerOptions, readerChannel) diff --git a/internal/pkg/input/record_reader_xtab.go b/internal/pkg/input/record_reader_xtab.go index e24fd27659..4843cebd25 100644 --- a/internal/pkg/input/record_reader_xtab.go +++ b/internal/pkg/input/record_reader_xtab.go @@ -14,7 +14,8 @@ import ( type RecordReaderXTAB struct { readerOptions *cli.TReaderOptions - recordsPerBatch int + recordsPerBatch int // distinct from readerOptions.RecordsPerBatch for join/repl + // Note: XTAB uses two consecutive IFS in place of an IRS; IRS is ignored } @@ -75,7 +76,7 @@ func (reader *RecordReaderXTAB) processHandle( downstreamDoneChannel <-chan bool, // for mlr head ) { context.UpdateForStartOfFile(filename) - recordsPerBatch := reader.readerOptions.RecordsPerBatch + recordsPerBatch := reader.recordsPerBatch // XTAB uses repeated IFS, rather than IRS, to delimit records lineScanner := NewLineScanner(handle, reader.readerOptions.IFS) @@ -85,7 +86,9 @@ func (reader *RecordReaderXTAB) processHandle( for { recordsAndContexts, eof := reader.getRecordBatch(stanzasChannel, context, errorChannel) - readerChannel <- recordsAndContexts + if recordsAndContexts.Len() > 0 { + readerChannel <- recordsAndContexts + } if eof { break } @@ -121,6 +124,22 @@ func channelizedStanzaScanner( for lineScanner.Scan() { line := lineScanner.Text() + + // TODO: stanzas should pair data-list and comment-list ... + // Check for comments-in-data feature + // TODO: function-pointer this away + // if reader.readerOptions.CommentHandling != cli.CommentsAreData { + // if strings.HasPrefix(line, reader.readerOptions.CommentString) { + // if reader.readerOptions.CommentHandling == cli.PassComments { + // recordsAndContexts.PushBack(types.NewOutputString(line+reader.readerOptions.IFS, context)) + // continue + // } else if reader.readerOptions.CommentHandling == cli.SkipComments { + // continue + // } + // // else comments are data + // } + // } + if line == "" { // Empty-line handling: // 1. First empty line(s) in the stream are ignored. 
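
A minimal, self-contained sketch of the stanza model the XTAB-reader hunks above work with, under stated assumptions: records are blank-line-delimited stanzas of "key value" lines, leading and repeated blank lines are ignored, and the final stanza may arrive without a trailing blank line. This is hypothetical illustration code, not Miller's actual reader; the names parseStanzas and pair are invented here.

package main

import (
	"bufio"
	"fmt"
	"strings"
)

// pair is one key-value field; a stanza of "key value" lines becomes one
// ordered record, represented here as a slice of pairs.
type pair struct{ key, value string }

// parseStanzas splits input into records at blank lines. The final stanza
// may lack a trailing blank line, so it is flushed after the scan loop.
func parseStanzas(input string) [][]pair {
	var records [][]pair
	var stanza []pair
	scanner := bufio.NewScanner(strings.NewReader(input))
	for scanner.Scan() {
		line := scanner.Text()
		if line == "" {
			if len(stanza) > 0 { // leading/repeated blank lines are ignored
				records = append(records, stanza)
				stanza = nil
			}
			continue
		}
		key, value, _ := strings.Cut(line, " ")
		stanza = append(stanza, pair{key, strings.TrimLeft(value, " ")})
	}
	if len(stanza) > 0 { // final stanza without trailing blank line
		records = append(records, stanza)
	}
	return records
}

func main() {
	input := "x 1\ny 2\n\nx 3\ny 4" // note: no newline after the last stanza
	for i, record := range parseStanzas(input) {
		fmt.Println(i, record)
	}
}

In the real reader this parsing runs in its own goroutine and the resulting records are accumulated into batches before each channel send, per the batching comments above.
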
@@ -193,21 +212,6 @@ func (reader *RecordReaderXTAB) getRecordBatch( for e := stanzas.Front(); e != nil; e = e.Next() { stanza := e.Value.(*list.List) - // // TODO: move - // // Check for comments-in-data feature - // // TODO: function-pointer this away - // if reader.readerOptions.CommentHandling != cli.CommentsAreData { - // if strings.HasPrefix(line, reader.readerOptions.CommentString) { - // if reader.readerOptions.CommentHandling == cli.PassComments { - // recordsAndContexts.PushBack(types.NewOutputString(line+reader.readerOptions.IFS, context)) - // continue - // } else if reader.readerOptions.CommentHandling == cli.SkipComments { - // continue - // } - // // else comments are data - // } - // } - lib.InternalCodingErrorIf(stanza.Len() == 0) record, err := reader.recordFromXTABLines(stanza) diff --git a/internal/pkg/transformers/utils/join-bucket-keeper.go b/internal/pkg/transformers/utils/join-bucket-keeper.go index 5132ec2a7c..c5be1253bc 100644 --- a/internal/pkg/transformers/utils/join-bucket-keeper.go +++ b/internal/pkg/transformers/utils/join-bucket-keeper.go @@ -178,7 +178,7 @@ func NewJoinBucketKeeper( initialContext.UpdateForStartOfFile(leftFileName) // Set up channels for the record-reader - readerChannel := make(chan *list.List, 10) // list of *types.RecordAndContext + readerChannel := make(chan *list.List, 2) // list of *types.RecordAndContext errorChannel := make(chan error, 1) downstreamDoneChannel := make(chan bool, 1) diff --git a/test/cases/io-skip-pass-comments/0022/expout b/test/cases/io-skip-pass-comments/0022/expout index b13f182fb3..2488533442 100644 --- a/test/cases/io-skip-pass-comments/0022/expout +++ b/test/cases/io-skip-pass-comments/0022/expout @@ -1,3 +1,3 @@ -a=1,b=2,c=3 # hello world 2 +a=1,b=2,c=3 a=4,b=5,c=6 diff --git a/test/cases/io-skip-pass-comments/0024/expout b/test/cases/io-skip-pass-comments/0024/expout index 23c5e97c80..5afb11391c 100644 --- a/test/cases/io-skip-pass-comments/0024/expout +++ b/test/cases/io-skip-pass-comments/0024/expout @@ -1,3 +1,3 @@ +# hello world 3 a=1,b=2,c=3 a=4,b=5,c=6 -# hello world 3 diff --git a/todo.txt b/todo.txt index f4f8a610fa..0832f14c28 100644 --- a/todo.txt +++ b/todo.txt @@ -5,10 +5,8 @@ PUNCHDOWN LIST * big-picture item @ Rmd; also webdoc intro page * mr -S + test/cases/dsl-array-map-indexing test/cases/io-skip-pass-comments - test/cases/repl - test/cases/verb-join - test/cases/verb-join-mixed-format * perf wup @ rgp.md * perf: From 8cea6ae70a0e7a24d905d2c8bf00ee7ee50465c7 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Mon, 13 Dec 2021 00:23:24 -0500 Subject: [PATCH 26/28] Fix comment-handling in channelized XTAB reader --- internal/pkg/input/record_reader_xtab.go | 76 ++++++++++++++++-------- 1 file changed, 51 insertions(+), 25 deletions(-) diff --git a/internal/pkg/input/record_reader_xtab.go b/internal/pkg/input/record_reader_xtab.go index 4843cebd25..8213916ae9 100644 --- a/internal/pkg/input/record_reader_xtab.go +++ b/internal/pkg/input/record_reader_xtab.go @@ -19,6 +19,24 @@ type RecordReaderXTAB struct { // Note: XTAB uses two consecutive IFS in place of an IRS; IRS is ignored } +// tStanza is for the channelized reader which operates (for performance) in +// its own goroutine. An XTAB "stanza" is a collection of lines which will be +// parsed as a Miller record. Also for performance (to reduce +// goroutine-scheduler thrash) stanzas are delivered in batches (nominally max +// 500 or so). This struct helps us keep each stanza's comment lines along with +// the stanza they originated in. 
+type tStanza struct { + dataLines *list.List + commentLines *list.List +} + +func newStanza() *tStanza { + return &tStanza{ + dataLines: list.New(), + commentLines: list.New(), + } +} + func NewRecordReaderXTAB( readerOptions *cli.TReaderOptions, recordsPerBatch int, @@ -82,7 +100,8 @@ func (reader *RecordReaderXTAB) processHandle( lineScanner := NewLineScanner(handle, reader.readerOptions.IFS) stanzasChannel := make(chan *list.List, recordsPerBatch) - go channelizedStanzaScanner(lineScanner, stanzasChannel, downstreamDoneChannel, recordsPerBatch) + go channelizedStanzaScanner(lineScanner, reader.readerOptions, stanzasChannel, downstreamDoneChannel, + recordsPerBatch) for { recordsAndContexts, eof := reader.getRecordBatch(stanzasChannel, context, errorChannel) @@ -111,6 +130,7 @@ func (reader *RecordReaderXTAB) processHandle( // record. func channelizedStanzaScanner( lineScanner *bufio.Scanner, + readerOptions *cli.TReaderOptions, stanzasChannel chan<- *list.List, // list of list of string downstreamDoneChannel <-chan bool, // for mlr head recordsPerBatch int, @@ -120,25 +140,24 @@ func channelizedStanzaScanner( done := false stanzas := list.New() - stanza := list.New() + stanza := newStanza() for lineScanner.Scan() { line := lineScanner.Text() - // TODO: stanzas should pair data-list and comment-list ... // Check for comments-in-data feature // TODO: function-pointer this away - // if reader.readerOptions.CommentHandling != cli.CommentsAreData { - // if strings.HasPrefix(line, reader.readerOptions.CommentString) { - // if reader.readerOptions.CommentHandling == cli.PassComments { - // recordsAndContexts.PushBack(types.NewOutputString(line+reader.readerOptions.IFS, context)) - // continue - // } else if reader.readerOptions.CommentHandling == cli.SkipComments { - // continue - // } - // // else comments are data - // } - // } + if readerOptions.CommentHandling != cli.CommentsAreData { + if strings.HasPrefix(line, readerOptions.CommentString) { + if readerOptions.CommentHandling == cli.PassComments { + stanza.commentLines.PushBack(line) + continue + } else if readerOptions.CommentHandling == cli.SkipComments { + continue + } + // else comments are data + } + } if line == "" { // Empty-line handling: @@ -149,7 +168,7 @@ func channelizedStanzaScanner( inStanza = false stanzas.PushBack(stanza) numStanzasSeen++ - stanza = list.New() + stanza = newStanza() } else { continue } @@ -157,7 +176,7 @@ func channelizedStanzaScanner( if !inStanza { inStanza = true } - stanza.PushBack(line) + stanza.dataLines.PushBack(line) } // See if downstream processors will be ignoring further data (e.g. mlr @@ -185,7 +204,7 @@ func channelizedStanzaScanner( // The last stanza may not have a trailing newline after it. Any lines in the stanza // at this point will form the final record in the stream. 
- if stanza.Len() > 0 { + if stanza.dataLines.Len() > 0 || stanza.commentLines.Len() > 0 { stanzas.PushBack(stanza) } @@ -210,17 +229,24 @@ func (reader *RecordReaderXTAB) getRecordBatch( } for e := stanzas.Front(); e != nil; e = e.Next() { - stanza := e.Value.(*list.List) + stanza := e.Value.(*tStanza) - lib.InternalCodingErrorIf(stanza.Len() == 0) + if stanza.commentLines.Len() > 0 { + for f := stanza.commentLines.Front(); f != nil; f = f.Next() { + line := f.Value.(string) + recordsAndContexts.PushBack(types.NewOutputString(line+reader.readerOptions.IFS, context)) + } + } - record, err := reader.recordFromXTABLines(stanza) - if err != nil { - errorChannel <- err - return + if stanza.dataLines.Len() > 0 { + record, err := reader.recordFromXTABLines(stanza.dataLines) + if err != nil { + errorChannel <- err + return + } + context.UpdateForInputRecord() + recordsAndContexts.PushBack(types.NewRecordAndContext(record, context)) } - context.UpdateForInputRecord() - recordsAndContexts.PushBack(types.NewRecordAndContext(record, context)) } return recordsAndContexts, false From fe6d6cfd4e5c8518a060818e611ead164a4ede39 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Mon, 13 Dec 2021 00:42:51 -0500 Subject: [PATCH 27/28] Fix bug found in positional-rename --- internal/pkg/types/mlrmap_accessors.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/internal/pkg/types/mlrmap_accessors.go b/internal/pkg/types/mlrmap_accessors.go index b425c07f19..445d5865e2 100644 --- a/internal/pkg/types/mlrmap_accessors.go +++ b/internal/pkg/types/mlrmap_accessors.go @@ -296,7 +296,6 @@ func (mlrmap *Mlrmap) PutNameWithPositionalIndex(position int, name *Mlrval) { return } - // TODO: rekey the hashmap s := "" if name.mvtype == MT_STRING { s = name.printrep @@ -311,11 +310,18 @@ func (mlrmap *Mlrmap) PutNameWithPositionalIndex(position int, name *Mlrval) { // and the user does '$[[1]] = $[[2]]'. Then there would be two b's. mapEntry := mlrmap.findEntry(s) if mapEntry != nil && mapEntry != positionalEntry { + if mlrmap.keysToEntries != nil { + delete(mlrmap.keysToEntries, positionalEntry.Key) + } mlrmap.Unlink(mapEntry) } lib.InternalCodingErrorIf(s == "") positionalEntry.Key = s + + if mlrmap.keysToEntries != nil { + mlrmap.keysToEntries[s] = positionalEntry + } } // ---------------------------------------------------------------- From 447f1396a610213a34e18a3148ae2de4a055c5e0 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Tue, 14 Dec 2021 22:32:45 -0500 Subject: [PATCH 28/28] Use --no-hash-records by default --- docs/src/manpage.md | 14 +++++++++++++- docs/src/manpage.txt | 14 +++++++++++++- docs/src/reference-main-flag-list.md | 6 ++++++ internal/pkg/cli/option_parse.go | 8 ++++++-- internal/pkg/types/mlrmap.go | 2 +- man/manpage.txt | 14 +++++++++++++- man/mlr.1 | 16 ++++++++++++++-- 7 files changed, 66 insertions(+), 8 deletions(-) diff --git a/docs/src/manpage.md b/docs/src/manpage.md index a4e1ae3a40..12aadf7c36 100644 --- a/docs/src/manpage.md +++ b/docs/src/manpage.md @@ -478,6 +478,14 @@ MISCELLANEOUS FLAGS rather than after. May be used more than once. Example: `mlr --from a.dat --from b.dat cat` is the same as `mlr cat a.dat b.dat`. + --hash-records This is an internal parameter which normally does not + need to be modified. It controls the mechanism by + which Miller accesses fields within records. In + general --no-hash-records is faster, and is the + default. 
For specific use-cases involving data having
+ many fields, and many of them being processed during
+ a given processing run, --hash-records might offer a
+ slight performance benefit.
 --infer-int-as-float or -A
 Cast all integers in data files to floats.
 --infer-no-octal or -O
 Treat numbers like 0123 in data files as string
@@ -508,12 +516,16 @@
 unlikely to be a noticeable performance improvement,
 since direct-to-screen output for large files has its
 own overhead.
+ --no-hash-records See --hash-records.
 --nr-progress-mod {m} With m a positive integer: print filename and record
 count to os.Stderr every m input records.
 --ofmt {format} E.g. `%.18f`, `%.0f`, `%9.6e`. Please use
 sprintf-style codes for floating-point numbers. If
 not specified, default formatting is used. See also
 the `fmtnum` function and the `format-values` verb.
+ --records-per-batch {n} This is an internal parameter setting the maximum
+ number of records per batch. Normally this does not
+ need to be modified.
 --seed {n} with `n` of the form `12345678` or `0xcafefeed`. For
 `put`/`filter` `urand`, `urandint`, and `urand32`.
 --tz {timezone} Specify timezone, overriding `$TZ` environment
@@ -2994,5 +3006,5 @@ SEE ALSO


- 2021-12-07 MILLER(1)
+ 2021-12-15 MILLER(1)
diff --git a/docs/src/manpage.txt b/docs/src/manpage.txt
index 47f2a7d808..36a13164c6 100644
--- a/docs/src/manpage.txt
+++ b/docs/src/manpage.txt
@@ -457,6 +457,14 @@ MISCELLANEOUS FLAGS
 rather than after. May be used more than once.
 Example: `mlr --from a.dat --from b.dat cat` is the
 same as `mlr cat a.dat b.dat`.
+ --hash-records This is an internal parameter which normally does not
+ need to be modified. It controls the mechanism by
+ which Miller accesses fields within records. In
+ general --no-hash-records is faster, and is the
+ default. For specific use-cases involving data having
+ many fields, and many of them being processed during
+ a given processing run, --hash-records might offer a
+ slight performance benefit.
 --infer-int-as-float or -A
 Cast all integers in data files to floats.
 --infer-no-octal or -O
 Treat numbers like 0123 in data files as string
@@ -487,12 +495,16 @@ MISCELLANEOUS FLAGS
 unlikely to be a noticeable performance improvement,
 since direct-to-screen output for large files has its
 own overhead.
+ --no-hash-records See --hash-records.
 --nr-progress-mod {m} With m a positive integer: print filename and record
 count to os.Stderr every m input records.
 --ofmt {format} E.g. `%.18f`, `%.0f`, `%9.6e`. Please use
 sprintf-style codes for floating-point numbers. If
 not specified, default formatting is used. See also
 the `fmtnum` function and the `format-values` verb.
+ --records-per-batch {n} This is an internal parameter setting the maximum
+ number of records per batch. Normally this does not
+ need to be modified.
 --seed {n} with `n` of the form `12345678` or `0xcafefeed`. For
 `put`/`filter` `urand`, `urandint`, and `urand32`.
 --tz {timezone} Specify timezone, overriding `$TZ` environment
@@ -2973,4 +2985,4 @@ SEE ALSO


- 2021-12-07 MILLER(1)
+ 2021-12-15 MILLER(1)
diff --git a/docs/src/reference-main-flag-list.md b/docs/src/reference-main-flag-list.md
index 40e18cfbcd..3c4ce7ea24 100644
--- a/docs/src/reference-main-flag-list.md
+++ b/docs/src/reference-main-flag-list.md
@@ -341,6 +341,8 @@ These are flags which don't fit into any other category.
`: Force buffered output to be written after every output record.
 The default is flush output after every record if the output is to the terminal, or less often if the output is to a file or a pipe. The default is a significant performance optimization for large files. Use this flag to force frequent updates even when output is to a pipe or file, at a performance cost.
* `--from {filename}
`: Use this to specify an input file before the verb(s), rather than after. May be used more than once. Example: `mlr --from a.dat --from b.dat cat` is the same as `mlr cat a.dat b.dat`.
+* `--hash-records
+`: This is an internal parameter which normally does not need to be modified. It controls the mechanism by which Miller accesses fields within records. In general --no-hash-records is faster, and is the default. For specific use-cases involving data having many fields, and many of them being processed during a given processing run, --hash-records might offer a slight performance benefit.
* `--infer-int-as-float or -A
`: Cast all integers in data files to floats.
* `--infer-no-octal or -O
`: Treat numbers like 0123 in data files as string
@@ -355,10 +357,14 @@ These are flags which don't fit into any other category.
`: Like `--load` but works with more than one filename, e.g. `--mload *.mlr --`.
* `--no-fflush
`: Let buffered output not be written after every output record. The default is flush output after every record if the output is to the terminal, or less often if the output is to a file or a pipe. The default is a significant performance optimization for large files. Use this flag to allow less-frequent updates when output is to the terminal. This is unlikely to be a noticeable performance improvement, since direct-to-screen output for large files has its own overhead.
+* `--no-hash-records
+`: See --hash-records.
* `--nr-progress-mod {m}
`: With m a positive integer: print filename and record count to os.Stderr every m input records.
* `--ofmt {format}
`: E.g. `%.18f`, `%.0f`, `%9.6e`. Please use sprintf-style codes for floating-point numbers. If not specified, default formatting is used. See also the `fmtnum` function and the `format-values` verb.
+* `--records-per-batch {n}
+`: This is an internal parameter setting the maximum number of records per batch. Normally this does not need to be modified.
* `--seed {n}
`: with `n` of the form `12345678` or `0xcafefeed`. For `put`/`filter` `urand`, `urandint`, and `urand32`.
* `--tz {timezone}
diff --git a/internal/pkg/cli/option_parse.go b/internal/pkg/cli/option_parse.go
index 1fd60d0ebf..839e0282a0 100644
--- a/internal/pkg/cli/option_parse.go
+++ b/internal/pkg/cli/option_parse.go
@@ -2578,7 +2578,11 @@ this does not need to be modified.`,
 {
 name: "--hash-records",
- help: `This is an internal parameter which normally does not need to be modified.`,
+ help: `This is an internal parameter which normally does not need to be modified.
+It controls the mechanism by which Miller accesses fields within records.
+In general --no-hash-records is faster, and is the default.
 For specific use-cases involving
+data having many fields, and many of them being processed during a given processing run,
+--hash-records might offer a slight performance benefit.`,
 parser: func(args []string, argc int, pargi *int, options *TOptions) {
 types.HashRecords(true)
 *pargi += 1
@@ -2587,7 +2591,7 @@
 {
 name: "--no-hash-records",
- help: `This is an internal parameter which normally does not need to be modified.`,
+ help: `See --hash-records.`,
 parser: func(args []string, argc int, pargi *int, options *TOptions) {
 types.HashRecords(false)
 *pargi += 1
diff --git a/internal/pkg/types/mlrmap.go b/internal/pkg/types/mlrmap.go
index 683792559c..be344050c6 100644
--- a/internal/pkg/types/mlrmap.go
+++ b/internal/pkg/types/mlrmap.go
@@ -58,7 +58,7 @@ package types
 // Both these figures are for just doing mlr cat. At the moment I'm leaving this
 // default-on pending more profiling on more complex record-processing operations
 // such as mlr sort.
-var hashRecords = true
+var hashRecords = false
 
 func HashRecords(onOff bool) {
 hashRecords = onOff
diff --git a/man/manpage.txt b/man/manpage.txt
index 47f2a7d808..36a13164c6 100644
--- a/man/manpage.txt
+++ b/man/manpage.txt
@@ -457,6 +457,14 @@ MISCELLANEOUS FLAGS
 rather than after. May be used more than once.
 Example: `mlr --from a.dat --from b.dat cat` is the
 same as `mlr cat a.dat b.dat`.
+ --hash-records This is an internal parameter which normally does not
+ need to be modified. It controls the mechanism by
+ which Miller accesses fields within records. In
+ general --no-hash-records is faster, and is the
+ default. For specific use-cases involving data having
+ many fields, and many of them being processed during
+ a given processing run, --hash-records might offer a
+ slight performance benefit.
 --infer-int-as-float or -A
 Cast all integers in data files to floats.
 --infer-no-octal or -O
 Treat numbers like 0123 in data files as string
@@ -487,12 +495,16 @@ MISCELLANEOUS FLAGS
 unlikely to be a noticeable performance improvement,
 since direct-to-screen output for large files has its
 own overhead.
+ --no-hash-records See --hash-records.
 --nr-progress-mod {m} With m a positive integer: print filename and record
 count to os.Stderr every m input records.
 --ofmt {format} E.g. `%.18f`, `%.0f`, `%9.6e`. Please use
 sprintf-style codes for floating-point numbers. If
 not specified, default formatting is used. See also
 the `fmtnum` function and the `format-values` verb.
+ --records-per-batch {n} This is an internal parameter setting the maximum
+ number of records per batch. Normally this does not
+ need to be modified.
 --seed {n} with `n` of the form `12345678` or `0xcafefeed`. For
 `put`/`filter` `urand`, `urandint`, and `urand32`.
 --tz {timezone} Specify timezone, overriding `$TZ` environment
@@ -2973,4 +2985,4 @@ SEE ALSO


- 2021-12-07 MILLER(1)
+ 2021-12-15 MILLER(1)
diff --git a/man/mlr.1 b/man/mlr.1
index cf26a26e28..d8880c3269 100644
--- a/man/mlr.1
+++ b/man/mlr.1
@@ -2,12 +2,12 @@
 .\" Title: mlr
 .\" Author: [see the "AUTHOR" section]
 .\" Generator: ./mkman.rb
-.\" Date: 2021-12-07
+.\" Date: 2021-12-15
 .\" Manual: \ \&
 .\" Source: \ \&
 .\" Language: English
 .\"
-.TH "MILLER" "1" "2021-12-07" "\ \&" "\ \&"
+.TH "MILLER" "1" "2021-12-15" "\ \&" "\ \&"
 .\" -----------------------------------------------------------------
 .\" * Portability definitions
 .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -576,6 +576,14 @@ These are flags which don't fit into any other category.
 rather than after. May be used more than once.
 Example: `mlr --from a.dat --from b.dat cat` is the
 same as `mlr cat a.dat b.dat`.
+--hash-records This is an internal parameter which normally does not
+ need to be modified. It controls the mechanism by
+ which Miller accesses fields within records. In
+ general --no-hash-records is faster, and is the
+ default. For specific use-cases involving data having
+ many fields, and many of them being processed during
+ a given processing run, --hash-records might offer a
+ slight performance benefit.
 --infer-int-as-float or -A
 Cast all integers in data files to floats.
 --infer-no-octal or -O
 Treat numbers like 0123 in data files as string
@@ -606,12 +614,16 @@
 unlikely to be a noticeable performance improvement,
 since direct-to-screen output for large files has its
 own overhead.
+--no-hash-records See --hash-records.
 --nr-progress-mod {m} With m a positive integer: print filename and record
 count to os.Stderr every m input records.
 --ofmt {format} E.g. `%.18f`, `%.0f`, `%9.6e`. Please use
 sprintf-style codes for floating-point numbers. If
 not specified, default formatting is used. See also
 the `fmtnum` function and the `format-values` verb.
+--records-per-batch {n} This is an internal parameter setting the maximum
+ number of records per batch. Normally this does not
+ need to be modified.
 --seed {n} with `n` of the form `12345678` or `0xcafefeed`. For
 `put`/`filter` `urand`, `urandint`, and `urand32`.
 --tz {timezone} Specify timezone, overriding `$TZ` environment
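
A closing sketch on the --hash-records/--no-hash-records tradeoff documented in the final patch above: field access in a record is either a linear scan over the insertion-ordered fields (cheap for the small records typical of streaming use, hence the new --no-hash-records default) or a lookup through an auxiliary key-to-entry map (worthwhile when records have many fields, each accessed during a run). The code below is a hedged, self-contained illustration, not Miller's actual mlrmap: entry, record, find, and rename are invented names, though keysToEntries mirrors the field named in the positional-rename fix above.

package main

import "fmt"

// entry is one field in an insertion-ordered record (a singly-linked list
// here, for brevity).
type entry struct {
	key, value string
	next       *entry
}

// record optionally carries a key-to-entry map; when the map is nil, field
// lookup falls back to a linear scan, the --no-hash-records behavior.
type record struct {
	head          *entry
	keysToEntries map[string]*entry
}

// find locates a field by key via the hashmap when present, else by scan.
func (r *record) find(key string) *entry {
	if r.keysToEntries != nil {
		return r.keysToEntries[key]
	}
	for e := r.head; e != nil; e = e.next {
		if e.key == key {
			return e
		}
	}
	return nil
}

// rename re-keys a field and, when hashing is enabled, keeps the map in
// sync (delete the old key, insert the new), which is the invariant the
// positional-rename fix restores.
func (r *record) rename(oldKey, newKey string) {
	e := r.find(oldKey)
	if e == nil {
		return
	}
	if r.keysToEntries != nil {
		delete(r.keysToEntries, oldKey)
		r.keysToEntries[newKey] = e
	}
	e.key = newKey
}

func main() {
	c := &entry{key: "c", value: "3"}
	b := &entry{key: "b", value: "2", next: c}
	r := &record{head: &entry{key: "a", value: "1", next: b}}

	fmt.Println(r.find("c").value) // linear scan: prints 3

	r.keysToEntries = map[string]*entry{"a": r.head, "b": b, "c": c}
	r.rename("b", "bb")
	fmt.Println(r.find("bb").value) // hashed lookup after rename: prints 2
}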