diff --git a/Dockerfile b/Dockerfile index a199826a8..7d6c10a23 100644 --- a/Dockerfile +++ b/Dockerfile @@ -43,7 +43,7 @@ RUN mv /tmp/qed/c-deps/* c-deps/ # Build QED, Storage binary and riot RUN go build -o /usr/local/bin/qed &&\ go build -o /usr/local/bin/riot tests/riot.go &&\ - go build -o /usr/local/bin/storage tests/gossip/test_service.go + go build -o /usr/local/bin/storage testutils/notifierstore.go # Clean RUN rm -rf /var/lib/apt/lists/* /tmp/qed \ No newline at end of file diff --git a/api/apihttp/apihttp.go b/api/apihttp/apihttp.go index b76a01b79..028832921 100644 --- a/api/apihttp/apihttp.go +++ b/api/apihttp/apihttp.go @@ -253,13 +253,13 @@ func Incremental(balloon raftwal.RaftBalloonApi) http.HandlerFunc { // Wait for the response proof, err := balloon.QueryConsistency(request.Start, request.End) if err != nil { - http.Error(w, err.Error(), http.StatusInternalServerError) + http.Error(w, err.Error(), http.StatusBadRequest) return } out, err := json.Marshal(protocol.ToIncrementalResponse(proof)) if err != nil { - http.Error(w, err.Error(), http.StatusInternalServerError) + http.Error(w, err.Error(), http.StatusBadRequest) return } diff --git a/balloon/balloon.go b/balloon/balloon.go index 528a213e6..c0364d129 100644 --- a/balloon/balloon.go +++ b/balloon/balloon.go @@ -288,10 +288,7 @@ func (b Balloon) QueryConsistency(start, end uint64) (*IncrementalProof, error) stats.AddFloat("QueryConsistency", 1) var proof IncrementalProof - if start >= b.version || - end >= b.version || - start >= end { - + if start >= b.version || end >= b.version || start > end { return nil, errors.New("unable to process proof from history tree: invalid range") } diff --git a/client/config.go b/client/config.go index 3224e8d92..a785e35da 100644 --- a/client/config.go +++ b/client/config.go @@ -89,52 +89,56 @@ const ( // Config sets the HTTP client configuration type Config struct { + // Log level + Log string `desc:"Set log level to info, error or debug"` + // Endpoints 
[host:port,host:port,...] to ask for QED cluster-topology. - Endpoints []string + Endpoints []string `desc:"REST QED Log service endpoint list http://ip1:port1,http://ip2:port2... "` // ApiKey to query the server endpoint. - APIKey string + APIKey string `desc:"Set API Key to talk to QED Log service"` // Insecure enables the verification of the server's certificate chain // and host name, allowing MiTM vector attacks. - Insecure bool + Insecure bool `desc:"Set it to true to disable the verification of the server's certificate chain"` - // Timeout is the number of seconds to wait for a request to QED. - Timeout time.Duration + // Timeout is the time to wait for a request to QED. + Timeout time.Duration `desc:"Time to wait for a request to QED"` - // DialTimeout is the number of seconds to wait for the connection to be established. - DialTimeout time.Duration + // DialTimeout is the time to wait for the connection to be established. + DialTimeout time.Duration `desc:"Time to wait for the connection to be established"` - // HandshakeTimeout is the number of seconds to wait for a handshake negotiation. - HandshakeTimeout time.Duration + // HandshakeTimeout is the time to wait for a handshake negotiation. + HandshakeTimeout time.Duration `desc:"Time to wait for a handshake negotiation"` // Controls how the client will route all queries to members of the cluster. - ReadPreference ReadPref + ReadPreference ReadPref `flag:"-"` // MaxRetries sets the maximum number of retries before giving up // when performing an HTTP request to QED. - MaxRetries int + MaxRetries int `desc:"Sets the maximum number of retries before giving up"` // EnableTopologyDiscovery enables the process of discovering the cluster // topology when requests fail. - EnableTopologyDiscovery bool + EnableTopologyDiscovery bool `desc:"Enables the process of discovering the cluster topology when requests fail"` // EnableHealthChecks enables helthchecks of all endpoints in the current cluster topology. 
- EnableHealthChecks bool + EnableHealthChecks bool `desc:"Enables helthchecks of all endpoints in the current cluster topology"` - // HealthCheckTimeout is the timeout in seconds the healthcheck waits for a response + // HealthCheckTimeout is the time the healthcheck waits for a response // from a QED server. - HealthCheckTimeout time.Duration + + HealthCheckTimeout time.Duration `desc:"Time the healthcheck waits for a response from QED"` // AttemptToReviveEndpoints sets if dead endpoints will be marked alive again after a // round-robin round. This way, they will be picked up in the next try. - AttemptToReviveEndpoints bool + AttemptToReviveEndpoints bool `desc:"Set if dead endpoints will be marked alive again after a round-robin round"` } // DefaultConfig creates a Config structures with default values. func DefaultConfig() *Config { return &Config{ - Endpoints: []string{"127.0.0.1:8800"}, + Endpoints: []string{"http://127.0.0.1:8800"}, APIKey: "my-key", Insecure: DefaultInsecure, Timeout: DefaultTimeout, diff --git a/cmd/agent.go b/cmd/agent.go index 34538c379..6eeb5e62a 100644 --- a/cmd/agent.go +++ b/cmd/agent.go @@ -17,76 +17,45 @@ package cmd import ( - "regexp" - - "github.com/spf13/cobra" - v "github.com/spf13/viper" + "context" "github.com/bbva/qed/gossip" + "github.com/bbva/qed/log" + "github.com/octago/sflags/gen/gpflag" + "github.com/spf13/cobra" ) -func newAgentCommand(cmdCtx *cmdContext, args []string) *cobra.Command { - - config := gossip.DefaultConfig() - - cmd := &cobra.Command{ - Use: "agent", - Short: "Start a gossip agent for the verifiable log QED", - } - - f := cmd.PersistentFlags() - f.StringVar(&config.NodeName, "node", "", "Unique name for node. 
If not set, fallback to hostname") - f.StringVar(&config.BindAddr, "bind", "", "Bind address for TCP/UDP gossip on (host:port)") - f.StringVar(&config.AdvertiseAddr, "advertise", "", "Address to advertise to cluster") - f.StringVar(&config.MetricsAddr, "metrics", "", "Address to bind metrics endpoint") - f.StringSliceVar(&config.StartJoin, "join", []string{}, "Comma-delimited list of nodes ([host]:port), through which a cluster can be joined") - f.StringSliceVar(&config.AlertsUrls, "alertsUrls", []string{}, "Comma-delimited list of Alert servers ([host]:port), through which an agent can post alerts") - - // Lookups - v.BindPFlag("agent.node", f.Lookup("node")) - v.BindPFlag("agent.bind", f.Lookup("bind")) - v.BindPFlag("agent.advertise", f.Lookup("advertise")) - v.BindPFlag("agent.metrics", f.Lookup("metrics")) - v.BindPFlag("agent.join", f.Lookup("join")) - v.BindPFlag("agent.alerts_urls", f.Lookup("alertsUrls")) - - agentPreRun := func(config gossip.Config) gossip.Config { - config.EnableCompression = true - config.NodeName = v.GetString("agent.node") - config.BindAddr = v.GetString("agent.bind") - config.AdvertiseAddr = v.GetString("agent.advertise") - config.MetricsAddr = v.GetString("agent.metrics") - config.StartJoin = v.GetStringSlice("agent.join") - config.AlertsUrls = v.GetStringSlice("agent.alerts_urls") - - markStringRequired(config.NodeName, "node") - markStringRequired(config.BindAddr, "bind") - markSliceStringRequired(config.StartJoin, "join") - markSliceStringRequired(config.AlertsUrls, "alertsUrls") - - return config - } - - var kind string - re := regexp.MustCompile("^monitor$|^auditor$|^publisher$") - for _, arg := range args { - if re.MatchString(arg) { - kind = arg - break - } - } +var agentCmd *cobra.Command = &cobra.Command{ + Use: "agent", + Short: "Provides access to the QED gossip agents", + Long: `QED provides standalone agents to help maintain QED security. 
We have included +three agents into the distribution: + * Monitor agent: checks the lag of the system between the QED Log and the + Snapshot Store as seen by the gossip network + * Auditor agent: verifies QED membership proofs of the snapshots received + throught the gossip network + * Publisher agent: publish snapshots to the snapshot store`, + TraverseChildren: true, +} - switch kind { - case "publisher": - cmd.AddCommand(newAgentPublisherCommand(cmdCtx, *config, agentPreRun)) +var agentCtx context.Context = configAgent() - case "auditor": - cmd.AddCommand(newAgentAuditorCommand(cmdCtx, *config, agentPreRun)) +func init() { + agentCmd.MarkFlagRequired("bind-addr") + agentCmd.MarkFlagRequired("metrics-addr") + agentCmd.MarkFlagRequired("node-name") + agentCmd.MarkFlagRequired("role") + agentCmd.MarkFlagRequired("log") + Root.AddCommand(agentCmd) +} - case "monitor": - cmd.AddCommand(newAgentMonitorCommand(cmdCtx, *config, agentPreRun)) +func configAgent() context.Context { + conf := gossip.DefaultConfig() + err := gpflag.ParseTo(conf, agentCmd.PersistentFlags()) + if err != nil { + log.Fatalf("err: %v", err) } - return cmd - + return context.WithValue(Ctx, k("agent.config"), conf) } + diff --git a/cmd/agent_auditor.go b/cmd/agent_auditor.go index e6d8445f2..659708df6 100644 --- a/cmd/agent_auditor.go +++ b/cmd/agent_auditor.go @@ -3,7 +3,9 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -14,74 +16,182 @@ package cmd import ( - "github.com/spf13/cobra" - v "github.com/spf13/viper" + "context" + "fmt" + "github.com/bbva/qed/client" "github.com/bbva/qed/gossip" - "github.com/bbva/qed/gossip/auditor" - "github.com/bbva/qed/gossip/member" + "github.com/bbva/qed/hashing" "github.com/bbva/qed/log" - "github.com/bbva/qed/metrics" + "github.com/bbva/qed/protocol" "github.com/bbva/qed/util" + "github.com/octago/sflags/gen/gpflag" + "github.com/prometheus/client_golang/prometheus" + "github.com/spf13/cobra" ) -func newAgentAuditorCommand(ctx *cmdContext, config gossip.Config, agentPreRun func(gossip.Config) gossip.Config) *cobra.Command { +var ( + QedAuditorInstancesCount = prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "qed_auditor_instances_count", + Help: "Number of auditor agents running.", + }, + ) - auditorConfig := auditor.DefaultConfig() + QedAuditorBatchesProcessSeconds = prometheus.NewSummary( + prometheus.SummaryOpts{ + Name: "qed_auditor_batches_process_seconds", + Help: "Duration of Auditor batch processing", + }, + ) - cmd := &cobra.Command{ - Use: "auditor", - Short: "Start a QED auditor", - Long: `Start a QED auditor that reacts to snapshot batches propagated by QED servers and periodically executes membership queries to verify the inclusion of events`, - PreRun: func(cmd *cobra.Command, args []string) { + QedAuditorBatchesReceivedTotal = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "qed_auditor_batches_received_total", + Help: "Number of batches received by auditors.", + }, + ) - log.SetLogger("QEDAuditor", ctx.logLevel) + QedAuditorGetMembershipProofErrTotal = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "qed_auditor_get_membership_proof_err_total", + Help: "Number of errors trying to get membership proofs by auditors.", + }, + ) +) - // WARN: PersitentPreRun can't be nested and we're using it in cmd/root so inbetween preRuns - // must be curried. 
- config = agentPreRun(config) +var agentAuditorCmd = &cobra.Command{ + Use: "auditor", + Short: "Provides access to the QED gossip auditor agent", + Long: `Start a QED auditor that reacts to snapshot batches propagated +by QED servers and periodically executes membership queries to verify +the inclusion of events`, + RunE: runAgentAuditor, +} - auditorConfig.QEDUrls = v.GetStringSlice("agent.server_urls") - auditorConfig.PubUrls = v.GetStringSlice("agent.snapshots_store_urls") - auditorConfig.AlertsUrls = v.GetStringSlice("agent.alerts_urls") +var agentAuditorCtx context.Context - markSliceStringRequired(auditorConfig.QEDUrls, "qedUrls") - markSliceStringRequired(auditorConfig.PubUrls, "pubUrls") - markSliceStringRequired(auditorConfig.AlertsUrls, "alertsUrls") - }, - Run: func(cmd *cobra.Command, args []string) { +func init() { + agentAuditorCtx = configAuditor() + agentCmd.AddCommand(agentAuditorCmd) +} - config.Role = member.Auditor - auditorConfig.APIKey = ctx.apiKey +type auditorConfig struct { + Qed *client.Config + Notifier *gossip.SimpleNotifierConfig + Store *gossip.RestSnapshotStoreConfig + Tasks *gossip.SimpleTasksManagerConfig +} - auditor, err := auditor.NewAuditor(*auditorConfig) - if err != nil { - log.Fatalf("Failed to start the QED monitor: %v", err) - } - metricsServer := metrics.NewServer(config.MetricsAddr) - agent, err := gossip.NewAgent(&config, []gossip.Processor{auditor}, metricsServer) - if err != nil { - log.Fatalf("Failed to start the QED auditor: %v", err) - } +func newAuditorConfig() *auditorConfig { + return &auditorConfig{ + Qed: client.DefaultConfig(), + Notifier: gossip.DefaultSimpleNotifierConfig(), + Store: gossip.DefaultRestSnapshotStoreConfig(), + Tasks: gossip.DefaultSimpleTasksManagerConfig(), + } +} - contacted, err := agent.Join(config.StartJoin) - if err != nil { - log.Fatalf("Failed to join the cluster: %v", err) - } - log.Debugf("Number of nodes contacted: %d (%v)", contacted, config.StartJoin) +func configAuditor() 
context.Context { + conf := newAuditorConfig() + err := gpflag.ParseTo(conf, agentAuditorCmd.PersistentFlags()) + if err != nil { + log.Fatalf("err: %v", err) + } - defer agent.Shutdown() - util.AwaitTermSignal(agent.Leave) - }, + ctx := context.WithValue(agentCtx, k("auditor.config"), conf) + + return ctx +} + +func runAgentAuditor(cmd *cobra.Command, args []string) error { + agentConfig := agentAuditorCtx.Value(k("agent.config")).(*gossip.Config) + conf := agentAuditorCtx.Value(k("auditor.config")).(*auditorConfig) + + log.SetLogger("auditor", agentConfig.Log) + + notifier := gossip.NewSimpleNotifierFromConfig(conf.Notifier) + qed, err := client.NewHTTPClientFromConfig(conf.Qed) + if err != nil { + return err + } + tm := gossip.NewSimpleTasksManagerFromConfig(conf.Tasks) + store := gossip.NewRestSnapshotStoreFromConfig(conf.Store) + + agent, err := gossip.NewDefaultAgent(agentConfig, qed, store, tm, notifier) + if err != nil { + return err + } + + bp := gossip.NewBatchProcessor(agent, []gossip.TaskFactory{gossip.PrinterFactory{}, membershipFactory{}}) + agent.In.Subscribe(gossip.BatchMessageType, bp, 255) + defer bp.Stop() + + agent.Start() + + QedAuditorInstancesCount.Inc() + + util.AwaitTermSignal(agent.Shutdown) + return nil +} + +type membershipFactory struct{} + +func (m membershipFactory) Metrics() []prometheus.Collector { + return []prometheus.Collector{ + QedAuditorInstancesCount, + QedAuditorBatchesProcessSeconds, + QedAuditorBatchesReceivedTotal, + QedAuditorGetMembershipProofErrTotal, } +} - f := cmd.Flags() - f.StringSliceVarP(&auditorConfig.QEDUrls, "qedUrls", "", []string{}, "Comma-delimited list of QED servers ([host]:port), through which an auditor can make queries") - f.StringSliceVarP(&auditorConfig.PubUrls, "pubUrls", "", []string{}, "Comma-delimited list of store servers ([host]:port), through which an auditor can make queries") - f.StringSliceVarP(&auditorConfig.AlertsUrls, "alertsUrls", "", []string{}, "Comma-delimited list of alerts servers 
([host]:port), through which an auditor can make queries") - // Lookups - v.BindPFlag("agent.server_urls", f.Lookup("qedUrls")) - v.BindPFlag("agent.snapshots_store_urls", f.Lookup("pubUrls")) - v.BindPFlag("agent.alerts_urls", f.Lookup("alertsUrls")) - return cmd +func (i membershipFactory) New(ctx context.Context) gossip.Task { + a := ctx.Value("agent").(*gossip.Agent) + b := ctx.Value("batch").(*protocol.BatchSnapshots) + + s := b.Snapshots[0] + + QedAuditorBatchesReceivedTotal.Inc() + + return func() error { + timer := prometheus.NewTimer(QedAuditorBatchesProcessSeconds) + defer timer.ObserveDuration() + + proof, err := a.Qed.MembershipDigest(s.Snapshot.EventDigest, s.Snapshot.Version) + if err != nil { + log.Infof("Auditor is unable to get membership proof from QED server: %v", err) + + switch fmt.Sprintf("%T", err) { + case "*errors.errorString": + a.Notifier.Alert(fmt.Sprintf("Auditor is unable to get membership proof from QED server: %v", err)) + default: + QedAuditorGetMembershipProofErrTotal.Inc() + } + + return err + } + + storedSnap, err := a.SnapshotStore.GetSnapshot(proof.CurrentVersion) + if err != nil { + log.Infof("Unable to get snapshot from storage: %v", err) + return err + } + + checkSnap := &protocol.Snapshot{ + HistoryDigest: s.Snapshot.HistoryDigest, + HyperDigest: storedSnap.Snapshot.HyperDigest, + Version: s.Snapshot.Version, + EventDigest: s.Snapshot.EventDigest, + } + + ok := a.Qed.DigestVerify(proof, checkSnap, hashing.NewSha256Hasher) + if !ok { + a.Notifier.Alert(fmt.Sprintf("Unable to verify snapshot %v", s.Snapshot)) + log.Infof("Unable to verify snapshot %v", s.Snapshot) + } + + log.Infof("MembershipTask.Do(): Snapshot %v has been verified by QED", s.Snapshot) + return nil + } } diff --git a/cmd/agent_monitor.go b/cmd/agent_monitor.go index fda4db0f6..84b55c06a 100644 --- a/cmd/agent_monitor.go +++ b/cmd/agent_monitor.go @@ -3,7 +3,9 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 
in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -14,75 +16,256 @@ package cmd import ( - "github.com/spf13/cobra" - v "github.com/spf13/viper" + "context" + "fmt" + "sync/atomic" + "time" + "github.com/bbva/qed/client" "github.com/bbva/qed/gossip" - "github.com/bbva/qed/gossip/member" - "github.com/bbva/qed/gossip/monitor" + "github.com/bbva/qed/hashing" "github.com/bbva/qed/log" - "github.com/bbva/qed/metrics" + "github.com/bbva/qed/protocol" "github.com/bbva/qed/util" + "github.com/octago/sflags/gen/gpflag" + "github.com/prometheus/client_golang/prometheus" + "github.com/spf13/cobra" ) -func newAgentMonitorCommand(ctx *cmdContext, config gossip.Config, agentPreRun func(gossip.Config) gossip.Config) *cobra.Command { +var ( + QedMonitorInstancesCount = prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "qed_monitor_instances_count", + Help: "Number of monitor agents running.", + }, + ) - monitorConfig := monitor.DefaultConfig() + QedMonitorBatchesReceivedTotal = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "qed_monitor_batches_received_total", + Help: "Number of batches received by monitors.", + }, + ) - cmd := &cobra.Command{ - Use: "monitor", - Short: "Start a QED monitor", - Long: `Start a QED monitor that reacts to snapshot batches propagated by QED servers and periodically executes incremental queries to verify the consistency between snaphots`, - PreRun: func(cmd *cobra.Command, args []string) { + QedMonitorBatchesProcessSeconds = prometheus.NewSummary( + prometheus.SummaryOpts{ + Name: "qed_monitor_batches_process_seconds", + Help: "Duration of Monitor batch processing", + }, + ) - log.SetLogger("QEDMonitor", ctx.logLevel) + QedMonitorGetIncrementalProofErrTotal = 
prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "qed_monitor_get_incremental_proof_err_total", + Help: "Number of errors trying to get incremental proofs by monitors.", + }, + ) +) - // WARN: PersitentPreRun can't be nested and we're using it in cmd/root so inbetween preRuns - // must be curried. - config = agentPreRun(config) +var agentMonitorCmd *cobra.Command = &cobra.Command{ + Use: "monitor", + Short: "Provides access to the QED gossip monitor agent", + Long: `Stats a QED monitor which process gossip messages measuring +the lag between the gossip received messages and the contents of the +snapshotsore. It also executes incremental proof verification against +some of the snapshots received.`, + TraverseChildren: true, + RunE: runAgentMonitor, +} - // Bindings - monitorConfig.MetricsAddr = config.BindAddr // TODO: make MetricsAddr configurable - monitorConfig.QEDUrls = v.GetStringSlice("agent.server_urls") - monitorConfig.AlertsUrls = v.GetStringSlice("agent.alerts_urls") +var agentMonitorCtx context.Context - markSliceStringRequired(monitorConfig.QEDUrls, "qedUrls") - markSliceStringRequired(monitorConfig.AlertsUrls, "alertsUrls") - }, - Run: func(cmd *cobra.Command, args []string) { +func init() { + agentMonitorCtx = configMonitor() + agentCmd.AddCommand(agentMonitorCmd) +} - config.Role = member.Monitor - monitorConfig.APIKey = ctx.apiKey +type monitorConfig struct { + Qed *client.Config + Notifier *gossip.SimpleNotifierConfig + Store *gossip.RestSnapshotStoreConfig + Tasks *gossip.SimpleTasksManagerConfig +} - monitor, err := monitor.NewMonitor(monitorConfig) - if err != nil { - log.Fatalf("Failed to start the QED monitor: %v", err) - } - metricsServer := metrics.NewServer(config.MetricsAddr) - agent, err := gossip.NewAgent(&config, []gossip.Processor{monitor}, metricsServer) - if err != nil { - log.Fatalf("Failed to start the QED monitor: %v", err) - } - defer agent.Shutdown() +func newMonitorConfig() *monitorConfig { + return &monitorConfig{ + 
Qed: client.DefaultConfig(), + Notifier: gossip.DefaultSimpleNotifierConfig(), + Store: gossip.DefaultRestSnapshotStoreConfig(), + Tasks: gossip.DefaultSimpleTasksManagerConfig(), + } +} - contacted, err := agent.Join(config.StartJoin) - if err != nil { - log.Fatalf("Failed to join the cluster: %v", err) - } +func configMonitor() context.Context { + conf := newMonitorConfig() + err := gpflag.ParseTo(conf, agentMonitorCmd.PersistentFlags()) + if err != nil { + log.Fatalf("err: %v", err) + } - log.Debugf("Number of nodes contacted: %d", contacted) + ctx := context.WithValue(agentCtx, k("monitor.config"), conf) - util.AwaitTermSignal(agent.Leave) - }, + return ctx +} + +func runAgentMonitor(cmd *cobra.Command, args []string) error { + agentConfig := agentMonitorCtx.Value(k("agent.config")).(*gossip.Config) + conf := agentMonitorCtx.Value(k("monitor.config")).(*monitorConfig) + + log.SetLogger("monitor", agentConfig.Log) + + notifier := gossip.NewSimpleNotifierFromConfig(conf.Notifier) + conf.Qed.AttemptToReviveEndpoints = true + conf.Qed.ReadPreference = client.Any + qed, err := client.NewHTTPClientFromConfig(conf.Qed) + if err != nil { + return err + } + tm := gossip.NewSimpleTasksManagerFromConfig(conf.Tasks) + store := gossip.NewRestSnapshotStoreFromConfig(conf.Store) + + agent, err := gossip.NewDefaultAgent(agentConfig, qed, store, tm, notifier) + if err != nil { + return err } - f := cmd.Flags() - f.StringSliceVarP(&monitorConfig.QEDUrls, "qedUrls", "", []string{}, "Comma-delimited list of QED servers ([host]:port), through which a monitor can make queries") - f.StringSliceVarP(&monitorConfig.AlertsUrls, "alertsUrls", "", []string{}, "Comma-delimited list of QED servers ([host]:port), through which an monitor can publish alerts") + lagf := newLagFactory(1 * time.Second) + lagf.start() + defer lagf.stop() + bp := gossip.NewBatchProcessor(agent, []gossip.TaskFactory{gossip.PrinterFactory{}, incrementalFactory{}, lagf}) + agent.In.Subscribe(gossip.BatchMessageType, 
bp, 255) + defer bp.Stop() + + agent.Start() - // Lookups - v.BindPFlag("agent.server_urls", f.Lookup("qedUrls")) - v.BindPFlag("agent.alerts_urls", f.Lookup("alertsUrls")) + QedMonitorInstancesCount.Inc() - return cmd + util.AwaitTermSignal(agent.Shutdown) + return nil +} + +type incrementalFactory struct{} + +func (i incrementalFactory) Metrics() []prometheus.Collector { + return []prometheus.Collector{ + QedMonitorInstancesCount, + QedMonitorBatchesReceivedTotal, + QedMonitorBatchesProcessSeconds, + QedMonitorGetIncrementalProofErrTotal, + } +} + +func (i incrementalFactory) New(ctx context.Context) gossip.Task { + a := ctx.Value("agent").(*gossip.Agent) + b := ctx.Value("batch").(*protocol.BatchSnapshots) + + return func() error { + timer := prometheus.NewTimer(QedMonitorBatchesProcessSeconds) + defer timer.ObserveDuration() + + first := b.Snapshots[0].Snapshot + last := b.Snapshots[len(b.Snapshots)-1].Snapshot + + resp, err := a.Qed.Incremental(first.Version, last.Version) + if err != nil { + QedMonitorGetIncrementalProofErrTotal.Inc() + a.Notifier.Alert(fmt.Sprintf("Monitor is unable to get incremental proof from QED server: %s", err.Error())) + log.Infof("Monitor is unable to get incremental proof from QED server: %s", err.Error()) + return err + } + ok := a.Qed.VerifyIncremental(resp, first, last, hashing.NewSha256Hasher()) + if !ok { + a.Notifier.Alert(fmt.Sprintf("Monitor is unable to verify incremental proof from %d to %d", first.Version, last.Version)) + log.Infof("Monitor is unable to verify incremental proof from %d to %d", first.Version, last.Version) + } + log.Debugf("Monitor verified a consistency proof between versions %d and %d: %v\n", first.Version, last.Version, ok) + return nil + } +} + +type lagFactory struct { + lastVersion uint64 + rate uint64 + counter uint64 + ticker *time.Ticker + quit chan struct{} +} + +func newLagFactory(t time.Duration) *lagFactory { + return &lagFactory{ + ticker: time.NewTicker(t), + quit: make(chan struct{}), + } 
+} + +func (l *lagFactory) stop() { + close(l.quit) +} + +func (l *lagFactory) start() { + go func() { + for { + select { + case <-l.ticker.C: + c := atomic.SwapUint64(&l.counter, 0) + atomic.StoreUint64(&l.rate, c) + case <-l.quit: + l.ticker.Stop() + return + } + } + }() +} + +func (l lagFactory) Metrics() []prometheus.Collector { + return []prometheus.Collector{} +} + +func (l *lagFactory) New(ctx context.Context) gossip.Task { + a := ctx.Value("agent").(*gossip.Agent) + b := ctx.Value("batch").(*protocol.BatchSnapshots) + + counter := atomic.AddUint64(&l.counter, uint64(len(b.Snapshots))) + lastVersion := atomic.LoadUint64(&l.lastVersion) + + QedMonitorBatchesReceivedTotal.Inc() + + return func() error { + timer := prometheus.NewTimer(QedMonitorBatchesProcessSeconds) + defer timer.ObserveDuration() + + last := b.Snapshots[len(b.Snapshots)-1].Snapshot + localLag := uint64(0) + + if lastVersion < last.Version { + localLag = last.Version - lastVersion + atomic.StoreUint64(&l.lastVersion, last.Version) + } + + rate := atomic.LoadUint64(&l.rate) + + if localLag > rate { + log.Infof("Gossip lag %d > Rate %d", localLag, rate) + } + + count, err := a.SnapshotStore.Count() + if err != nil { + return err + } + + storeLag := uint64(0) + if lastVersion > count { + storeLag = lastVersion - count + } + + if storeLag > rate { + err := a.Notifier.Alert(fmt.Sprintf("Lag between gossip and snapshot store: %d", storeLag)) + if err != nil { + log.Infof("LagTask had an error sending a notification: %v", err) + } + log.Infof("Lag between gossip and snapshot store: last seen version %d - store count %d = %d", lastVersion, count, storeLag) + } + log.Infof("Lag status: Rate: %d Counter: %d, Local Lag: %d Store Lag: %d", rate, counter, localLag, storeLag) + return nil + } } diff --git a/cmd/agent_publisher.go b/cmd/agent_publisher.go index e2c40680f..bfbfc790b 100644 --- a/cmd/agent_publisher.go +++ b/cmd/agent_publisher.go @@ -3,7 +3,9 @@ Licensed under the Apache License, Version 2.0 
(the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -14,73 +16,147 @@ package cmd import ( - "github.com/spf13/cobra" - v "github.com/spf13/viper" + "context" + "fmt" "github.com/bbva/qed/gossip" - "github.com/bbva/qed/gossip/member" - "github.com/bbva/qed/gossip/publisher" "github.com/bbva/qed/log" - "github.com/bbva/qed/metrics" + "github.com/bbva/qed/protocol" "github.com/bbva/qed/util" + "github.com/octago/sflags/gen/gpflag" + "github.com/prometheus/client_golang/prometheus" + "github.com/spf13/cobra" ) -func newAgentPublisherCommand(ctx *cmdContext, config gossip.Config, agentPreRun func(gossip.Config) gossip.Config) *cobra.Command { +var ( + QedPublisherInstancesCount = prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "qed_publisher_instances_count", + Help: "Number of publisher agents running.", + }, + ) - publisherConfig := publisher.DefaultConfig() + QedPublisherBatchesReceivedTotal = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "qed_publisher_batches_received_total", + Help: "Number of batches received by publishers.", + }, + ) - cmd := &cobra.Command{ - Use: "publisher", - Short: "Start a QED publisher", - Long: `Start a QED publisher that reacts to snapshot batches propagated by QED servers and periodically publishes them to a certain log storage.`, - PreRun: func(cmd *cobra.Command, args []string) { + QedPublisherBatchesProcessSeconds = prometheus.NewSummary( + prometheus.SummaryOpts{ + Name: "qed_publisher_batches_process_seconds", + Help: "Duration of Publisher batch processing", + }, + ) +) - log.SetLogger("QEDPublisher", ctx.logLevel) +var agentPublisherCmd *cobra.Command = &cobra.Command{ + Use: "publisher", + 
Short: "Provides access to the QED gossip publisher agent", + Long: `Start a QED publisher which process gossip messages sending batch +messages contents to the snapshot storage.`, + RunE: runAgentPublisher, +} - // WARN: PersitentPreRun can't be nested and we're using it in - // cmd/root so inbetween preRuns must be curried. - config = agentPreRun(config) +var agentPublisherCtx context.Context - // Bindings - publisherConfig.MetricsAddr = config.BindAddr // TODO: make MetricsAddr configurable - publisherConfig.PubUrls = v.GetStringSlice("agent.snapshots_store_urls") - publisherConfig.AlertsUrls = v.GetStringSlice("agent.alerts_urls") +func init() { + agentPublisherCtx = configPublisher() + agentPublisherCmd.MarkFlagRequired("notifier-servers") + agentPublisherCmd.MarkFlagRequired("store-servers") + agentCmd.AddCommand(agentPublisherCmd) +} - markSliceStringRequired(publisherConfig.PubUrls, "pubUrls") - markSliceStringRequired(publisherConfig.AlertsUrls, "alertsUrls") - }, - Run: func(cmd *cobra.Command, args []string) { +type publisherConfig struct { + Notifier *gossip.SimpleNotifierConfig + Store *gossip.RestSnapshotStoreConfig + Tasks *gossip.SimpleTasksManagerConfig +} - config.Role = member.Publisher +func newPublisherConfig() *publisherConfig { + return &publisherConfig{ + Notifier: gossip.DefaultSimpleNotifierConfig(), + Store: gossip.DefaultRestSnapshotStoreConfig(), + Tasks: gossip.DefaultSimpleTasksManagerConfig(), + } +} - publisher, err := publisher.NewPublisher(*publisherConfig) - if err != nil { - log.Fatalf("Failed to start the QED publisher: %v", err) - } - metricsServer := metrics.NewServer(config.MetricsAddr) - agent, err := gossip.NewAgent(&config, []gossip.Processor{publisher}, metricsServer) - if err != nil { - log.Fatalf("Failed to start the QED publisher: %v", err) - } +func configPublisher() context.Context { + conf := newPublisherConfig() + err := gpflag.ParseTo(conf, agentPublisherCmd.PersistentFlags()) + if err != nil { + log.Fatalf("err: 
%v", err) + } - contacted, err := agent.Join(config.StartJoin) - if err != nil { - log.Fatalf("Failed to join the cluster: %v", err) - } - log.Debugf("Number of nodes contacted: %d", contacted) + ctx := context.WithValue(agentCtx, k("publisher.config"), conf) - defer agent.Shutdown() - util.AwaitTermSignal(agent.Leave) - }, + return ctx +} + +func runAgentPublisher(cmd *cobra.Command, args []string) error { + agentConfig := agentCtx.Value(k("agent.config")).(*gossip.Config) + conf := agentPublisherCtx.Value(k("publisher.config")).(*publisherConfig) + + log.SetLogger("publisher", agentConfig.Log) + + notifier := gossip.NewSimpleNotifierFromConfig(conf.Notifier) + tm := gossip.NewSimpleTasksManagerFromConfig(conf.Tasks) + store := gossip.NewRestSnapshotStoreFromConfig(conf.Store) + + agent, err := gossip.NewDefaultAgent(agentConfig, nil, store, tm, notifier) + if err != nil { + return err + } + + bp := gossip.NewBatchProcessor(agent, []gossip.TaskFactory{gossip.PrinterFactory{}, publisherFactory{}}) + agent.In.Subscribe(gossip.BatchMessageType, bp, 255) + defer bp.Stop() + + agent.Start() + util.AwaitTermSignal(agent.Shutdown) + return nil +} + +type publisherFactory struct { +} + +func (p publisherFactory) Metrics() []prometheus.Collector { + QedPublisherInstancesCount.Inc() + return []prometheus.Collector{ + QedPublisherInstancesCount, + QedPublisherBatchesReceivedTotal, + QedPublisherBatchesProcessSeconds, } +} - f := cmd.Flags() - f.StringSliceVarP(&publisherConfig.PubUrls, "pubUrls", "", []string{}, "Comma-delimited list of end-publishers ([host]:port), through which an publisher can send requests") - f.StringSliceVarP(&publisherConfig.AlertsUrls, "alertsUrls", "", []string{}, "Comma-delimited list of QED servers ([host]:port), through which an monitor can publish alerts") +var errorNoSnapshots error = fmt.Errorf("No snapshots were found on this batch!!") - // Lookups - v.BindPFlag("agent.snapshots_store_urls", f.Lookup("pubUrls")) - 
v.BindPFlag("agent.alerts_urls", f.Lookup("alertsUrls")) +func (p publisherFactory) New(ctx context.Context) gossip.Task { + QedPublisherBatchesReceivedTotal.Inc() + fmt.Println("PublisherFactory creating new Task!") + a := ctx.Value("agent").(*gossip.Agent) + b := ctx.Value("batch").(*protocol.BatchSnapshots) - return cmd + return func() error { + timer := prometheus.NewTimer(QedPublisherBatchesProcessSeconds) + defer timer.ObserveDuration() + + batch := new(protocol.BatchSnapshots) + batch.Snapshots = make([]*protocol.SignedSnapshot, 0) + for _, signedSnap := range b.Snapshots { + _, err := a.Cache.Get(signedSnap.Signature) + if err != nil { + log.Debugf("PublishingTask: add snapshot to be published") + a.Cache.Set(signedSnap.Signature, []byte{0x0}, 0) + batch.Snapshots = append(batch.Snapshots, signedSnap) + } + } + + if len(batch.Snapshots) < 1 { + return errorNoSnapshots + } + log.Debugf("Sending batch to snapshot store: %+v", batch) + return a.SnapshotStore.PutBatch(batch) + } } diff --git a/cmd/client.go b/cmd/client.go index f0d36a6c3..d5a6fff92 100644 --- a/cmd/client.go +++ b/cmd/client.go @@ -17,67 +17,35 @@ package cmd import ( - "fmt" - "time" + "context" + "github.com/octago/sflags/gen/gpflag" "github.com/spf13/cobra" - v "github.com/spf13/viper" "github.com/bbva/qed/client" "github.com/bbva/qed/log" ) -func newClientCommand(ctx *cmdContext) *cobra.Command { - clientCtx := &clientContext{} - clientCtx.config = client.DefaultConfig() - - cmd := &cobra.Command{ - Use: "client", - Short: "Client mode for qed", - Long: `Client process for emitting events to a qed server`, - } - - f := cmd.PersistentFlags() - f.StringSliceVarP(&clientCtx.config.Endpoints, "endpoints", "e", []string{"127.0.0.1:8800"}, "Endpoint for REST requests on (host:port)") - f.BoolVar(&clientCtx.config.Insecure, "insecure", false, "Allow self signed certificates") - f.DurationVar(&clientCtx.config.Timeout, "timeout-seconds", 10*time.Second, "Seconds to cut the connection") - 
f.DurationVar(&clientCtx.config.DialTimeout, "dial-timeout-seconds", 5*time.Second, "Seconds to cut the dialing") - f.DurationVar(&clientCtx.config.HandshakeTimeout, "handshake-timeout-seconds", 5*time.Second, "Seconds to cut the handshaking") +var clientCmd *cobra.Command = &cobra.Command{ + Use: "client", + Short: "Provdes access to the QED log client", + TraverseChildren: true, +} - // Lookups - v.BindPFlag("client.endpoints", f.Lookup("endpoints")) - v.BindPFlag("client.insecure", f.Lookup("insecure")) - v.BindPFlag("client.timeout.connection", f.Lookup("timeout-seconds")) - v.BindPFlag("client.timeout.dial", f.Lookup("dial-timeout-seconds")) - v.BindPFlag("client.timeout.handshake", f.Lookup("handshake-timeout-seconds")) +var clientCtx context.Context = configClient() - clientPreRun := func(cmd *cobra.Command, args []string) { +func init() { + Root.AddCommand(clientCmd) +} - log.SetLogger("QEDClient", ctx.logLevel) +func configClient() context.Context { - clientCtx.config.APIKey = ctx.apiKey - clientCtx.config.Endpoints = v.GetStringSlice("client.endpoints") - clientCtx.config.Insecure = v.GetBool("client.insecure") - clientCtx.config.Timeout = v.GetDuration("client.timeout.connection") - clientCtx.config.DialTimeout = v.GetDuration("client.timeout.dial") - clientCtx.config.HandshakeTimeout = v.GetDuration("client.timeout.handshake") - clientCtx.config.ReadPreference = client.Any - clientCtx.config.EnableTopologyDiscovery = false - clientCtx.config.EnableHealthChecks = false - clientCtx.config.MaxRetries = 0 + conf := client.DefaultConfig() - client, err := client.NewHTTPClientFromConfig(clientCtx.config) - if err != nil { - panic(fmt.Sprintf("Unable to start http client: %v", err)) - } - clientCtx.client = client + err := gpflag.ParseTo(conf, clientCmd.PersistentFlags()) + if err != nil { + log.Fatalf("err: %v", err) } - cmd.AddCommand( - newAddCommand(clientCtx, clientPreRun), - newMembershipCommand(clientCtx, clientPreRun), - 
newIncrementalCommand(clientCtx, clientPreRun), - ) - - return cmd + return context.WithValue(Ctx, k("client.config"), conf) } diff --git a/cmd/client_add.go b/cmd/client_add.go index d7cf7d884..f46faaa4a 100644 --- a/cmd/client_add.go +++ b/cmd/client_add.go @@ -19,43 +19,54 @@ package cmd import ( "fmt" + "github.com/bbva/qed/client" + "github.com/bbva/qed/log" "github.com/spf13/cobra" ) -func newAddCommand(ctx *clientContext, clientPreRun func(*cobra.Command, []string)) *cobra.Command { - - var key string - - cmd := &cobra.Command{ - Use: "add", - Short: "Add an event", - Long: `Add an event to the authenticated data structure`, - PreRun: func(cmd *cobra.Command, args []string) { - // WARN: PersitentPreRun can't be nested and we're using it in - // cmd/root so inbetween preRuns must be curried. - clientPreRun(cmd, args) - }, - RunE: func(cmd *cobra.Command, args []string) error { - fmt.Printf("\nAdding key [ %s ]\n", key) - // SilenceUsage is set to true -> https://github.com/spf13/cobra/issues/340 - cmd.SilenceUsage = true - snapshot, err := ctx.client.Add(key) - if err != nil { - return err - } - - fmt.Printf("\nReceived snapshot with values:\n\n") - fmt.Printf(" EventDigest: %x\n", snapshot.EventDigest) - fmt.Printf(" HyperDigest: %x\n", snapshot.HyperDigest) - fmt.Printf(" HistoryDigest: %x\n", snapshot.HistoryDigest) - fmt.Printf(" Version: %d\n\n", snapshot.Version) - - return nil - }, +var clientAddCmd *cobra.Command = &cobra.Command{ + Use: "add", + Short: "Add a QED event to the QED log", + RunE: runClientAdd, +} + +var clientAddEvent string + +func init() { + + clientAddCmd.Flags().StringVar(&clientAddEvent, "event", "", "Event to append to QED") + clientAddCmd.MarkFlagRequired("event") + + clientCmd.AddCommand(clientAddCmd) +} + +func runClientAdd(cmd *cobra.Command, args []string) error { + // SilenceUsage is set to true -> https://github.com/spf13/cobra/issues/340 + if clientAddEvent == "" { + return fmt.Errorf("Event must not be empty!") } - 
cmd.Flags().StringVar(&key, "key", "", "Key to add") - cmd.MarkFlagRequired("key") + cmd.SilenceUsage = true - return cmd + config := clientCtx.Value(k("client.config")).(*client.Config) + log.SetLogger("client", config.Log) + + client, err := client.NewHTTPClientFromConfig(config) + if err != nil { + return err + } + + snapshot, err := client.Add(clientAddEvent) + if err != nil { + return err + } + + fmt.Printf("\nReceived snapshot with values:\n\n") + fmt.Printf(" EventDigest: %x\n", snapshot.EventDigest) + fmt.Printf(" HyperDigest: %x\n", snapshot.HyperDigest) + fmt.Printf(" HistoryDigest: %x\n", snapshot.HistoryDigest) + fmt.Printf(" Version: %d\n\n", snapshot.Version) + + return nil } + diff --git a/cmd/client_incremental.go b/cmd/client_incremental.go index fbbb4ef86..334a530a2 100644 --- a/cmd/client_incremental.go +++ b/cmd/client_incremental.go @@ -17,87 +17,107 @@ package cmd import ( + "context" "encoding/hex" "fmt" + "github.com/bbva/qed/client" "github.com/bbva/qed/hashing" + "github.com/bbva/qed/log" "github.com/bbva/qed/protocol" + "github.com/octago/sflags/gen/gpflag" "github.com/spf13/cobra" ) -func newIncrementalCommand(ctx *clientContext, clientPreRun func(*cobra.Command, []string)) *cobra.Command { - - var start, end uint64 - var verify bool - - cmd := &cobra.Command{ - Use: "incremental", - Short: "Query for incremental", - Long: `Query for an incremental proof to the authenticated data structure. - It also verifies the proofs provided by the server if flag enabled.`, - PreRunE: func(cmd *cobra.Command, args []string) error { - // WARN: PersitentPreRun can't be nested and we're using it in - // cmd/root so inbetween preRuns must be curried. 
- clientPreRun(cmd, args) - return nil - }, - RunE: func(cmd *cobra.Command, args []string) error { - - fmt.Printf("\nQuerying incremental between versions [ %d ] and [ %d ]\n", start, end) - // SilenceUsage is set to true -> https://github.com/spf13/cobra/issues/340 - cmd.SilenceUsage = true - proof, err := ctx.client.Incremental(start, end) - if err != nil { - return err - } +var clientIncrementalCmd *cobra.Command = &cobra.Command{ + Use: "incremental", + Short: "Query for incremental proof", + Long: `Query for an incremental proof to the authenticated data structure. +It also verifies the proofs provided by the server if flag enabled.`, + RunE: runClientIncremental, +} - fmt.Printf("\nReceived incremental proof: \n\n") - fmt.Printf(" Start version: %d\n", proof.Start) - fmt.Printf(" End version: %d\n", proof.End) - fmt.Printf(" Incremental audit path: \n\n") - - if verify { - - var startDigest, endDigest string - for { - startDigest = readLine(fmt.Sprintf("Please, provide the starting historyDigest for version [ %d ]: ", start)) - if startDigest != "" { - break - } - } - for { - endDigest = readLine(fmt.Sprintf("Please, provide the ending historyDigest for version [ %d ] : ", end)) - if endDigest != "" { - break - } - } - - sdBytes, _ := hex.DecodeString(startDigest) - edBytes, _ := hex.DecodeString(endDigest) - startSnapshot := &protocol.Snapshot{sdBytes, nil, start, nil} - endSnapshot := &protocol.Snapshot{edBytes, nil, end, nil} - - fmt.Printf("\nVerifying with snapshots: \n") - fmt.Printf(" HistoryDigest for start version [ %d ]: %s\n", start, startDigest) - fmt.Printf(" HistoryDigest for end version [ %d ]: %s\n", end, endDigest) - - if ctx.client.VerifyIncremental(proof, startSnapshot, endSnapshot, hashing.NewSha256Hasher()) { - fmt.Printf("\nVerify: OK\n\n") - } else { - fmt.Printf("\nVerify: KO\n\n") - } - } +var clientIncrementalCtx context.Context + +func init() { + clientIncrementalCtx = configClientIncremental() + 
clientCmd.AddCommand(clientIncrementalCmd) +} + +type incrementalParams struct { + Start uint64 `desc:"Starting version for the incremental proof"` + End uint64 `desc:"Endind version for the incremental proof"` + Verify bool `desc:"Set to enable proof verification process"` +} + +func configClientIncremental() context.Context { + + conf := &incrementalParams{} - return nil - }, + err := gpflag.ParseTo(conf, clientIncrementalCmd.PersistentFlags()) + if err != nil { + log.Fatalf("err: %v", err) } + return context.WithValue(Ctx, k("client.incremental.params"), conf) +} + +func runClientIncremental(cmd *cobra.Command, args []string) error { + + // SilenceUsage is set to true -> https://github.com/spf13/cobra/issues/340 + cmd.SilenceUsage = true + params := clientIncrementalCtx.Value(k("client.incremental.params")).(*incrementalParams) + fmt.Printf("\nQuerying incremental between versions [ %d ] and [ %d ]\n", params.Start, params.End) - cmd.Flags().Uint64Var(&start, "start", 0, "Start version to query") - cmd.Flags().Uint64Var(&end, "end", 0, "End version to query") - cmd.Flags().BoolVar(&verify, "verify", false, "Do verify received proof") - cmd.MarkFlagRequired("start") - cmd.MarkFlagRequired("end") + clientConfig := clientCtx.Value(k("client.config")).(*client.Config) - return cmd + client, err := client.NewHTTPClientFromConfig(clientConfig) + if err != nil { + return err + } + + proof, err := client.Incremental(params.Start, params.End) + if err != nil { + return err + } + + fmt.Printf("\nReceived incremental proof: \n\n") + fmt.Printf(" Start version: %d\n", proof.Start) + fmt.Printf(" End version: %d\n", proof.End) + fmt.Printf(" Incremental audit path: \n\n") + + if params.Verify { + + var startDigest, endDigest string + for { + startDigest = readLine(fmt.Sprintf("Please, provide the starting historyDigest for version [ %d ]: ", params.Start)) + if startDigest != "" { + break + } + } + for { + endDigest = readLine(fmt.Sprintf("Please, provide the ending 
historyDigest for version [ %d ] : ", params.End)) + if endDigest != "" { + break + } + } + + sdBytes, _ := hex.DecodeString(startDigest) + edBytes, _ := hex.DecodeString(endDigest) + startSnapshot := &protocol.Snapshot{sdBytes, nil, params.Start, nil} + endSnapshot := &protocol.Snapshot{edBytes, nil, params.End, nil} + + fmt.Printf("\nVerifying with snapshots: \n") + fmt.Printf(" HistoryDigest for start version [ %d ]: %s\n", params.Start, startDigest) + fmt.Printf(" HistoryDigest for end version [ %d ]: %s\n", params.End, endDigest) + + if client.VerifyIncremental(proof, startSnapshot, endSnapshot, hashing.NewSha256Hasher()) { + fmt.Printf("\nVerify: OK\n\n") + } else { + fmt.Printf("\nVerify: KO\n\n") + } + } + + return nil } + diff --git a/cmd/client_membership.go b/cmd/client_membership.go index c59c7fda2..109d35fd1 100644 --- a/cmd/client_membership.go +++ b/cmd/client_membership.go @@ -18,116 +18,131 @@ package cmd import ( "bufio" + "context" "encoding/hex" "fmt" "os" "strings" + "github.com/octago/sflags/gen/gpflag" "github.com/spf13/cobra" + "github.com/bbva/qed/client" "github.com/bbva/qed/hashing" "github.com/bbva/qed/log" "github.com/bbva/qed/protocol" ) -func newMembershipCommand(ctx *clientContext, clientPreRun func(*cobra.Command, []string)) *cobra.Command { +var clientMembershipCmd *cobra.Command = &cobra.Command{ + Use: "membership", + Short: "Query for membership", + Long: `Query for membership of an event to the authenticated data structure. 
+It also verifies the proofs provided by the server if flag enabled.`, + RunE: runClientMembership, +} + +var clientMembershipCtx context.Context + +func init() { + clientMembershipCtx = configClientMembership() + clientCmd.AddCommand(clientMembershipCmd) +} + +type membershipParams struct { + Version uint64 `desc:"Version for the membership proof"` + Verify bool `desc:"Set to enable proof verification process"` + Event string `desc:"QED event to build the proof"` + EventDigest string `desc:"QED event digest to build the proof"` +} + +func configClientMembership() context.Context { + + conf := &membershipParams{} + + err := gpflag.ParseTo(conf, clientMembershipCmd.PersistentFlags()) + if err != nil { + log.Fatalf("err: %v", err) + } + return context.WithValue(Ctx, k("client.membership.params"), conf) +} + +func runClientMembership(cmd *cobra.Command, args []string) error { hasherF := hashing.NewSha256Hasher - var version uint64 - var verify bool - var key, eventDigest string - - cmd := &cobra.Command{ - Use: "membership", - Short: "Query for membership", - Long: `Query for membership of an event to the authenticated data structure. - It also verifies the proofs provided by the server if flag enabled.`, - PreRunE: func(cmd *cobra.Command, args []string) error { - // WARN: PersitentPreRun can't be nested and we're using it in - // cmd/root so inbetween preRuns must be curried. 
- clientPreRun(cmd, args) - - if key == "" && eventDigest == "" { - log.Errorf("Error: trying to get membership without either key or eventDigest") - } - return nil - }, - RunE: func(cmd *cobra.Command, args []string) error { - var membershipResult *protocol.MembershipResult - var digest hashing.Digest - var err error - // SilenceUsage is set to true -> https://github.com/spf13/cobra/issues/340 - cmd.SilenceUsage = true - - if eventDigest == "" { - fmt.Printf("\nQuerying key [ %s ] with version [ %d ]\n", key, version) - digest = hasherF().Do([]byte(key)) - } else { - fmt.Printf("\nQuerying digest [ %s ] with version [ %d ]\n", eventDigest, version) - digest, _ = hex.DecodeString(eventDigest) - } + var membershipResult *protocol.MembershipResult + var digest hashing.Digest + var err error - membershipResult, err = ctx.client.MembershipDigest(digest, version) - if err != nil { - return err - } - fmt.Printf("\nReceived membership proof:\n") - fmt.Printf("\n Exists: %t\n", membershipResult.Exists) - fmt.Printf(" Hyper audit path: \n") - fmt.Printf(" History audit path: \n") - fmt.Printf(" CurrentVersion: %d\n", membershipResult.CurrentVersion) - fmt.Printf(" QueryVersion: %d\n", membershipResult.QueryVersion) - fmt.Printf(" ActualVersion: %d\n", membershipResult.ActualVersion) - fmt.Printf(" KeyDigest: %x\n\n", membershipResult.KeyDigest) - - if verify { - - var hyperDigest, historyDigest string - for { - hyperDigest = readLine(fmt.Sprintf("Please, provide the hyperDigest for current version [ %d ]: ", membershipResult.CurrentVersion)) - if hyperDigest != "" { - break - } - } - if membershipResult.Exists { - for { - historyDigest = readLine(fmt.Sprintf("Please, provide the historyDigest for version [ %d ] : ", version)) - if historyDigest != "" { - break - } - } - } + params := clientMembershipCtx.Value(k("client.membership.params")).(*membershipParams) - hdBytes, _ := hex.DecodeString(hyperDigest) - htdBytes, _ := hex.DecodeString(historyDigest) - snapshot := 
&protocol.Snapshot{ - HistoryDigest: htdBytes, - HyperDigest: hdBytes, - Version: version, - EventDigest: digest} - - fmt.Printf("\nVerifying with Snapshot: \n\n EventDigest:%x\n HyperDigest: %s\n HistoryDigest: %s\n Version: %d\n", - digest, hyperDigest, historyDigest, version) - - if ctx.client.DigestVerify(membershipResult, snapshot, hasherF) { - fmt.Printf("\nVerify: OK\n\n") - } else { - fmt.Printf("\nVerify: KO\n\n") - } - } - return nil - }, + // SilenceUsage is set to true -> https://github.com/spf13/cobra/issues/340 + cmd.SilenceUsage = true + + if params.EventDigest == "" { + fmt.Printf("\nQuerying key [ %s ] with version [ %d ]\n", params.Event, params.Version) + digest = hasherF().Do([]byte(params.Event)) + } else { + fmt.Printf("\nQuerying digest [ %s ] with version [ %d ]\n", params.EventDigest, params.Version) + digest, _ = hex.DecodeString(params.EventDigest) } - cmd.Flags().StringVar(&key, "key", "", "Key to query") - cmd.Flags().Uint64Var(&version, "version", 0, "Version to query") - cmd.Flags().BoolVar(&verify, "verify", false, "Do verify received proof") - cmd.Flags().StringVar(&eventDigest, "eventDigest", "", "Digest of the event") + config := clientCtx.Value(k("client.config")).(*client.Config) - cmd.MarkFlagRequired("version") + client, err := client.NewHTTPClientFromConfig(config) + if err != nil { + return err + } - return cmd + membershipResult, err = client.MembershipDigest(digest, params.Version) + if err != nil { + return err + } + fmt.Printf("\nReceived membership proof:\n") + fmt.Printf("\n Exists: %t\n", membershipResult.Exists) + fmt.Printf(" Hyper audit path: \n") + fmt.Printf(" History audit path: \n") + fmt.Printf(" CurrentVersion: %d\n", membershipResult.CurrentVersion) + fmt.Printf(" QueryVersion: %d\n", membershipResult.QueryVersion) + fmt.Printf(" ActualVersion: %d\n", membershipResult.ActualVersion) + fmt.Printf(" KeyDigest: %x\n\n", membershipResult.KeyDigest) + + if params.Verify { + + var hyperDigest, historyDigest string 
+ for { + hyperDigest = readLine(fmt.Sprintf("Please, provide the hyperDigest for current version [ %d ]: ", membershipResult.CurrentVersion)) + if hyperDigest != "" { + break + } + } + if membershipResult.Exists { + for { + historyDigest = readLine(fmt.Sprintf("Please, provide the historyDigest for version [ %d ] : ", params.Version)) + if historyDigest != "" { + break + } + } + } + + hdBytes, _ := hex.DecodeString(hyperDigest) + htdBytes, _ := hex.DecodeString(historyDigest) + snapshot := &protocol.Snapshot{ + HistoryDigest: htdBytes, + HyperDigest: hdBytes, + Version: params.Version, + EventDigest: digest} + + fmt.Printf("\nVerifying with Snapshot: \n\n EventDigest:%x\n HyperDigest: %s\n HistoryDigest: %s\n Version: %d\n", + digest, hyperDigest, historyDigest, params.Version) + + if client.DigestVerify(membershipResult, snapshot, hasherF) { + fmt.Printf("\nVerify: OK\n\n") + } else { + fmt.Printf("\nVerify: KO\n\n") + } + } + return nil } func readLine(query string) string { diff --git a/cmd/root.go b/cmd/root.go index e1e4f4f11..c218de4c2 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -18,87 +18,21 @@ package cmd import ( - "net/http" + "context" _ "net/http/pprof" // this will enable the default profiling capabilities - "github.com/bbva/qed/log" - homedir "github.com/mitchellh/go-homedir" "github.com/spf13/cobra" - v "github.com/spf13/viper" ) -// NewRootCommand is the main Parser for the qed cli. 
-func NewRootCommand(args []string) *cobra.Command { - ctx := &cmdContext{} +// Context key type to be used when adding values to context +// as per documentation: +// https://golang.org/pkg/context/#example_WithValue +type k string - cmd := &cobra.Command{ - Use: "qed", - Short: "QED is a client for the verifiable log server", - // TraverseChildren: true, - PersistentPreRun: func(cmd *cobra.Command, args []string) { - if ctx.configFile != "" { - v.SetConfigFile(ctx.configFile) - } else { - v.SetConfigName("config") - v.AddConfigPath(ctx.path) - v.AddConfigPath(".") - } - - if !ctx.disableConfig { - // read in environment variables that match. - // ex: `QED_API_KEY=environ-key` - v.SetEnvPrefix("QED") - v.AutomaticEnv() - - err := v.ReadInConfig() - if _, ok := err.(v.ConfigFileNotFoundError); err != nil && !ok { - log.Error("Can't read config file.", err) - } - - // Runtime Binding - ctx.logLevel = v.GetString("log") - ctx.apiKey = v.GetString("api_key") - ctx.path, err = homedir.Expand(v.GetString("path")) - if err != nil { - log.Fatalf("Can't expand global path: %v", err) - } - - } - - ctx.profiling = v.GetBool("profiling") - if ctx.profiling { - go func() { - if err := http.ListenAndServe("0.0.0.0:6060", nil); err != http.ErrServerClosed { - log.Errorf("Can't start profiling HTTP server: %s", err) - } - }() - - } - - markStringRequired(ctx.apiKey, "apikey") - - }, - } - - f := cmd.PersistentFlags() - f.StringVarP(&ctx.configFile, "config-file", "c", "", "Qed config file") - f.BoolVarP(&ctx.disableConfig, "no-conf", "n", false, "Disable config file loading") - f.StringVarP(&ctx.logLevel, "log", "l", "error", "Choose between log levels: silent, error, info and debug") - f.StringVarP(&ctx.apiKey, "apikey", "k", "", "Server api key") - f.StringVarP(&ctx.path, "path", "p", "/var/tmp/qed", "Qed root path for storage configuration and credentials") - f.BoolVarP(&ctx.profiling, "profiling", "f", false, "Allow a pprof url (localhost:6060) for profiling purposes") - - // 
Lookups - v.BindPFlag("log", f.Lookup("log")) - v.BindPFlag("api_key", f.Lookup("apikey")) - v.BindPFlag("path", f.Lookup("path")) - v.BindPFlag("profiling", f.Lookup("profiling")) - - cmd.AddCommand( - newStartCommand(ctx), - newClientCommand(ctx), - newAgentCommand(ctx, args), - ) - - return cmd +var Root *cobra.Command = &cobra.Command{ + Use: "qed", + Short: "QED system", + Long: "QED implements an authenticated data structure as an append-only log. This command exposes the QED components. Please refer to QED manual to learn about QED architecture and its components", } + +var Ctx context.Context = context.WithValue(context.Background(), k("version"), "alpha") diff --git a/cmd/server.go b/cmd/server.go new file mode 100644 index 000000000..d85520de0 --- /dev/null +++ b/cmd/server.go @@ -0,0 +1,51 @@ +/* + Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cmd + +import ( + "context" + + "github.com/bbva/qed/log" + "github.com/bbva/qed/server" + "github.com/octago/sflags/gen/gpflag" + "github.com/spf13/cobra" +) + +var serverCmd *cobra.Command = &cobra.Command{ + Use: "server", + Short: "Provices access to the QED log server commands", + Long: `QED serves provides a REST API to the QED Log. 
The API is documented +elsewhere.`, + TraverseChildren: true, +} + +var serverCtx context.Context = configServer() + +func init() { + Root.AddCommand(serverCmd) +} + +func configServer() context.Context { + + conf := server.DefaultConfig() + + err := gpflag.ParseTo(conf, serverCmd.PersistentFlags()) + if err != nil { + log.Fatalf("err: %v", err) + } + return context.WithValue(Ctx, k("server.config"), conf) +} diff --git a/cmd/server_start.go b/cmd/server_start.go new file mode 100644 index 000000000..78f692e88 --- /dev/null +++ b/cmd/server_start.go @@ -0,0 +1,73 @@ +/* + Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package cmd + +import ( + "fmt" + "os" + + "github.com/spf13/cobra" + + "github.com/bbva/qed/log" + "github.com/bbva/qed/server" + "github.com/bbva/qed/util" +) + +var serverStart *cobra.Command = &cobra.Command{ + Use: "start", + Short: "Stars QED log service", + Run: runServerStart, +} + +func init() { + serverCmd.AddCommand(serverStart) +} + +func runServerStart(cmd *cobra.Command, args []string) { + var err error + + conf := serverCtx.Value(k("server.config")).(*server.Config) + + if conf.SSLCertificate != "" && conf.SSLCertificateKey != "" { + if _, err := os.Stat(conf.SSLCertificate); os.IsNotExist(err) { + log.Infof("Can't find certificate .crt file: %v", err) + } else if _, err := os.Stat(conf.SSLCertificateKey); os.IsNotExist(err) { + log.Infof("Can't find certificate .key file: %v", err) + } else { + log.Info("EnabledTLS") + conf.EnableTLS = true + } + } + + log.SetLogger("server", conf.Log) + fmt.Printf("CONF: %+v\n", conf) + srv, err := server.NewServer(conf) + if err != nil { + log.Fatalf("Can't start QED server: %v", err) + } + + err = srv.Start() + if err != nil { + log.Fatalf("Can't start QED server: %v", err) + } + + util.AwaitTermSignal(srv.Stop) + + log.Debug("Stopping server, about to exit...") + +} + diff --git a/cmd/start.go b/cmd/start.go deleted file mode 100644 index 500d03fe1..000000000 --- a/cmd/start.go +++ /dev/null @@ -1,116 +0,0 @@ -/* - Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -package cmd - -import ( - "fmt" - "os" - - homedir "github.com/mitchellh/go-homedir" - "github.com/spf13/cobra" - v "github.com/spf13/viper" - - "github.com/bbva/qed/log" - "github.com/bbva/qed/server" -) - -func newStartCommand(ctx *cmdContext) *cobra.Command { - conf := server.DefaultConfig() - - cmd := &cobra.Command{ - Use: "start", - Short: "Start the server for the verifiable log QED", - Run: func(cmd *cobra.Command, args []string) { - var err error - - log.SetLogger("QEDServer", ctx.logLevel) - - // Bindings - conf.APIKey = ctx.apiKey - conf.NodeID = v.GetString("server.node-id") - conf.PrivateKeyPath, _ = homedir.Expand(v.GetString("server.key")) - conf.SSLCertificate, _ = homedir.Expand(v.GetString("server.tls.certificate")) - conf.SSLCertificateKey, _ = homedir.Expand(v.GetString("server.tls.certificate_key")) - conf.HTTPAddr = v.GetString("server.addr.http") - conf.RaftAddr = v.GetString("server.addr.raft") - conf.MgmtAddr = v.GetString("server.addr.mgmt") - conf.MetricsAddr = v.GetString("server.addr.metrics") - conf.RaftJoinAddr = v.GetStringSlice("server.addr.raft_join") - conf.GossipAddr = v.GetString("server.addr.gossip") - conf.GossipJoinAddr = v.GetStringSlice("server.addr.gossip_join") - conf.DBPath = fmt.Sprintf("%s/%s", ctx.path, "db") - conf.RaftPath = fmt.Sprintf("%s/%s", ctx.path, "wal") - - if conf.SSLCertificate != "" && conf.SSLCertificateKey != "" { - if _, err := os.Stat(conf.SSLCertificate); os.IsNotExist(err) { - log.Infof("Can't find certificate .crt file: %v", err) - } else if _, err := os.Stat(conf.SSLCertificateKey); os.IsNotExist(err) { - log.Infof("Can't find certificate .key file: %v", err) - } else { - log.Info("EnabledTLS") - conf.EnableTLS = true - } - } - - // cmd.DisableSuggestions = true - srv, err := server.NewServer(conf) - if err != nil { - log.Fatalf("Can't start QED server: %v", err) - } - - err = srv.Start() - if err != nil { - log.Fatalf("Can't start QED server: %v", err) - } - - }, - } - - f := cmd.Flags() 
- hostname, _ := os.Hostname() - f.StringVar(&conf.NodeID, "node-id", hostname, "Unique name for node. If not set, fallback to hostname") - f.StringVar(&conf.PrivateKeyPath, "keypath", fmt.Sprintf("%s/%s", ctx.path, "id_ed25519"), "Server Singning private key file path") - f.StringVar(&conf.SSLCertificate, "certificate", fmt.Sprintf("%s/%s", ctx.path, "server.crt"), "Server crt file") - f.StringVar(&conf.SSLCertificateKey, "certificate-key", fmt.Sprintf("%s/%s", ctx.path, "server.key"), "Server key file") - - f.StringVar(&conf.HTTPAddr, "http-addr", ":8800", "Endpoint for REST requests on (host:port)") - f.StringVar(&conf.RaftAddr, "raft-addr", ":8500", "Raft bind address (host:port)") - f.StringVar(&conf.MgmtAddr, "mgmt-addr", ":8700", "Management endpoint bind address (host:port)") - f.StringVar(&conf.MetricsAddr, "metrics-addr", ":8600", "Metrics export bind address (host:port)") - f.StringSliceVar(&conf.RaftJoinAddr, "join-addr", []string{}, "Raft: Comma-delimited list of nodes ([host]:port), through which a cluster can be joined") - f.StringVar(&conf.GossipAddr, "gossip-addr", ":8400", "Gossip: management endpoint bind address (host:port)") - f.StringSliceVar(&conf.GossipJoinAddr, "gossip-join-addr", []string{}, "Gossip: Comma-delimited list of nodes ([host]:port), through which a cluster can be joined") - - // Lookups - v.BindPFlag("server.node-id", f.Lookup("node-id")) - v.BindPFlag("server.key", f.Lookup("keypath")) - v.BindPFlag("server.tls.certificate", f.Lookup("certificate")) - v.BindPFlag("server.tls.certificate_key", f.Lookup("certificate-key")) - - v.BindPFlag("server.addr.http", f.Lookup("http-addr")) - v.BindPFlag("server.addr.mgmt", f.Lookup("mgmt-addr")) - v.BindPFlag("server.addr.metrics", f.Lookup("metrics-addr")) - v.BindPFlag("server.addr.raft", f.Lookup("raft-addr")) - v.BindPFlag("server.addr.raft_join", f.Lookup("join-addr")) - v.BindPFlag("server.addr.gossip", f.Lookup("gossip-addr")) - v.BindPFlag("server.addr.gossip_join", 
f.Lookup("gossip-join-addr")) - - v.BindPFlag("server.path.db", f.Lookup("dbpath")) - v.BindPFlag("server.path.wal", f.Lookup("raftpath")) - - return cmd -} diff --git a/deploy/aws/provision/files/grafana/dashboards/Host.json b/deploy/aws/provision/files/grafana/dashboards/Host.json index 5839aea60..2aad539d3 100644 --- a/deploy/aws/provision/files/grafana/dashboards/Host.json +++ b/deploy/aws/provision/files/grafana/dashboards/Host.json @@ -16,7 +16,8 @@ "editable": true, "gnetId": 179, "graphTooltip": 2, - "iteration": 1554368693970, + "id": 8, + "iteration": 1554910364728, "links": [], "panels": [ { @@ -917,6 +918,99 @@ "align": false, "alignLevel": null } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 21 + }, + "id": 39, + "legend": { + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_disk_written_bytes_total[30s])) by (job,device)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "written-{{job}}@{{device}}", + "refId": "A" + }, + { + "expr": "-sum(rate(node_disk_read_bytes_total[30s])) by (job,device)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "read-{{job}}@{{device}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Read & Write Throughput", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + 
"yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], "refresh": "10s", @@ -1030,5 +1124,5 @@ "timezone": "browser", "title": "Hosts Monitoring", "uid": "64nrElFmk", - "version": 1 + "version": 3 } \ No newline at end of file diff --git a/deploy/aws/provision/tasks/qed/main.yml b/deploy/aws/provision/tasks/qed/main.yml index a6b6194b8..2322d1c3d 100644 --- a/deploy/aws/provision/tasks/qed/main.yml +++ b/deploy/aws/provision/tasks/qed/main.yml @@ -20,15 +20,6 @@ with_items: - qed -- name: Create QED config - template: - dest: /var/qed/{{ item }} - src: ../../templates/qed-{{ item }}.j2 - force: true - with_items: - - config.yml - register: qed_config - - name: Create QED start|stop script template: dest: /var/qed/{{ item }} diff --git a/deploy/aws/provision/templates/prometheus-config.yml.j2 b/deploy/aws/provision/templates/prometheus-config.yml.j2 index ea3624117..328d87c11 100644 --- a/deploy/aws/provision/templates/prometheus-config.yml.j2 +++ b/deploy/aws/provision/templates/prometheus-config.yml.j2 @@ -37,7 +37,7 @@ scrape_configs: - job_name: 'Publisher{{loop.index0}}' scrape_interval: 10s static_configs: - - targets: ['{{ hostvars[host]['ansible_eth0']['ipv4']['address'] }}:18300'] + - targets: ['{{ hostvars[host]['ansible_eth0']['ipv4']['address'] }}:18100'] - job_name: 'Publisher{{loop.index0}}-Host' scrape_interval: 10s static_configs: @@ -49,7 +49,7 @@ scrape_configs: - job_name: 'Monitor{{loop.index0}}' scrape_interval: 10s static_configs: - - targets: ['{{ hostvars[host]['ansible_eth0']['ipv4']['address'] }}:18200'] + - targets: ['{{ hostvars[host]['ansible_eth0']['ipv4']['address'] }}:18100'] - job_name: 'Monitor{{loop.index0}}-Host' scrape_interval: 10s static_configs: diff --git 
a/deploy/aws/provision/templates/qed-config.yml.j2 b/deploy/aws/provision/templates/qed-config.yml.j2 deleted file mode 100644 index f82dcd291..000000000 --- a/deploy/aws/provision/templates/qed-config.yml.j2 +++ /dev/null @@ -1,95 +0,0 @@ -{# - Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -#} ---- -log: info -api_key: "terraform_qed" -path: "/var/qed/" -{% if qed_profiling is defined %} -profiling: true -{% else %} -profiling: false -{% endif %} -{% if 'role_qed' in group_names %} -server: - node_id: "{{ ansible_hostname }}" - addr: - http: "{{ ansible_eth0.ipv4.address }}:8800" - mgmt: "{{ ansible_eth0.ipv4.address }}:8700" - raft: "{{ ansible_eth0.ipv4.address }}:8500" - gossip: "{{ ansible_eth0.ipv4.address }}:8400" -{% if groups.role_qed.index(inventory_hostname) != 0 %} - raft_join: -{% for host in groups['name_qed-0'] %} - - "{{ hostvars[host]['ansible_eth0']['ipv4']['address'] }}:8700" - gossip_join: - - "{{ hostvars[host]['ansible_eth0']['ipv4']['address'] }}:8400" -{% endfor %} -{% endif %} -{% endif %} -{% if 'role_monitor' in group_names %} -agent: - node: "monitor-{{ ansible_hostname }}" - bind: "{{ ansible_eth0.ipv4.address }}:8200" - metrics: "{{ ansible_eth0.ipv4.address }}:18200" - join: -{% for host in groups['name_qed-0'] %} - - "{{ hostvars[host]['ansible_eth0']['ipv4']['address'] }}:8400" -{% endfor %} - server_urls: - - "{% for host in groups['role_qed'] %}http://{{ 
hostvars[host]['ansible_eth0']['ipv4']['address'] }}:8800{% if not loop.last %},{% endif %}{% endfor %}" -{% for host in groups['role_storage'] %} - alerts_urls: - - "http://{{ hostvars[host]['ansible_eth0']['ipv4']['address'] }}:8888" - snapshots_store_urls: - - "http://{{ hostvars[host]['ansible_eth0']['ipv4']['address'] }}:8888" -{% endfor %} -{% endif %} -{% if 'role_auditor' in group_names %} -agent: - node: "auditor-{{ ansible_hostname }}" - bind: "{{ ansible_eth0.ipv4.address }}:8100" - metrics: "{{ ansible_eth0.ipv4.address }}:18100" - join: -{% for host in groups['name_qed-0'] %} - - "{{ hostvars[host]['ansible_eth0']['ipv4']['address'] }}:8400" -{% endfor %} - server_urls: - - "{% for host in groups['role_qed'] %}http://{{ hostvars[host]['ansible_eth0']['ipv4']['address'] }}:8800{% if not loop.last %},{% endif %}{% endfor %}" -{% for host in groups['role_storage'] %} - alerts_urls: - - "http://{{ hostvars[host]['ansible_eth0']['ipv4']['address'] }}:8888" - snapshots_store_urls: - - "http://{{ hostvars[host]['ansible_eth0']['ipv4']['address'] }}:8888" -{% endfor %} -{% endif %} -{% if 'role_publisher' in group_names %} -agent: - node: "publisher-{{ ansible_hostname }}" - bind: "{{ ansible_eth0.ipv4.address }}:8300" - metrics: "{{ ansible_eth0.ipv4.address }}:18300" - join: -{% for host in groups['name_qed-0'] %} - - "{{ hostvars[host]['ansible_eth0']['ipv4']['address'] }}:8400" -{% endfor %} - server_urls: - - "{% for host in groups['role_qed'] %}http://{{ hostvars[host]['ansible_eth0']['ipv4']['address'] }}:8800{% if not loop.last %},{% endif %}{% endfor %}" -{% for host in groups['role_storage'] %} - alerts_urls: - - "http://{{ hostvars[host]['ansible_eth0']['ipv4']['address'] }}:8888" - snapshots_store_urls: - - "http://{{ hostvars[host]['ansible_eth0']['ipv4']['address'] }}:8888" -{% endfor %} -{% endif %} diff --git a/deploy/aws/provision/templates/qed-start.sh.j2 b/deploy/aws/provision/templates/qed-start.sh.j2 index 1b1b2b789..f7e25c35b 100644 --- 
a/deploy/aws/provision/templates/qed-start.sh.j2 +++ b/deploy/aws/provision/templates/qed-start.sh.j2 @@ -15,18 +15,52 @@ #} #!/bin/bash -ulimit -n 819200 - export QED_HOME=/var/qed {% if 'role_qed' in group_names %} -$QED_HOME/qed start +$QED_HOME/qed server start \ +--log info \ +--api-key key \ +--db-path /var/qed/db \ +--gossip-addr "{{ ansible_eth0.ipv4.address }}:8400" \ +--http-addr "{{ ansible_eth0.ipv4.address }}:8800" \ +--metrics-addr "{{ ansible_eth0.ipv4.address }}:8600" \ +--mgmt-addr "{{ ansible_eth0.ipv4.address }}:8700" \ +--node-id server-{{ansible_hostname}} \ +--private-key-path /var/qed/id_ed25519 \ +--raft-addr "{{ ansible_eth0.ipv4.address }}:8500" \ +{% if groups.role_qed.index(inventory_hostname) != 0 %} +{% for host in groups['name_qed-0'] %} +--raft-join-addr "{{ hostvars[host]['ansible_eth0']['ipv4']['address'] }}:8700" \ +--gossip-join-addr "{{ hostvars[host]['ansible_eth0']['ipv4']['address'] }}:8400" \ +{% endfor %} +{% endif %} +--raft-path /var/qed/wal {% endif %} + {% if 'role_monitor' in group_names %} -$QED_HOME/qed agent monitor +$QED_HOME/qed agent monitor \ +--role monitor \ +--node-name monitor-{{ansible_hostname}} \ +--qed-endpoints "{% for host in groups['role_qed'] %}http://{{ hostvars[host]['ansible_eth0']['ipv4']['address'] }}:8800{% if not loop.last %},{% endif %}{% endfor %}" \ {% endif %} {% if 'role_auditor' in group_names %} -$QED_HOME/qed agent auditor +$QED_HOME/qed agent auditor \ +--role auditor \ +--node-name auditor-{{ansible_hostname}} \ +--qed-endpoints "{% for host in groups['role_qed'] %}http://{{ hostvars[host]['ansible_eth0']['ipv4']['address'] }}:8800{% if not loop.last %},{% endif %}{% endfor %}" \ {% endif %} {% if 'role_publisher' in group_names %} -$QED_HOME/qed agent publisher +$QED_HOME/qed agent publisher \ +--role publisher \ +--node-name publisher-{{ansible_hostname}} \ +{% endif %} +{% if 'role_monitor' in group_names or 'role_auditor' in group_names or 'role_publisher' in group_names %} 
+--bind-addr "{{ ansible_eth0.ipv4.address }}:8100" \ +--metrics-addr "{{ ansible_eth0.ipv4.address }}:18100" \ +--start-join "{% for host in groups['role_qed'] %}{{ hostvars[host]['ansible_eth0']['ipv4']['address'] }}:8400{% if not loop.last %},{% endif %}{% endfor %}" \ +{% for host in groups['role_storage'] %} +--notifier-endpoint http://{{ hostvars[host]['ansible_eth0']['ipv4']['address'] }}:8888 \ +--store-endpoint http://{{ hostvars[host]['ansible_eth0']['ipv4']['address'] }}:8888 \ +--log info +{% endfor %} {% endif %} diff --git a/deploy/aws/provision/templates/qed.service.j2 b/deploy/aws/provision/templates/qed.service.j2 index 1d5690026..3f29bc55d 100644 --- a/deploy/aws/provision/templates/qed.service.j2 +++ b/deploy/aws/provision/templates/qed.service.j2 @@ -29,7 +29,7 @@ RuntimeDirectory=root RuntimeDirectoryMode=0750 ExecStart=/var/qed/qed-start.sh ExecStop=/var/qed/qed-stop.sh -LimitNOFILE=10000 +LimitNOFILE=819200 TimeoutStopSec=20 [Install] diff --git a/go.mod b/go.mod index 2a627cc5c..da1d31012 100644 --- a/go.mod +++ b/go.mod @@ -19,10 +19,12 @@ require ( github.com/inconshreveable/mousetrap v1.0.0 // indirect github.com/kr/pretty v0.1.0 // indirect github.com/mitchellh/go-homedir v1.1.0 + github.com/octago/sflags v0.2.0 github.com/pkg/errors v0.8.0 github.com/prometheus/client_golang v0.9.2 github.com/prometheus/procfs v0.0.0-20190328153300-af7bedc223fb // indirect github.com/spf13/cobra v0.0.3 + github.com/spf13/pflag v1.0.3 github.com/spf13/viper v1.3.1 github.com/stretchr/testify v1.2.2 golang.org/x/crypto v0.0.0-20190228161510-8dd112bcdc25 diff --git a/go.sum b/go.sum index aaa15c07a..ad5c5ec54 100644 --- a/go.sum +++ b/go.sum @@ -71,6 +71,8 @@ github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= github.com/mitchellh/mapstructure v1.1.2 h1:fmNYVwqnSfB9mZU6OS2O6GsXM+wcskZDuKQzvN1EDeE= github.com/mitchellh/mapstructure 
v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= +github.com/octago/sflags v0.2.0 h1:XceYzkRXGAHa/lSFmKLcaxSrsh4MTuOMQdIGsUD0wlk= +github.com/octago/sflags v0.2.0/go.mod h1:G0bjdxh4qPRycF74a2B8pU36iTp9QHGx0w0dFZXPt80= github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c h1:Lgl0gzECD8GnQ5QCWA8o6BtfL6mDH5rQgM4/fX3avOs= github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= github.com/pelletier/go-toml v1.2.0 h1:T5zMGML61Wp+FlcbWjRDT7yAxhJNAiPPLOFECq181zc= diff --git a/gossip/agent.go b/gossip/agent.go index 13e349ab6..96661217a 100644 --- a/gossip/agent.go +++ b/gossip/agent.go @@ -21,165 +21,300 @@ import ( "sync" "time" - "github.com/bbva/qed/gossip/member" - "github.com/bbva/qed/hashing" + "github.com/bbva/qed/client" "github.com/bbva/qed/log" "github.com/bbva/qed/metrics" - "github.com/bbva/qed/protocol" - "github.com/coocood/freecache" "github.com/hashicorp/memberlist" + "github.com/prometheus/client_golang/prometheus" ) -type hashedBatch struct { - batch *protocol.BatchSnapshots - digest hashing.Digest -} - +// Agent exposes the necesary API to interact with +// the gossip network, the snapshot store, the +// QED log and the alerts store. +// +// The agent API enables QED users to implement +// and integrate its own tools and services with +// QED. type Agent struct { - config *Config - Self *member.Peer + // stateLock is used to protect critical information + // from cocurrent access from the gossip + // network + stateLock sync.Mutex + + // parameters from command line + // interface + config Config - metricsServer *metrics.Server + // Self stores the peer information corresponding to + // this agent instance. It is used to make routing + // decissions. + Self *Peer - memberlist *memberlist.Memberlist + // metricsServer exposes an HTTP service with + // all the its metrics and also its processors + // metrics. 
+ metrics *metrics.Server + + // gossip gives access to the + // memberlist API to interact with the network + // and its members + gossip *memberlist.Memberlist + + // broadcasts gives access to the + // broadcast gossip API broadcasts *memberlist.TransmitLimitedQueue - Topology *Topology + // Topology holds the network topology + // as this agent instance sees it. + topology *Topology - stateLock sync.Mutex + // processors enqueue tasks to be executed by + // the tasks manager. They need to create + // the context for each task to be able to execute. + processors map[string]Processor + + // timeout signals when the default timeout has passed + // to end an enqueue operation + timeout *time.Ticker + + // A cached KV to be used by processors and tasks + Cache Cache + + // In channel receives messages from the gossip + // network to be processed by the agent + In MessageBus + + // Out channel enqueue the messages to be forwarded to + // other gossip agents + Out MessageBus + + // quitCh channels signal the graceful shutdown + // of the agent + quitCh chan bool + + // Client to a running QED + Qed *client.HTTPClient + + //Client to a notification service + Notifier Notifier + + // Client to a snapshot store service + SnapshotStore SnapshotStore + + //Client to a task manager service + Tasks TasksManager +} + +// Creates new agent from a configuration object +// It does not create external clients like QED, SnapshotStore or Notifier, nor +// a task manager. +func NewAgentFromConfig(conf *Config) (agent *Agent, err error) { + options, err := configToOptions(conf) + if err != nil { + return nil, err + } + return NewAgent(options...) +} - processed *freecache.Cache - processors []Processor +// Returns a new agent with all the APIs initialized and +// with a cache of size bytes. 
+func NewDefaultAgent(conf *Config, qed *client.HTTPClient, s SnapshotStore, t TasksManager, n Notifier) (*Agent, error) { + options, err := configToOptions(conf) + if err != nil { + return nil, err + } + options = append(options, SetQEDClient(qed), SetSnapshotStore(s), SetTasksManager(t), SetNotifier(n)) - In chan *hashedBatch - Out chan *protocol.BatchSnapshots - quit chan bool + return NewAgent(options...) } -func NewAgent(conf *Config, p []Processor, m *metrics.Server) (agent *Agent, err error) { - log.Infof("New agent %s\n", conf.NodeName) - agent = &Agent{ - config: conf, - metricsServer: m, - Topology: NewTopology(), - processors: p, - processed: freecache.NewCache(1 << 20), - In: make(chan *hashedBatch, 1<<16), - Out: make(chan *protocol.BatchSnapshots, 1<<16), - quit: make(chan bool), +// NewAgent returns a configured and started agent or error if +// it cannot be created. +// On return, the agent is already connected to the gossip network +// but it will not process any information. +// It will though enqueue request as soon as it is created. When those +// queues are full, messages will start to be dropped silently. 
+func NewAgent(options ...AgentOptionF) (*Agent, error) { + agent := &Agent{ + quitCh: make(chan bool), + topology: NewTopology(), + } + + // Run the options on the client + for _, option := range options { + if err := option(agent); err != nil { + return nil, err + } } - bindIP, bindPort, err := conf.AddrParts(conf.BindAddr) + bindIP, bindPort, err := agent.config.AddrParts(agent.config.BindAddr) if err != nil { return nil, fmt.Errorf("Invalid bind address: %s", err) } var advertiseIP string var advertisePort int - if conf.AdvertiseAddr != "" { - advertiseIP, advertisePort, err = conf.AddrParts(conf.AdvertiseAddr) + if agent.config.AdvertiseAddr != "" { + advertiseIP, advertisePort, err = agent.config.AddrParts(agent.config.AdvertiseAddr) if err != nil { return nil, fmt.Errorf("Invalid advertise address: %s", err) } } - conf.MemberlistConfig = memberlist.DefaultLocalConfig() - conf.MemberlistConfig.BindAddr = bindIP - conf.MemberlistConfig.BindPort = bindPort - conf.MemberlistConfig.AdvertiseAddr = advertiseIP - conf.MemberlistConfig.AdvertisePort = advertisePort - conf.MemberlistConfig.Name = conf.NodeName - conf.MemberlistConfig.Logger = log.GetLogger() + agent.config.MemberlistConfig = memberlist.DefaultLocalConfig() + agent.config.MemberlistConfig.BindAddr = bindIP + agent.config.MemberlistConfig.BindPort = bindPort + agent.config.MemberlistConfig.AdvertiseAddr = advertiseIP + agent.config.MemberlistConfig.AdvertisePort = advertisePort + agent.config.MemberlistConfig.Name = agent.config.NodeName + agent.config.MemberlistConfig.Logger = log.GetLogger() + // Configure delegates - conf.MemberlistConfig.Delegate = newAgentDelegate(agent) - conf.MemberlistConfig.Events = &eventDelegate{agent} - agent.Self = member.NewPeer(conf.NodeName, advertiseIP, uint16(advertisePort), conf.Role) + agent.config.MemberlistConfig.Delegate = newAgentDelegate(agent) + agent.config.MemberlistConfig.Events = &eventDelegate{agent} + + agent.Self = NewPeer(agent.config.NodeName, 
advertiseIP, uint16(advertisePort), agent.config.Role) + + return agent, nil +} - agent.memberlist, err = memberlist.Create(conf.MemberlistConfig) +// Enables the processing engines of the +// agent +func (a *Agent) Start() { + var err error + + if a.metrics != nil { + log.Infof("Starting agent metrics server") + a.metrics.Start() + } + + if a.Tasks != nil { + log.Infof("Starting task mamanger loop") + a.Tasks.Start() + } + + if a.Notifier != nil { + log.Infof("Starting notifier mamanger loop") + a.Notifier.Start() + } + + log.Infof("Starting memberlist gossip netwotk") + a.gossip, err = memberlist.Create(a.config.MemberlistConfig) if err != nil { - return nil, err + log.Infof("Error creating the memberlist network; %v", err) + return } // Print local member info - agent.Self = member.ParsePeer(agent.memberlist.LocalNode()) - log.Infof("Local member %+v", agent.Self) + a.Self = ParsePeer(a.gossip.LocalNode()) + log.Infof("Local member %+v", a.Self) // Set broadcast queue - agent.broadcasts = &memberlist.TransmitLimitedQueue{ + a.broadcasts = &memberlist.TransmitLimitedQueue{ NumNodes: func() int { - return agent.memberlist.NumMembers() + return a.gossip.NumMembers() }, RetransmitMult: 2, } - if p != nil { - go agent.start() - } - - return agent, nil -} - -// Send a batch into a queue channel with the agent TimeoutQueues timeout. 
-func (a *Agent) ChTimedSend(batch *protocol.BatchSnapshots, ch chan *protocol.BatchSnapshots) { - for { - select { - case <-time.After(a.config.TimeoutQueues): - log.Infof("Agent timed out enqueueing batch in out channel") - return - case ch <- batch: + if len(a.config.StartJoin) > 0 { + log.Infof("Trying to joing gossip network with peers %v", a.config.StartJoin) + n, err := a.Join(a.config.StartJoin) + if n == 0 || err != nil { + log.Errorf("Unable to join gossip network because %v", err) return } + log.Infof("Joined gossip network with %d peers", n) } -} -func (a *Agent) start() { + log.Infof("Starting agent sender loop") + a.sender() +} - for _, p := range a.processors { - p.RegisterMetrics(a.metricsServer) - } +// Register a new processor into the agent, to add some tasks per batch +// to be executed by the task manager. +func (a *Agent) RegisterProcessor(name string, p Processor) { + a.stateLock.Lock() + defer a.stateLock.Unlock() + a.processors[name] = p + a.RegisterMetrics(p.Metrics()) +} - go func() { - a.metricsServer.Start() - }() +// Deregister a processor per name. It will not fail if the +// processor does not exist. +func (a *Agent) DeregisterProcessor(name string) { + a.stateLock.Lock() + defer a.stateLock.Unlock() + delete(a.processors, name) +} - for { - select { - case hashedBatch := <-a.In: - _, err := a.processed.Get(hashedBatch.digest) - if err == nil { - continue - } - a.processed.Set(hashedBatch.digest, []byte{0x0}, 0) +// Register a slice of collectors in the agent metrics server +func (a *Agent) RegisterMetrics(cs []prometheus.Collector) { + a.metrics.MustRegister(cs...) +} - for _, p := range a.processors { - go p.Process(hashedBatch.batch) +// Registers the agent in all output channels to send +// all the messages in the bus to other peers. 
+// +// Sender will create MaxSenders goroutines to send +// messages for all channels +func (a *Agent) sender() { + var wg sync.WaitGroup + var counter int + for i := 0; i < MAXMESSAGEID; i++ { + ch := make(chan *Message, 255) + a.Out.pool[i] = append(a.Out.pool[i], ch) + + go func(ch chan *Message) { + for { + select { + case msg := <-ch: + // as soon as we have a batch ready for retransmission, we try to send + // it after applying all the routing constraints + go func() { + wg.Add(1) + defer wg.Done() + log.Debugf("Agent sender loop: sending msg!") + a.Send(msg) + }() + if counter >= a.config.MaxSenders { + wg.Wait() + counter = 0 + } + counter++ + case <-a.quitCh: + return + } } - a.ChTimedSend(hashedBatch.batch, a.Out) - case b := <-a.Out: - go a.send(b) - case <-a.quit: - return - } + }(ch) } } -func (a *Agent) send(batch *protocol.BatchSnapshots) { - - if batch.TTL <= 0 { +// Sends a batch using the gossip network reliable transport +// to other nodes based on the routing policy applied +func (a *Agent) Send(msg *Message) { + // if ttl is 0, the message dies here + if msg.TTL == 0 { return } - batch.TTL -= 1 - from := batch.From - batch.From = a.Self - msg, _ := batch.Encode() - for _, dst := range a.route(from) { + msg.TTL-- + wire, err := msg.Encode() + if err != nil { + log.Infof("Agent Send unable to encode message to gossip it") + return + } + msg.From = a.Self + for _, dst := range a.route(msg.From) { log.Debugf("Sending batch to %+v\n", dst.Name) - a.memberlist.SendReliable(dst, msg) + a.gossip.SendReliable(dst, wire) } } -func (a *Agent) route(src *member.Peer) []*memberlist.Node { +// Returns the list of nodes to which a batch can be sent +// given the source of the communication and the internal +// agent topology.
+func (a *Agent) route(src *Peer) []*memberlist.Node { var excluded PeerList dst := make([]*memberlist.Node, 0) @@ -187,47 +322,50 @@ func (a *Agent) route(src *member.Peer) []*memberlist.Node { excluded.L = append(excluded.L, src) excluded.L = append(excluded.L, a.Self) - peers := a.Topology.Each(1, &excluded) + peers := a.topology.Each(1, &excluded) for _, p := range peers.L { dst = append(dst, p.Node()) } return dst } -// Join asks the Agent instance to join. +// Join asks the Agent instance to join +// the nodes at the given addresses. func (a *Agent) Join(addrs []string) (int, error) { - if a.State() != member.Alive { + if a.State() != AgentStatusAlive { return 0, fmt.Errorf("Agent can't join after Leave or Shutdown") } if len(addrs) > 0 { - return a.memberlist.Join(addrs) + return a.gossip.Join(addrs) } return 0, nil } +// Leave asks the agent to leave the gossip +// network gracefully, communicating to others +// that this agent wants to leave. func (a *Agent) Leave() error { - // Check the current state a.stateLock.Lock() switch a.Self.Status { - case member.Left: + case AgentStatusLeft: a.stateLock.Unlock() return nil - case member.Leaving: + case AgentStatusLeaving: a.stateLock.Unlock() return fmt.Errorf("Leave already in progress") - case member.Shutdown: + case AgentStatusShutdown: a.stateLock.Unlock() return fmt.Errorf("Leave called after Shutdown") default: - a.Self.Status = member.Leaving + a.Self.Status = AgentStatusLeaving a.stateLock.Unlock() } // Attempt the memberlist leave - err := a.memberlist.Leave(a.config.BroadcastTimeout) + err := a.gossip.Leave(a.config.BroadcastTimeout) if err != nil { return err } @@ -239,17 +377,17 @@ func (a *Agent) Leave() error { // any probes from other agents before they learn about us leaving.
time.Sleep(a.config.LeavePropagateDelay) - // Transition to Left only if we not already shutdown + // Transition to AgentStatusLeft only if we are not already shut down a.stateLock.Lock() - if a.Self.Status != member.Shutdown { - a.Self.Status = member.Left + if a.Self.Status != AgentStatusShutdown { + a.Self.Status = AgentStatusLeft } a.stateLock.Unlock() return nil } -// Shutdown forcefully shuts down the Agent instance, stopping all network +// Shutdown forcefully shuts down the Agent instance, stopping all network // activity and background maintenance associated with the instance. // // This is not a graceful shutdown, and should be preceded by a call @@ -262,39 +400,58 @@ func (a *Agent) Shutdown() error { a.stateLock.Lock() defer a.stateLock.Unlock() - a.metricsServer.Shutdown() - - if a.Self.Status == member.Shutdown { + if a.Self.Status == AgentStatusShutdown { return nil } - if a.Self.Status != member.Left { + if a.Self.Status != AgentStatusLeft { log.Info("agent: Shutdown without a Leave") } - a.Self.Status = member.Shutdown - err := a.memberlist.Shutdown() + a.Self.Status = AgentStatusShutdown + err := a.gossip.Shutdown() if err != nil { return err } + close(a.quitCh) + + if a.metrics != nil { + a.metrics.Shutdown() + } + + if a.Tasks != nil { + a.Tasks.Stop() + } + + if a.Notifier != nil { + a.Notifier.Stop() + } return nil } +// Returns the memberlist object to manage the gossip api +// directly func (a *Agent) Memberlist() *memberlist.Memberlist { - return a.memberlist + return a.gossip } +// Returns the broadcast facility to manage broadcast messages +// directly func (a *Agent) Broadcasts() *memberlist.TransmitLimitedQueue { return a.broadcasts } +// Returns this agent IP address and port func (a *Agent) GetAddrPort() (net.IP, uint16) { - n := a.memberlist.LocalNode() + n := a.gossip.LocalNode() return n.Addr, n.Port } -func (a *Agent) State() member.Status { +// Returns this agent status.
This can be used to +// check if we should stop doing something based +// on the state of the agent in the gossip network. +func (a *Agent) State() Status { a.stateLock.Lock() defer a.stateLock.Unlock() return a.Self.Status diff --git a/gossip/agent_test.go b/gossip/agent_test.go index 9641cab61..dc2bc3025 100644 --- a/gossip/agent_test.go +++ b/gossip/agent_test.go @@ -21,39 +21,37 @@ import ( "testing" "github.com/stretchr/testify/require" - - "github.com/bbva/qed/gossip/member" - "github.com/bbva/qed/metrics" ) func TestJoin(t *testing.T) { conf := DefaultConfig() conf.NodeName = "testNode" - conf.Role = member.Auditor + conf.Role = "auditor" conf.BindAddr = "127.0.0.1:12345" - metricsServer := metrics.NewServer("127.0.0.2:23464") - a, _ := NewAgent(conf, []Processor{FakeProcessor{}}, metricsServer) + + a, _ := NewAgentFromConfig(conf) + a.Start() testCases := []struct { - agentState member.Status + agentState Status addrs []string expectedContactedHosts int expectedErr error }{ { - member.Alive, + AgentStatusAlive, []string{}, 0, nil, }, { - member.Failed, + AgentStatusFailed, []string{}, 0, fmt.Errorf("Agent can't join after Leave or Shutdown"), }, { - member.Alive, + AgentStatusAlive, []string{"127.0.0.1:12345"}, 1, nil, @@ -66,45 +64,46 @@ func TestJoin(t *testing.T) { require.Equal(t, c.expectedContactedHosts, result, "Wrong expected contacted hosts in test %d.", i) require.Equal(t, c.expectedErr, err, "Wrong expected error in test %d.", i) } + a.Shutdown() } func TestLeave(t *testing.T) { conf := DefaultConfig() conf.NodeName = "testNode" - conf.Role = member.Auditor + conf.Role = "auditor" conf.BindAddr = "127.0.0.1:12346" - metricsServer := metrics.NewServer("127.0.0.2:13445") - a, _ := NewAgent(conf, []Processor{FakeProcessor{}}, metricsServer) + a, _ := NewAgentFromConfig(conf) + a.Start() testCases := []struct { - agentState member.Status + agentState Status expectedErr error - finalStatus member.Status + finalStatus Status }{ { - member.Left, + 
AgentStatusLeft, nil, - member.Left, + AgentStatusLeft, }, { - member.Leaving, + AgentStatusLeaving, fmt.Errorf("Leave already in progress"), - member.Leaving, + AgentStatusLeaving, }, { - member.Shutdown, + AgentStatusShutdown, fmt.Errorf("Leave called after Shutdown"), - member.Shutdown, + AgentStatusShutdown, }, { - member.Alive, + AgentStatusAlive, nil, - member.Left, + AgentStatusLeft, }, { - member.Failed, + AgentStatusFailed, nil, - member.Left, + AgentStatusLeft, }, } @@ -114,53 +113,5 @@ func TestLeave(t *testing.T) { require.Equal(t, c.expectedErr, err, "Wrong expected error in test %d.", i) require.Equal(t, c.finalStatus, a.Self.Status, "Wrong expected status in test %d.", i) } -} - -func TestShutdown(t *testing.T) { - - conf := DefaultConfig() - conf.NodeName = "testNode" - conf.Role = member.Auditor - conf.BindAddr = "127.0.0.1:12347" - metricsServer := metrics.NewServer("127.0.0.2:43512") - a, _ := NewAgent(conf, []Processor{FakeProcessor{}}, metricsServer) - - testCases := []struct { - agentState member.Status - expectedErr error - finalStatus member.Status - }{ - { - member.Shutdown, - nil, - member.Shutdown, - }, - { - member.Left, - nil, - member.Shutdown, - }, - { - member.Alive, - nil, - member.Shutdown, - }, - { - member.Failed, - nil, - member.Shutdown, - }, - { - member.Leaving, - nil, - member.Shutdown, - }, - } - - for i, c := range testCases { - a.Self.Status = c.agentState - err := a.Shutdown() - require.Equal(t, c.expectedErr, err, "Wrong expected error in test %d.", i) - require.Equal(t, c.finalStatus, a.Self.Status, "Wrong expected status in test %d.", i) - } + a.Shutdown() } diff --git a/gossip/auditor/auditor.go b/gossip/auditor/auditor.go deleted file mode 100644 index 0da99b067..000000000 --- a/gossip/auditor/auditor.go +++ /dev/null @@ -1,280 +0,0 @@ -/* - Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. 
- - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -package auditor - -import ( - "bytes" - "crypto/tls" - "fmt" - "io" - "io/ioutil" - "net/http" - "time" - - "github.com/bbva/qed/client" - "github.com/bbva/qed/hashing" - "github.com/bbva/qed/log" - "github.com/bbva/qed/metrics" - "github.com/bbva/qed/protocol" - "github.com/pkg/errors" - - "github.com/prometheus/client_golang/prometheus" -) - -var ( - QedAuditorInstancesCount = prometheus.NewGauge( - prometheus.GaugeOpts{ - Name: "qed_auditor_instances_count", - Help: "Number of auditor agents running.", - }, - ) - - QedAuditorBatchesProcessSeconds = prometheus.NewSummary( - prometheus.SummaryOpts{ - Name: "qed_auditor_batches_process_seconds", - Help: "Duration of Auditor batch processing", - }, - ) - - QedAuditorBatchesReceivedTotal = prometheus.NewCounter( - prometheus.CounterOpts{ - Name: "qed_auditor_batches_received_total", - Help: "Number of batches received by auditors.", - }, - ) - - QedAuditorGetMembershipProofErrTotal = prometheus.NewCounter( - prometheus.CounterOpts{ - Name: "qed_auditor_get_membership_proof_err_total", - Help: "Number of errors trying to get membership proofs by auditors.", - }, - ) -) - -type Config struct { - QEDUrls []string - PubUrls []string - AlertsUrls []string - APIKey string - TaskExecutionInterval time.Duration - MaxInFlightTasks int - MetricsAddr string -} - -func DefaultConfig() *Config { - return &Config{ - TaskExecutionInterval: 200 * time.Millisecond, - MaxInFlightTasks: 10, - } -} 
- -type Auditor struct { - qed *client.HTTPClient - conf Config - - taskCh chan Task - quitCh chan bool - executionTicker *time.Ticker -} - -type Task interface { - Do() -} - -func NewAuditor(conf Config) (*Auditor, error) { - QedAuditorInstancesCount.Inc() - // QED client - transport := http.DefaultTransport.(*http.Transport) - transport.TLSClientConfig = &tls.Config{InsecureSkipVerify: false} - httpClient := http.DefaultClient - httpClient.Transport = transport - qed, err := client.NewHTTPClient( - client.SetHttpClient(httpClient), - client.SetURLs(conf.QEDUrls[0], conf.QEDUrls[1:]...), - client.SetAPIKey(conf.APIKey), - client.SetReadPreference(client.Any), - client.SetAttemptToReviveEndpoints(true), - ) - if err != nil { - return nil, errors.Wrap(err, "Cannot start http client: ") - } - - auditor := Auditor{ - qed: qed, - conf: conf, - taskCh: make(chan Task, 100), - quitCh: make(chan bool), - } - - auditor.executionTicker = time.NewTicker(conf.TaskExecutionInterval) - go auditor.runTaskDispatcher() - - return &auditor, nil -} - -func (a Auditor) RegisterMetrics(srv *metrics.Server) { - metrics := []prometheus.Collector{ - QedAuditorInstancesCount, - QedAuditorBatchesProcessSeconds, - QedAuditorBatchesReceivedTotal, - QedAuditorGetMembershipProofErrTotal, - } - srv.MustRegister(metrics...) 
-} - -func (a Auditor) runTaskDispatcher() { - for { - select { - case <-a.executionTicker.C: - go a.dispatchTasks() - case <-a.quitCh: - a.executionTicker.Stop() - return - } - } -} - -func (a Auditor) dispatchTasks() { - count := 0 - var task Task - - for { - select { - case task = <-a.taskCh: - go task.Do() - count++ - default: - return - } - if count >= a.conf.MaxInFlightTasks { - return - } - } -} - -func (a Auditor) Process(b *protocol.BatchSnapshots) { - QedAuditorBatchesReceivedTotal.Inc() - timer := prometheus.NewTimer(QedAuditorBatchesProcessSeconds) - defer timer.ObserveDuration() - - task := &MembershipTask{ - qed: a.qed, - pubUrl: a.conf.PubUrls[0], - alertsUrl: a.conf.AlertsUrls[0], - taskCh: a.taskCh, - retries: 2, - s: b.Snapshots[0], - } - - a.taskCh <- task -} - -func (a *Auditor) Shutdown() { - QedAuditorInstancesCount.Dec() - a.executionTicker.Stop() - a.quitCh <- true - close(a.quitCh) - close(a.taskCh) - log.Debugf("Auditor stopped.") -} - -type MembershipTask struct { - qed *client.HTTPClient - pubUrl string - alertsUrl string - taskCh chan Task - retries int - s *protocol.SignedSnapshot -} - -func (t *MembershipTask) Do() { - - proof, err := t.qed.MembershipDigest(t.s.Snapshot.EventDigest, t.s.Snapshot.Version) - if err != nil { - log.Infof("Auditor is unable to get membership proof from QED server: %s", err.Error()) - - switch fmt.Sprintf("%T", err) { - case "*errors.errorString": - t.sendAlert(fmt.Sprintf("Auditor is unable to get membership proof from QED server: %s", err.Error())) - default: - QedAuditorGetMembershipProofErrTotal.Inc() - } - - return - } - - snap, err := t.getSnapshot(proof.CurrentVersion) - if err != nil { - log.Infof("Unable to get snapshot from storage: %v", err) - if t.retries > 0 { - log.Infof("Enqueue another try to grt snapshot from storage") - t.retries -= 1 - t.taskCh <- t - } - return - } - - checkSnap := &protocol.Snapshot{ - HistoryDigest: t.s.Snapshot.HistoryDigest, - HyperDigest: snap.Snapshot.HyperDigest, 
- Version: t.s.Snapshot.Version, - EventDigest: t.s.Snapshot.EventDigest, - } - - ok := t.qed.DigestVerify(proof, checkSnap, hashing.NewSha256Hasher) - if !ok { - t.sendAlert(fmt.Sprintf("Unable to verify snapshot %v", t.s.Snapshot)) - log.Infof("Unable to verify snapshot %v", t.s.Snapshot) - } - - log.Infof("MembershipTask.Do(): Snapshot %v has been verified by QED", t.s.Snapshot) -} - -func (t MembershipTask) getSnapshot(version uint64) (*protocol.SignedSnapshot, error) { - resp, err := http.Get(fmt.Sprintf("%s/snapshot?v=%d", t.pubUrl, version)) - if err != nil { - return nil, fmt.Errorf("Error getting snapshot from the store: %v", err) - } - defer resp.Body.Close() - if resp.StatusCode != http.StatusOK { - return nil, fmt.Errorf("Error getting snapshot from the store. Status: %d", resp.StatusCode) - } - buf, err := ioutil.ReadAll(resp.Body) - if err != nil { - log.Infof("Error reading request body: %v", err) - } - var s protocol.SignedSnapshot - err = s.Decode(buf) - if err != nil { - return nil, fmt.Errorf("Error decoding signed snapshot %d codec", t.s.Snapshot.Version) - } - return &s, nil -} - -func (t MembershipTask) sendAlert(msg string) { - resp, err := http.Post(t.alertsUrl+"/alert", "application/json", - bytes.NewBufferString(msg)) - if err != nil { - log.Infof("Error saving batch in alertStore: %v", err) - return - } - defer resp.Body.Close() - _, err = io.Copy(ioutil.Discard, resp.Body) - if err != nil { - log.Infof("Error reading request body: %v", err) - } -} diff --git a/gossip/bus.go b/gossip/bus.go new file mode 100644 index 000000000..0e593e177 --- /dev/null +++ b/gossip/bus.go @@ -0,0 +1,144 @@ +/* + Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +package gossip + +import ( + "sync" + + "github.com/bbva/qed/log" +) + +// A Subscriber to agent gossip message queues +// is like: +// func (my *BatchProcessor) Subscribe(id int, ch <-chan *Message) { +// my.id = id +// my.inCh = ch +// } +// and receives a channel to read published messages from +// the selected MessageType +type Subscriber interface { + Subscribe(id int, ch <-chan *Message) +} + +// A Producer fills the chan *Message with the +// gossip messages to be consumed by a subscriber +type Producer interface { + Produce(ch chan<- *Message) +} + +type Subscribers []chan *Message + +// Implements a subscriber / publisher model for +// gossip Messages +type MessageBus struct { + pool [MAXMESSAGEID]Subscribers + rm sync.RWMutex +} + +// Publish a message to all the subscribers of its MessageType. +// +// If there is no subscriber the message will not be sent and will be lost. +// +// All the subscribers will get all the messages. If a subscriber is busy +// it will block delivery to the next subscribers. Also +// publish will create a goroutine per message sent, and +// will not time out. +// +func (eb *MessageBus) Publish(msg *Message) error { + eb.rm.RLock() + defer eb.rm.RUnlock() + if chans := eb.pool[msg.Kind]; len(chans) > 0 { + log.Debugf("Agent message bus publising message to %d subscribers", len(chans)) + channels := append(chans[:0:0], chans...)
+ go func(msg *Message, subscribers Subscribers) { + for _, s := range subscribers { + s <- msg + } + }(msg, channels) + return nil + } + log.Infof("Agent message bus publising message: no subscribers for message kind %d ", msg.Kind) + return NoSubscribersFound +} + +// Subscribe adds a subscriber to the pool of its MessageType. +// +// A channel buffered to size is created for the subscriber and handed +// to it through s.Subscribe together with the subscription id: the +// 1-based position of the channel in the pool. Nothing is returned; +// the subscriber must keep the id if it wants to Unsubscribe later. +func (eb *MessageBus) Subscribe(t MessageType, s Subscriber, size int) { + eb.rm.Lock() + defer eb.rm.Unlock() + ch := make(chan *Message, size) + + if eb.pool[t] != nil { + eb.pool[t] = append(eb.pool[t], ch) + } else { + eb.pool[t] = append(Subscribers{}, ch) + } + + s.Subscribe(len(eb.pool[t]), ch) +} + +// Unsubscribe removes from the pool of t the subscriber that received +// id in its Subscribe call. Ids that do not match a position in the +// pool are ignored. +// +// Removing a subscriber shifts the ones registered after it, so the +// ids they hold no longer match their position in the pool. +func (eb *MessageBus) Unsubscribe(t MessageType, id int) { + eb.rm.Lock() + defer eb.rm.Unlock() + + // ids handed out by Subscribe are 1-based positions in the pool + i := id - 1 + if i < 0 || i >= len(eb.pool[t]) { + return + } + eb.pool[t] = append(eb.pool[t][:i], eb.pool[t][i+1:]...) +} + +// Implements a message queue in which +// subscribers consume the Messages +// sent by producers. +// +// There is a queue for each kind of message, +// and all the producers and subscribers +// will operate over the same chan *Message. +// +// This pattern allows a pool of subscribers to +// consume messages from a pool of producers +// without blocking.
+type MessageQueue struct { + size int + queue [MAXMESSAGEID]chan *Message + rm sync.RWMutex +} + +// Producer registers a producer to the MessageType queue. +// The queue is created on first use with capacity mq.size and its +// send side is handed to p.Produce. +func (mq *MessageQueue) Producer(t MessageType, p Producer) { + // exclusive lock: the queue slot may be written below + mq.rm.Lock() + defer mq.rm.Unlock() + + if mq.queue[t] == nil { + mq.queue[t] = make(chan *Message, mq.size) + } + p.Produce(mq.queue[t]) +} + +// Consumer registers a consumer to the MessageType queue. +// The queue is created on first use with capacity mq.size and its +// receive side is handed to s.Subscribe with id 0. +func (mq *MessageQueue) Consumer(t MessageType, s Subscriber) { + // exclusive lock: the queue slot may be written below + mq.rm.Lock() + defer mq.rm.Unlock() + if mq.queue[t] == nil { + mq.queue[t] = make(chan *Message, mq.size) + } + s.Subscribe(0, mq.queue[t]) +} + +// Cancel signals all producers and consumers of the MessageType +// queue to stop by closing the internal channel. +// +// It does nothing when the queue was never created or was already +// cancelled. The slot is cleared after closing, so a later Producer +// or Consumer call creates a fresh queue. +func (mq *MessageQueue) Cancel(t MessageType) { + mq.rm.Lock() + defer mq.rm.Unlock() + // closing a nil or an already closed channel panics + if mq.queue[t] == nil { + return + } + close(mq.queue[t]) + mq.queue[t] = nil +} diff --git a/gossip/bus_test.go b/gossip/bus_test.go new file mode 100644 index 000000000..d94a55697 --- /dev/null +++ b/gossip/bus_test.go @@ -0,0 +1,77 @@ +/* + Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License.
+*/ + +package gossip + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +// testSubscriber records the channel and id handed to Subscribe. +type testSubscriber struct { + ch <-chan *Message + id int +} + +func (ts *testSubscriber) Subscribe(id int, ch <-chan *Message) { + ts.ch = ch + ts.id = id +} + +// TestMessageBus checks that a published message reaches a subscriber +// of its MessageType. +func TestMessageBus(t *testing.T) { + var mb MessageBus + var ts testSubscriber + m1 := &Message{ + Kind: BatchMessageType, + From: nil, + TTL: 0, + Payload: nil, + } + mb.Subscribe(BatchMessageType, &ts, 1) + require.NoError(t, mb.Publish(m1), "Publish must find the subscriber") + m2 := <-ts.ch + require.Equal(t, m2, m1, "Messages should match") +} + +// testProducer records the channel handed to Produce. +type testProducer struct { + ch chan<- *Message +} + +func (tp *testProducer) Produce(ch chan<- *Message) { + tp.ch = ch +} + +// TestMessageQueue checks that a message sent by a producer is +// received by a consumer of the same MessageType queue. +func TestMessageQueue(t *testing.T) { + var mq MessageQueue + var ts testSubscriber + var tp testProducer + var m2 *Message + + m1 := &Message{ + Kind: BatchMessageType, + From: nil, + TTL: 0, + Payload: nil, + } + mq.Consumer(BatchMessageType, &ts) + mq.Producer(BatchMessageType, &tp) + + // the zero value MessageQueue has size 0, so the queue is unbuffered + // and the receive has to run in its own goroutine + done := make(chan struct{}) + go func() { + m2 = <-ts.ch + close(done) + }() + tp.ch <- m1 + // wait until m2 has been written before reading it + <-done + + require.Equal(t, m2, m1, "Messages should match") +} diff --git a/gossip/member/status.go b/gossip/cache.go similarity index 64% rename from gossip/member/status.go rename to gossip/cache.go index 33047fe75..9977f56d4 100644 --- a/gossip/member/status.go +++ b/gossip/cache.go @@ -13,30 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ -package member -// Status is the state of the Agent instance.
-type Status int32 +package gossip -const ( - Alive Status = iota - Leaving - Left - Shutdown - Failed -) - -func (s Status) String() string { - switch s { - case Alive: - return "alive" - case Leaving: - return "leaving" - case Left: - return "left" - case Shutdown: - return "shutdown" - default: - return "failed" - } +// Defines the methods required for a cache implementation +// to be used by the agent and its processors +type Cache interface { + Get(key []byte) (value []byte, err error) + Set(key []byte, value []byte, expireSeconds int) (err error) } diff --git a/gossip/config.go b/gossip/config.go index a0bec2432..df186132b 100644 --- a/gossip/config.go +++ b/gossip/config.go @@ -19,7 +19,6 @@ import ( "net" "time" - "github.com/bbva/qed/gossip/member" "github.com/hashicorp/memberlist" ) @@ -37,53 +36,61 @@ func DefaultConfig() *Config { LeavePropagateDelay: 0, TimeoutQueues: 200 * time.Millisecond, ProcessInterval: 1 * time.Second, + CacheSize: 1 << 20, + MaxSenders: 10, } } // Config is the configuration for creating an Agent instance type Config struct { + Log string `desc:"Set log level to info, error or debug"` + // The name of this node. This must be unique in the cluster. If this // is not set, Auditor will set it to the hostname of the running machine. - NodeName string + NodeName string `desc:"Set gossip name for this agent"` - Role member.Type + Role string `desc:"Set gossip role for this agent routing"` // BindAddr is the address that the Auditor agent's communication ports // will bind to. Auditor will use this address to bind to for both TCP // and UDP connections. If no port is present in the address, the default // port will be used. - BindAddr string + BindAddr string `desc:"Address ip:port to expose gossip protocol"` - // AdvertiseAddr is the address that the Auditor agent will advertise to + // AdvertiseAddr is the address agent will advertise to // other members of the cluster. 
Can be used for basic NAT traversal // where both the internal ip:port and external ip:port are known. - AdvertiseAddr string + AdvertiseAddr string `desc:"Address ip:port to advertise in gossip if our bind addr is not reachable from other agents"` + + // MetricsAddr is the address where the metrics server will expose its + // API so that metrics collectors can retrieve them + MetricsAddr string `desc:"Address ip:port to expose metrics"` - // LeaveOnTerm controls if the Auditor does a graceful leave when receiving + // LeaveOnTerm controls if the agent does a graceful leave when receiving // the TERM signal. Defaults false. This can be changed on reload. - LeaveOnTerm bool + LeaveOnTerm bool `desc:"Controls if the agent does a graceful leave when receiving the TERM signal"` // StartJoin is a list of addresses to attempt to join when the // agent starts. If the agent is unable to communicate with any of these // addresses, then the agent will error and exit. - StartJoin []string + StartJoin []string `desc:"Address list ip1:port1,ip2:port2... to join other agents and form a gossip network"` // EnableCompression specifies whether message compression is enabled // by `github.com/hashicorp/memberlist` when broadcasting events. - EnableCompression bool + EnableCompression bool `desc:"Specifies whether message compression is enabled when broadcasting events"` // BroadcastTimeout is the amount of time to wait for a broadcast // message to be sent to the cluster. Broadcast messages are used for // things like leave messages and force remove messages. If this is not // set, a timeout of 5 seconds will be set. - BroadcastTimeout time.Duration + BroadcastTimeout time.Duration `desc:"The amount of time to wait for a broadcast message to be sent to the cluster"` // LeavePropagateDelay is for our leave (node dead) message to propagate // through the cluster.
In particular, we want to stay up long enough to // service any probes from other nodes before they learn about us // leaving and stop probing. Otherwise, we risk getting node failures as // we leave. - LeavePropagateDelay time.Duration + LeavePropagateDelay time.Duration `desc:"Time for our leave (node dead) message to propagate through the cluster"` // MemberlistConfig is the memberlist configuration that Agent will // use to do the underlying membership management and gossip. Some @@ -97,19 +104,20 @@ type Config struct { // // * Delegate - Auditor uses a custom delegate. // - MemberlistConfig *memberlist.Config - - // Comma-delimited list of Alert servers ([host]:port), through which an agent can post alerts - AlertsUrls []string + MemberlistConfig *memberlist.Config `flag:"-"` // Timeout enqueuing elements on a channel - TimeoutQueues time.Duration + TimeoutQueues time.Duration `desc:"Timeout enqueuing elements on a channel"` // Interval to send out messages to other agents - ProcessInterval time.Duration - - // Address to bind the metrics endpoint - MetricsAddr string + ProcessInterval time.Duration `desc:"Interval to send out messages to other agents"` + + // Maximum number of concurrent senders + MaxSenders int `desc:"Maximum number of concurrent senders"` + + // Cache size in bytes to store agent temporal objects. 
+ // This cache will evict old objects by default + CacheSize int `desc:"Cache size in bytes to store agent temporal objects"` } // AddrParts returns the parts of the BindAddr that should be diff --git a/gossip/delegate.go b/gossip/delegate.go index c63c44a65..bdfcbcc10 100644 --- a/gossip/delegate.go +++ b/gossip/delegate.go @@ -16,12 +16,7 @@ package gossip import ( - "encoding/json" - - "github.com/bbva/qed/gossip/member" - "github.com/bbva/qed/hashing" "github.com/bbva/qed/log" - "github.com/bbva/qed/protocol" "github.com/hashicorp/memberlist" ) @@ -35,16 +30,16 @@ type eventDelegate struct { // NotifyJoin is invoked when a node is detected to have joined. func (e *eventDelegate) NotifyJoin(n *memberlist.Node) { - peer := member.ParsePeer(n) - peer.Status = member.Alive - e.agent.Topology.Update(peer) + peer := ParsePeer(n) + peer.Status = AgentStatusAlive + e.agent.topology.Update(peer) log.Debugf("member joined: %+v ", peer) } // NotifyLeave is invoked when a node is detected to have left. func (e *eventDelegate) NotifyLeave(n *memberlist.Node) { - peer := member.ParsePeer(n) - e.agent.Topology.Delete(peer) + peer := ParsePeer(n) + e.agent.topology.Delete(peer) log.Debugf("member left: %+v", peer) } @@ -52,8 +47,8 @@ func (e *eventDelegate) NotifyLeave(n *memberlist.Node) { // updated, usually involving the meta data. func (e *eventDelegate) NotifyUpdate(n *memberlist.Node) { // ignore - peer := member.ParsePeer(n) - e.agent.Topology.Update(peer) + peer := ParsePeer(n) + e.agent.topology.Update(peer) log.Debugf("member updated: %+v ", peer) } @@ -83,25 +78,12 @@ func (d *agentDelegate) NodeMeta(limit int) []byte { // so would block the entire UDP packet receive loop. 
Additionally, the byte // slice may be modified after the call returns, so it should be copied if needed func (d *agentDelegate) NotifyMsg(msg []byte) { - var batch protocol.BatchSnapshots - - var tmp map[string]*json.RawMessage - err := json.Unmarshal(msg, &tmp) - if err != nil { - log.Errorf("Unable to decode message: %v", err) - return - } - - err = batch.Decode(msg) + m := &Message{} + err := m.Decode(msg) if err != nil { - log.Errorf("Unable to decode message: %v", err) - return + log.Infof("Agent Deletage unable to decode gossip message!: %v", err) + // a message that failed to decode must not reach the subscribers + return } - - // hashs the snaapshots to deduplicate processing inside the agent - hash := hashing.NewSha256Hasher().Do(*tmp["Snapshots"]) - log.Debugf("Notifying batch %v\n", hash) - d.agent.In <- &hashedBatch{&batch, hash} + d.agent.In.Publish(m) } // GetBroadcasts is called when user data messages can be broadcast. diff --git a/gossip/errors.go b/gossip/errors.go new file mode 100644 index 000000000..8540e6a13 --- /dev/null +++ b/gossip/errors.go @@ -0,0 +1,21 @@ +/* + Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License.
+*/ +package gossip + +import "errors" + +var ChTimedOut error = errors.New("Timeout sending data to channel") +var NoSubscribersFound error = errors.New("No subscribers found") diff --git a/gossip/member/member.go b/gossip/member/member.go deleted file mode 100644 index 492adfbd2..000000000 --- a/gossip/member/member.go +++ /dev/null @@ -1,105 +0,0 @@ -/* - Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ -package member - -import ( - "net" - - "github.com/bbva/qed/log" - "github.com/hashicorp/memberlist" -) - -type Type int - -func (t Type) String() string { - switch t { - case Auditor: - return "auditor" - case Monitor: - return "monitor" - case Publisher: - return "publisher" - case Server: - return "server" - default: - return "unknown" - } -} - -func ParseType(value string) Type { - switch value { - case "auditor": - return Auditor - case "monitor": - return Monitor - case "publisher": - return Publisher - default: - return Server - } -} - -const ( - Auditor Type = iota - Monitor - Publisher - Server - Unknown -) - -// Member is a single member of the gossip cluster. 
-type Peer struct { - Name string - Addr net.IP - Port uint16 - Meta Meta - Status Status -} - -func (p Peer) Node() *memberlist.Node { - return &memberlist.Node{ - Name: p.Name, - Addr: p.Addr, - Port: p.Port, - } -} - -func NewPeer(name, addr string, port uint16, role Type) *Peer { - meta := Meta{ - Role: role, - } - - return &Peer{ - Name: name, - Addr: net.ParseIP(addr), - Port: port, - Meta: meta, - } -} - -func ParsePeer(node *memberlist.Node) *Peer { - var meta Meta - err := meta.Decode(node.Meta) - if err != nil { - log.Errorf("Error parsing peer: unable to decode meta. %v", err) - } - return &Peer{ - Name: node.Name, - Addr: node.Addr, - Port: node.Port, - Meta: meta, - } -} diff --git a/gossip/messages.go b/gossip/messages.go new file mode 100644 index 000000000..fd018298a --- /dev/null +++ b/gossip/messages.go @@ -0,0 +1,66 @@ +/* + Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package gossip + +import ( + "bytes" + + "github.com/hashicorp/go-msgpack/codec" +) + +// msgpackHandle is a shared handle for encoding/decoding of structs +var msgpackHandle = &codec.MsgpackHandle{} + +const ( + MAXMESSAGEID = 1 << 8 +) + +type MessageType uint8 + +const ( + BatchMessageType MessageType = iota // Contains a protocol.BatchSnapshots +) + +// Gossip message code. Up to 255 different messages. 
+type Message struct { + Kind MessageType + From *Peer + TTL int + Payload []byte +} + +/* +func (m *Message) Encode() ([]byte, error) { + return json.Marshal(m) +} + +func (m *Message) Decode(msg []byte) error { + err := json.Unmarshal(msg, m) + return err +} +*/ + +// Encode serializes the message with the shared msgpack handle and +// returns the encoded bytes together with the encoder error, if any. +func (m *Message) Encode() ([]byte, error) { + var buf bytes.Buffer + err := codec.NewEncoder(&buf, msgpackHandle).Encode(m) + return buf.Bytes(), err +} + +// Decode fills the message from buf using the shared msgpack handle +// and returns the decoder error, if any. +func (m *Message) Decode(buf []byte) error { + return codec.NewDecoder(bytes.NewReader(buf), msgpackHandle).Decode(m) +} + diff --git a/cmd/context.go b/gossip/messages_test.go similarity index 52% rename from cmd/context.go rename to gossip/messages_test.go index 67f34f2a5..cb4d800f3 100644 --- a/cmd/context.go +++ b/gossip/messages_test.go @@ -14,36 +14,26 @@ limitations under the License. */ -package cmd +package gossip import ( - "github.com/bbva/qed/client" - "github.com/bbva/qed/gossip" - "github.com/bbva/qed/log" -) - -type cmdContext struct { - apiKey, logLevel, configFile, path string - disableConfig, profiling bool -} - -type clientContext struct { - config *client.Config - client *client.HTTPClient -} + "testing" -type agentContext struct { - config *gossip.Config -} + "github.com/stretchr/testify/require" +) -func markStringRequired(value, name string) { - if value == "" { - log.Fatalf("Argument `%s` is required", name) +// TestMessageEncodeDecode checks that a message survives an +// Encode / Decode round trip. +func TestMessageEncodeDecode(t *testing.T) { + var m2 Message + m1 := &Message{ + Kind: BatchMessageType, + From: nil, + TTL: 0, + Payload: nil, } -} -func markSliceStringRequired(value []string, name string) { - if len(value) == 0 { - log.Fatalf("Argument `%s` is required", name) - } + buff, err := m1.Encode() + require.NoError(t, err, "Encoding must end succesfully") + err = m2.Decode(buff) + require.NoError(t, err, "Decoding must end successfully") + require.Equal(t, &m2, m1, "Messages must be equal") + } diff --git a/gossip/member/meta.go b/gossip/meta.go similarity index 96% rename from gossip/member/meta.go rename to gossip/meta.go index 76df4bc9b..6b8cbfe36 100644 ---
a/gossip/member/meta.go +++ b/gossip/meta.go @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package member +package gossip import ( "bytes" @@ -22,8 +22,9 @@ import ( "github.com/hashicorp/go-msgpack/codec" ) +// Agent metadata type Meta struct { - Role Type + Role string } func (a *Meta) Encode() ([]byte, error) { diff --git a/gossip/meta_test.go b/gossip/meta_test.go new file mode 100644 index 000000000..de875d6e2 --- /dev/null +++ b/gossip/meta_test.go @@ -0,0 +1,35 @@ +/* + Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package gossip + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestEncodeDecode(t *testing.T) { + var m1, m2 Meta + + m1.Role = "string test" + + buff, err := m1.Encode() + require.NoError(t, err, "Error encoding metadata") + err = m2.Decode(buff) + require.NoError(t, err, "Error decoding metadata") + require.Equal(t, m1, m2, "Both metadata must be equals") +} diff --git a/gossip/monitor/monitor.go b/gossip/monitor/monitor.go deleted file mode 100644 index 185731ff5..000000000 --- a/gossip/monitor/monitor.go +++ /dev/null @@ -1,238 +0,0 @@ -/* - Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -package monitor - -import ( - "bytes" - "crypto/tls" - "fmt" - "io" - "io/ioutil" - "net/http" - "time" - - "github.com/bbva/qed/client" - "github.com/bbva/qed/hashing" - "github.com/bbva/qed/log" - "github.com/bbva/qed/metrics" - "github.com/bbva/qed/protocol" - "github.com/pkg/errors" - - "github.com/prometheus/client_golang/prometheus" -) - -var ( - QedMonitorInstancesCount = prometheus.NewGauge( - prometheus.GaugeOpts{ - Name: "qed_monitor_instances_count", - Help: "Number of monitor agents running.", - }, - ) - - QedMonitorBatchesReceivedTotal = prometheus.NewCounter( - prometheus.CounterOpts{ - Name: "qed_monitor_batches_received_total", - Help: "Number of batches received by monitors.", - }, - ) - - QedMonitorBatchesProcessSeconds = prometheus.NewSummary( - prometheus.SummaryOpts{ - Name: "qed_monitor_batches_process_seconds", - Help: "Duration of Monitor batch processing", - }, - ) - - QedMonitorGetIncrementalProofErrTotal = prometheus.NewCounter( - prometheus.CounterOpts{ - Name: "qed_monitor_get_incremental_proof_err_total", - Help: "Number of errors trying to get incremental proofs by monitors.", - }, - ) -) - -type Config struct { - QEDUrls []string - AlertsUrls []string - APIKey string - TaskExecutionInterval time.Duration - MaxInFlightTasks int - MetricsAddr string -} - -func DefaultConfig() *Config { - return &Config{ - TaskExecutionInterval: 200 * time.Millisecond, - MaxInFlightTasks: 10, - } -} - -type Monitor struct { - client *client.HTTPClient - conf *Config - taskCh chan Task - quitCh chan bool - executionTicker *time.Ticker -} - -type 
Task interface { - Do() -} - -func NewMonitor(conf *Config) (*Monitor, error) { - QedMonitorInstancesCount.Inc() - // QED client - transport := http.DefaultTransport.(*http.Transport) - transport.TLSClientConfig = &tls.Config{InsecureSkipVerify: false} - httpClient := http.DefaultClient - httpClient.Transport = transport - qed, err := client.NewHTTPClient( - client.SetHttpClient(httpClient), - client.SetURLs(conf.QEDUrls[0], conf.QEDUrls[1:]...), - client.SetAPIKey(conf.APIKey), - client.SetReadPreference(client.Any), - client.SetAttemptToReviveEndpoints(true), - ) - if err != nil { - return nil, errors.Wrap(err, "Cannot start http client: ") - } - monitor := Monitor{ - client: qed, - conf: conf, - taskCh: make(chan Task, 100), - quitCh: make(chan bool), - } - - monitor.executionTicker = time.NewTicker(conf.TaskExecutionInterval) - go monitor.runTaskDispatcher() - - return &monitor, nil -} - -func (m Monitor) RegisterMetrics(srv *metrics.Server) { - metrics := []prometheus.Collector{ - QedMonitorInstancesCount, - QedMonitorBatchesReceivedTotal, - QedMonitorBatchesProcessSeconds, - QedMonitorGetIncrementalProofErrTotal, - } - srv.MustRegister(metrics...) 
-} - -func (m Monitor) Process(b *protocol.BatchSnapshots) { - QedMonitorBatchesReceivedTotal.Inc() - timer := prometheus.NewTimer(QedMonitorBatchesProcessSeconds) - defer timer.ObserveDuration() - - first := b.Snapshots[0].Snapshot - last := b.Snapshots[len(b.Snapshots)-1].Snapshot - - log.Debugf("Monitor processing batch from versions %d to %d", first.Version, last.Version) - - task := QueryTask{ - client: m.client, - alertsUrl: m.conf.AlertsUrls[0], - Start: first.Version, - End: last.Version, - StartSnapshot: *first, - EndSnapshot: *last, - } - - m.taskCh <- task -} - -func (m Monitor) runTaskDispatcher() { - for { - select { - case <-m.executionTicker.C: - go m.dispatchTasks() - case <-m.quitCh: - m.executionTicker.Stop() - return - } - } -} - -func (m *Monitor) Shutdown() { - QedMonitorInstancesCount.Dec() - - m.executionTicker.Stop() - m.quitCh <- true - close(m.quitCh) - close(m.taskCh) - log.Debugf("Monitor stopped.") -} - -func (m Monitor) dispatchTasks() { - count := 0 - var task Task - var ok bool - - for { - select { - case task, ok = <-m.taskCh: - if !ok { - return - } - go task.Do() - count++ - default: - return - } - if count >= m.conf.MaxInFlightTasks { - return - } - } -} - -type QueryTask struct { - client *client.HTTPClient - alertsUrl string - taskCh chan Task - Start, End uint64 - StartSnapshot, EndSnapshot protocol.Snapshot -} - -func (q QueryTask) sendAlert(msg string) { - resp, err := http.Post(q.alertsUrl+"/alert", "application/json", bytes.NewBufferString(msg)) - if err != nil { - log.Infof("Monitor had an error saving batch in alertStore (task re-enqueued): %v", err) - q.taskCh <- q - return - } - defer resp.Body.Close() - _, err = io.Copy(ioutil.Discard, resp.Body) - if err != nil { - log.Infof("Monitor had an error from alertStore saving a batch: %v", err) - } -} - -func (q QueryTask) Do() { - log.Debugf("Executing task: %+v", q) - resp, err := q.client.Incremental(q.Start, q.End) - if err != nil { - 
QedMonitorGetIncrementalProofErrTotal.Inc() - log.Infof("Monitor is unable to get incremental proof from QED server: %s", err.Error()) - return - } - ok := q.client.VerifyIncremental(resp, &q.StartSnapshot, &q.EndSnapshot, hashing.NewSha256Hasher()) - if !ok { - q.sendAlert(fmt.Sprintf("Monitor is unable to verify incremental proof from %d to %d", q.StartSnapshot.Version, q.EndSnapshot.Version)) - log.Infof("Monitor is unable to verify incremental proof from %d to %d", q.StartSnapshot.Version, q.EndSnapshot.Version) - } - log.Debugf("Monitor verified a consistency proof between versions %d and %d: %v\n", q.Start, q.End, ok) -} diff --git a/gossip/notifier.go b/gossip/notifier.go new file mode 100644 index 000000000..d22e8416a --- /dev/null +++ b/gossip/notifier.go @@ -0,0 +1,146 @@ +/* + Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +package gossip + +import ( + "bytes" + "io" + "io/ioutil" + "math/rand" + "net" + "net/http" + "time" + + "github.com/bbva/qed/log" +) + +// Notifies string messages to external services. +// The process of sending the notifications is +// asynchronous, so a start and stop method is +// needed to activate/desactivate the process. 
+type Notifier interface { + Alert(msg string) error + Start() + Stop() +} + +//SimpleNotifier configuration object used to parse +//cli options and to build the SimpleNotifier instance +type SimpleNotifierConfig struct { + Endpoint []string `desc:"Notification service endpoint list http://ip1:port1/path1,http://ip2:port2/path2... "` + QueueSize int `desc:"Notifications queue size"` + DialTimeout time.Duration `desc:"Timeout dialing the notification service"` + ReadTimeout time.Duration `desc:"Timeout reading the notification service response"` +} + +// Returns the default configuration for the SimpleNotifier +func DefaultSimpleNotifierConfig() *SimpleNotifierConfig { + return &SimpleNotifierConfig{ + QueueSize: 10, + DialTimeout: 200 * time.Millisecond, + ReadTimeout: 200 * time.Millisecond, + } +} + +// Returns a SimpleNotifier pointer configured with configuration c. +func NewSimpleNotifierFromConfig(c *SimpleNotifierConfig) *SimpleNotifier { + return NewSimpleNotifier(c.Endpoint, c.QueueSize, c.DialTimeout, c.ReadTimeout) +} + +// Implements the default notification service +// client using an HTTP API: +// +// This notifier posts the msg contents to +// the specified endpoint. +type SimpleNotifier struct { + client *http.Client + endpoint []string + notifications chan string + quitCh chan bool +} + +// Returns a new default notififier client configured +// to post messages to the endpoint provided. 
+// To use the default timeouts of 200ms set them to 0: +// queueTimeout is the time to wait for the queue to accept a new message +// dialTimeout is the time to wait for dial to the notifications server +// readTimeout is the time to wait for the notifications server response +func NewSimpleNotifier(endpoint []string, size int, dialTimeout, readTimeout time.Duration) *SimpleNotifier { + d := SimpleNotifier{ + notifications: make(chan string, size), + quitCh: make(chan bool), + endpoint: endpoint, + } + + d.client = &http.Client{ + Transport: &http.Transport{ + Dial: func(netw, addr string) (net.Conn, error) { + // timeout calling the server + conn, err := net.DialTimeout(netw, addr, dialTimeout) + if err != nil { + return nil, err + } + // timeout reading from the connection + conn.SetDeadline(time.Now().Add(readTimeout)) + return conn, nil + }, + }} + + return &d +} + +// Alert enqueue a message into the notifications +// queue to be sent. It will block if the notifications +// queue is full. +func (n *SimpleNotifier) Alert(msg string) error { + n.notifications <- msg + return nil +} + +// Starts a process which send notifications +// to a random url selected from the configuration list of urls. 
+func (n *SimpleNotifier) Start() { + go func() { + for { + select { + case msg := <-n.notifications: + i := len(n.endpoint) + url := n.endpoint[0] + if i > 1 { + url = n.endpoint[rand.Intn(i)] + } + + resp, err := n.client.Post(url, "application/json", bytes.NewBufferString(msg)) + if err != nil { + log.Infof("Agent had an error sending the alert %v because %v ", msg, err) + continue + } + defer resp.Body.Close() + _, err = io.Copy(ioutil.Discard, resp.Body) + if err != nil { + log.Infof("Agent had the error %v when reading the response from the alert %v ", err, msg) + } + case <-n.quitCh: + return + } + } + }() +} + +// Makes the notifications process to end +func (n *SimpleNotifier) Stop() { + close(n.quitCh) +} diff --git a/gossip/notifier_test.go b/gossip/notifier_test.go new file mode 100644 index 000000000..3976af062 --- /dev/null +++ b/gossip/notifier_test.go @@ -0,0 +1,52 @@ +/* + Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package gossip + +import ( + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestDefaultAlert(t *testing.T) { + var called bool + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != "POST" { + w.Header().Set("Allow", "POST") + w.WriteHeader(http.StatusMethodNotAllowed) + return + } + called = true + w.WriteHeader(http.StatusNoContent) + })) + + defer server.Close() + conf := DefaultSimpleNotifierConfig() + conf.Endpoint = append(conf.Endpoint, server.URL) + notificator := NewSimpleNotifierFromConfig(conf) + + notificator.Start() + defer notificator.Stop() + + notificator.Alert("test alert") + time.Sleep(1 * time.Second) + + require.True(t, called, "Server must be called from alerter") +} diff --git a/gossip/options.go b/gossip/options.go new file mode 100644 index 000000000..63a5574c6 --- /dev/null +++ b/gossip/options.go @@ -0,0 +1,185 @@ +/* + Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +package gossip + +import ( + "time" + + "github.com/bbva/qed/client" + "github.com/bbva/qed/metrics" + "github.com/coocood/freecache" +) + +type AgentOptionF func(*Agent) error + +func configToOptions(conf *Config) ([]AgentOptionF, error) { + var options []AgentOptionF + + if conf == nil { + return nil, nil + } + + options = []AgentOptionF{ + SetNodeName(conf.NodeName), + SetRole(conf.Role), + SetBindAddr(conf.BindAddr), + SetAdvertiseAddr(conf.AdvertiseAddr), + SetLeaveOnTerm(conf.LeaveOnTerm), + SetStartJoin(conf.StartJoin), + SetEnableCompression(conf.EnableCompression), + SetBroadcastTimeout(conf.BroadcastTimeout), + SetLeavePropagateDelay(conf.LeavePropagateDelay), + SetTimeoutQueues(conf.TimeoutQueues), + SetProcessInterval(conf.ProcessInterval), + SetMetricsServer(conf.MetricsAddr), + SetCache(conf.CacheSize), + SetTimeoutQueues(conf.TimeoutQueues), + } + + return options, nil +} + +func SetNodeName(name string) AgentOptionF { + return func(a *Agent) error { + a.config.NodeName = name + return nil + } +} + +func SetRole(role string) AgentOptionF { + return func(a *Agent) error { + a.config.Role = role + return nil + } +} + +func SetBindAddr(addr string) AgentOptionF { + return func(a *Agent) error { + a.config.BindAddr = addr + return nil + } +} + +func SetAdvertiseAddr(addr string) AgentOptionF { + return func(a *Agent) error { + a.config.AdvertiseAddr = addr + return nil + } +} + +func SetLeaveOnTerm(leave bool) AgentOptionF { + return func(a *Agent) error { + a.config.LeaveOnTerm = leave + return nil + } +} + +func SetStartJoin(addrs []string) AgentOptionF { + return func(a *Agent) error { + a.config.StartJoin = addrs + return nil + } +} + +func SetEnableCompression(enabled bool) AgentOptionF { + return func(a *Agent) error { + a.config.EnableCompression = enabled + return nil + } +} + +func SetBroadcastTimeout(timeout time.Duration) AgentOptionF { + return func(a *Agent) error { + a.config.BroadcastTimeout = timeout + return nil + } +} + +func 
SetLeavePropagateDelay(delay time.Duration) AgentOptionF { + return func(a *Agent) error { + a.config.LeavePropagateDelay = delay + return nil + } +} + +func SetTimeoutQueues(timeout time.Duration) AgentOptionF { + return func(a *Agent) error { + a.timeout = time.NewTicker(timeout) + return nil + } +} + +func SetProcessInterval(interval time.Duration) AgentOptionF { + return func(a *Agent) error { + return nil + } +} + +func SetMetricsServer(addr string) AgentOptionF { + return func(a *Agent) error { + if addr != "" { + a.metrics = metrics.NewServer(addr) + } + return nil + } +} + +func SetTasksManager(tm TasksManager) AgentOptionF { + return func(a *Agent) error { + a.Tasks = tm + return nil + } +} + +func SetQEDClient(qed *client.HTTPClient) AgentOptionF { + return func(a *Agent) error { + a.Qed = qed + return nil + } +} + +func SetSnapshotStore(store SnapshotStore) AgentOptionF { + return func(a *Agent) error { + a.SnapshotStore = store + return nil + } +} + +func SetNotifier(n Notifier) AgentOptionF { + return func(a *Agent) error { + a.Notifier = n + return nil + } +} + +// export GOGC variable to make GC to collect memory +// adecuately if the cache is too big +func SetCache(size int) AgentOptionF { + return func(a *Agent) error { + a.Cache = freecache.NewCache(int(size)) + return nil + } +} + +func SetProcessors(p map[string]Processor) AgentOptionF { + return func(a *Agent) error { + for _, p := range p { + a.RegisterMetrics(p.Metrics()) + } + a.processors = p + return nil + } +} diff --git a/gossip/peer.go b/gossip/peer.go new file mode 100644 index 000000000..6e39e653d --- /dev/null +++ b/gossip/peer.go @@ -0,0 +1,200 @@ +/* + Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +package gossip + +import ( + "math/rand" + "net" + + "github.com/bbva/qed/log" + "github.com/hashicorp/memberlist" +) + +// Status is the state of the Agent instance. +type Status int32 + +const ( + AgentStatusAlive Status = iota + AgentStatusLeaving + AgentStatusLeft + AgentStatusShutdown + AgentStatusFailed +) + +func (s Status) String() string { + switch s { + case AgentStatusAlive: + return "alive" + case AgentStatusLeaving: + return "leaving" + case AgentStatusLeft: + return "left" + case AgentStatusShutdown: + return "shutdown" + default: + return "failed" + } +} + +// Member is a single member of the gossip cluster. +type Peer struct { + Name string + Addr net.IP + Port uint16 + Meta Meta + Status Status +} + +// Returns a memberlist node from a peer +// datra +func (p Peer) Node() *memberlist.Node { + return &memberlist.Node{ + Name: p.Name, + Addr: p.Addr, + Port: p.Port, + } +} + +//Returns a new peer from the parameters configuration +func NewPeer(name, addr string, port uint16, role string) *Peer { + meta := Meta{ + Role: role, + } + + return &Peer{ + Name: name, + Addr: net.ParseIP(addr), + Port: port, + Meta: meta, + } +} + +// Builds a new peer from the memberlist.Node data +func ParsePeer(node *memberlist.Node) *Peer { + var meta Meta + err := meta.Decode(node.Meta) + if err != nil { + log.Errorf("Error parsing peer: unable to decode meta. 
%v", err) + } + return &Peer{ + Name: node.Name, + Addr: node.Addr, + Port: node.Port, + Meta: meta, + } +} + +// Implements a list of peers +// which is able to filter, merge and +// take elements from the head +type PeerList struct { + L []*Peer +} + +func NewPeerList() *PeerList { + return &PeerList{ + L: make([]*Peer, 0), + } +} + +// A filter function returns if a peer must +// me selected or not +type Filter func(m *Peer) bool + +// Returns a filtered peer list, containg only +// the peers the filter selected +func (l *PeerList) Filter(f Filter) *PeerList { + var b PeerList + b.L = make([]*Peer, 0) + for _, x := range l.L { + if f(x) { + b.L = append(b.L, x) + } + } + + return &b +} + +// Appends a peer list to the current list +func (l *PeerList) Append(m *PeerList) { + if m == nil { + return + } + l.L = append(l.L, m.L...) +} + +// Returnsa new list with n peers included +// starting in the head of the list. +func (l *PeerList) Take(n int) *PeerList { + if n > len(l.L) { + return nil + } + + return &PeerList{ + L: l.L[:n], + } +} + +// Returns a list with all the peers from the exclusion +// list removed +func (l *PeerList) Exclude(ex *PeerList) *PeerList { + if ex == nil { + return l + } + return l.Filter(func(p *Peer) bool { + for _, x := range ex.L { + if x.Name == p.Name { + return false + } + } + return true + }) +} + +// Returns the list randomly shuffled +func (l *PeerList) Shuffle() *PeerList { + rand.Shuffle(len(l.L), func(i, j int) { + l.L[i], l.L[j] = l.L[j], l.L[i] + }) + return l +} + +// Updates a peer data by its name +func (l *PeerList) Update(m *Peer) { + for i, e := range l.L { + if e.Name == m.Name { + l.L[i] = m + return + } + } + l.L = append(l.L, m) +} + +// Deletes a peer from the list by its name +func (l *PeerList) Delete(m *Peer) { + for i, e := range l.L { + if e.Name == m.Name { + copy(l.L[i:], l.L[i+1:]) + l.L[len(l.L)-1] = nil + l.L = l.L[:len(l.L)-1] + return + } + } +} + +func (l PeerList) Size() int { + return len(l.L) +} 
diff --git a/gossip/peer_test.go b/gossip/peer_test.go new file mode 100644 index 000000000..06a15fb73 --- /dev/null +++ b/gossip/peer_test.go @@ -0,0 +1,150 @@ +/* + Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +package gossip + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/require" +) + +var roles = []string{ + "auditor", + "monitor", + "publisher", +} + +func setupPeerList(size int) *PeerList { + peers := make([]*Peer, 0) + for i := 0; i < size; i++ { + name := fmt.Sprintf("name%d", i) + port := uint16(9000 + i) + role := roles[i%len(roles)] + peer := NewPeer(name, "127.0.0.1", port, role) + peers = append(peers, peer) + } + return &PeerList{peers} +} + +func TestFilterPeerList(t *testing.T) { + list := setupPeerList(10) + + filtered := list.Filter(func(m *Peer) bool { + return m.Meta.Role == "auditor" + }) + + require.Truef(t, list.Size() > filtered.Size(), "The filtered list should have less elements") + for _, e := range filtered.L { + require.Truef(t, "auditor" == e.Meta.Role, "The role cannot be different to Auditor") + } +} + +func TestExcludePeerList(t *testing.T) { + list := setupPeerList(10) + + // exclude auditors + filtered := list.Filter(func(m *Peer) bool { + return m.Meta.Role == "auditor" + }) + included := list.Exclude(filtered) + + require.Truef(t, list.Size() > included.Size(), "The included list should have less elements") + for _, e := range included.L { + require.Truef(t, 
"auditor" != e.Meta.Role, "The role cannot be Auditor") + } +} + +func TestExcludeNonIncludedPeerList(t *testing.T) { + list := setupPeerList(10) + + // exclude unknown + var uknown PeerList + uknown.L = append(uknown.L, NewPeer("uknown", "127.0.0.1", 10000, "unknown")) + included := list.Exclude(&uknown) + + require.Truef(t, list.Size() == included.Size(), "The included list should have the same size") + for _, e := range included.L { + require.Truef(t, "unknown" != e.Meta.Role, "The role cannot be Unknown") + } +} + +func TestSizePeerList(t *testing.T) { + list := setupPeerList(10) + + require.Equalf(t, 10, list.Size(), "The size should match") +} + +func TestUpdatePeerList(t *testing.T) { + var list PeerList + + list.Update(NewPeer("auditor1", "127.0.0.1", 9001, "auditor")) + require.Equalf(t, 1, list.Size(), "The size should have been incremented by 1") + + list.Update(NewPeer("auditor2", "127.0.0.1", 9002, "auditor")) + require.Equalf(t, 2, list.Size(), "The size should have been incremented by 2") + + // update the previous one + list.Update(NewPeer("auditor2", "127.0.0.1", 9002, "auditor")) + require.Equalf(t, 2, list.Size(), "The size should have been incremented by 2") + + // update the previous one changing status + p := NewPeer("auditor2", "127.0.0.1", 9002, "auditor") + p.Status = AgentStatusLeaving + list.Update(p) + require.Equalf(t, 2, list.Size(), "The size should have been incremented by 2") + require.Equalf(t, AgentStatusLeaving, list.L[1].Status, "The status should have been updated") +} + +func TestDeletePeerList(t *testing.T) { + list := setupPeerList(10) + list2 := setupPeerList(10) + + // filter auditor types + auditors := list.Filter(func(m *Peer) bool { + return m.Meta.Role == "auditor" + }) + + // delete auditors + for _, e := range auditors.L { + list2.Delete(e) + } + + require.Truef(t, 10 > list2.Size(), "The new list should have less elements") + for _, e := range list2.L { + require.Truef(t, "auditor" != e.Meta.Role, "The role cannot 
be Auditor") + } +} + +func TestDeleteNotIncludedPeerList(t *testing.T) { + list := setupPeerList(10) + + list.Delete(NewPeer("unknown", "127.0.0.1", 10000, "unknown")) + + require.Truef(t, 10 == list.Size(), "The new list should have the same size") + +} + +func TestShufflePeerList(t *testing.T) { + list := setupPeerList(10) + + shuffled := list.Shuffle() + + require.Truef(t, 10 == shuffled.Size(), "The new list should have the same size") + for _, e := range list.L { + require.Containsf(t, shuffled.L, e, "The element should remain in the list") + } +} diff --git a/gossip/processor.go b/gossip/processor.go index 2be11922f..bc4774cf1 100644 --- a/gossip/processor.go +++ b/gossip/processor.go @@ -16,43 +16,133 @@ package gossip import ( - "fmt" - "io" - "io/ioutil" - "net/http" + "bytes" + "context" + "github.com/bbva/qed/hashing" "github.com/bbva/qed/log" - "github.com/bbva/qed/metrics" "github.com/bbva/qed/protocol" + "github.com/hashicorp/go-msgpack/codec" + "github.com/prometheus/client_golang/prometheus" ) +// A processor mission is to translate from +// and to the gossip network []byte type to +// whatever has semantic sense. +// +// Also it should enqueue tasks in the agent task +// manager. type Processor interface { - Process(*protocol.BatchSnapshots) - RegisterMetrics(*metrics.Server) + Start() + Stop() + Metrics() []prometheus.Collector } -type FakeProcessor struct { +// Reads agents in queue, and generates a +// *protocol.BatchSnapshots queue. +// It also calls the tasks factories and enqueue +// the generated tasks in the agent task manager. 
+type BatchProcessor struct { + mh *codec.MsgpackHandle + a *Agent + tf []TaskFactory + metrics []prometheus.Collector + quitCh chan bool + ctx context.Context + id int } -func (d FakeProcessor) Process(b *protocol.BatchSnapshots) {} -func (d FakeProcessor) RegisterMetrics(m *metrics.Server) {} +func NewBatchProcessor(a *Agent, tf []TaskFactory) *BatchProcessor { + b := &BatchProcessor{ + mh: &codec.MsgpackHandle{}, + a: a, + tf: tf, + quitCh: make(chan bool), + ctx: context.WithValue(context.Background(), "agent", a), + } + + // register all tasks metrics + for _, t := range tf { + b.metrics = append(b.metrics, t.Metrics()...) + } -type DummyProcessor struct { + return b } -func (d DummyProcessor) RegisterMetrics(m *metrics.Server) {} +func (d *BatchProcessor) Stop() { + close(d.quitCh) +} -func (d DummyProcessor) Process(b *protocol.BatchSnapshots) { - for i := 0; i < len(b.Snapshots); i++ { - res, err := http.Get(fmt.Sprintf("http://127.0.0.1:8888/stat/?nodeType=auditor&id=%d", b.Snapshots[0].Snapshot.Version)) - if err != nil || res == nil { - log.Debugf("Error contacting service with error %v", err) - } - // to reuse connections we need to do this - _, _ = io.Copy(ioutil.Discard, res.Body) - res.Body.Close() +func (d *BatchProcessor) Metrics() []prometheus.Collector { + return d.metrics +} + +// This function requires the cache of the agent to be defined, and will return +// false if the cache is not present in the agent +func (d *BatchProcessor) wasProcessed(b *protocol.BatchSnapshots) bool { + if d.a.Cache == nil { + return false + } - // time.Sleep(1 * time.Second) + var buf bytes.Buffer + err := codec.NewEncoder(&buf, d.mh).Encode(b.Snapshots) + if err != nil { + log.Infof("Error encoding batchsnapshots to calculate its digest. 
Dropping batch.") + return false } - log.Infof("Processed %v elements of batch id %v\n", len(b.Snapshots), b.Snapshots[0].Snapshot.Version) + bb := buf.Bytes() + digest := hashing.NewSha256Hasher().Do(bb) + // batch already processed, discard it + _, err = d.a.Cache.Get(digest) + if err == nil { + return true + } + d.a.Cache.Set(digest, []byte{0x1}, 0) + return false +} + +func (d *BatchProcessor) Subscribe(id int, ch <-chan *Message) { + d.id = id + + if d.a.metrics != nil { + d.a.metrics.MustRegister(d.metrics...) + } + + go func() { + for { + select { + case msg := <-ch: + // if the message is not a batch, ignore it + if msg.Kind != BatchMessageType { + log.Debugf("BatchProcessor got an unknown message from agent") + continue + } + + batch := new(protocol.BatchSnapshots) + err := batch.Decode(msg.Payload) + if err != nil { + log.Infof("BatchProcessor unable to decode batch!. Dropping message.") + continue + } + + if d.wasProcessed(batch) { + log.Debugf("BatchProcessor got an already processed message from agent") + continue + } + + ctx := context.WithValue(d.ctx, "batch", batch) + for _, t := range d.tf { + log.Debugf("Batch processor creating a new task") + err := d.a.Tasks.Add(t.New(ctx)) + if err != nil { + log.Infof("BatchProcessor was unable to enqueue new task becasue %v", err) + } + } + + d.a.Out.Publish(msg) + case <-d.quitCh: + return + } + } + }() } diff --git a/gossip/processor_test.go b/gossip/processor_test.go new file mode 100644 index 000000000..94f778916 --- /dev/null +++ b/gossip/processor_test.go @@ -0,0 +1,155 @@ +/* + Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package gossip + +import ( + "context" + "io/ioutil" + "net/http" + "strings" + "sync" + "testing" + "time" + + "github.com/bbva/qed/protocol" + "github.com/prometheus/client_golang/prometheus" + "github.com/stretchr/testify/require" +) + +func TestBatchProcessorLoop(t *testing.T) { + var wg sync.WaitGroup + + ts := &testSubscriber{} + conf := DefaultConfig() + conf.NodeName = "testNode" + conf.Role = "auditor" + conf.BindAddr = "127.0.0.1:12345" + + a, err := NewAgentFromConfig(conf) + require.NoError(t, err, "Error creating agent!") + + p := NewBatchProcessor(a, nil) + a.In.Subscribe(BatchMessageType, p, 1) + defer p.Stop() + + a.Out.Subscribe(BatchMessageType, ts, 5) + batch := &protocol.BatchSnapshots{} + buf, _ := batch.Encode() + m1 := &Message{ + Kind: BatchMessageType, + From: nil, + TTL: 0, + Payload: buf, + } + + wg.Add(1) + go func() { + for { + select { + case m2 := <-ts.ch: + require.Equal(t, m1, m2, "Messages must be equal") + wg.Done() + return + } + } + }() + + a.In.Publish(m1) + + wg.Wait() +} + +func TestBatchProcessorWasProcessed(t *testing.T) { + + ts := &testSubscriber{} + + conf := DefaultConfig() + conf.NodeName = "testNode" + conf.Role = "auditor" + conf.BindAddr = "127.0.0.1:12345" + + a, err := NewAgentFromConfig(conf) + require.NoError(t, err, "Error creating agent!") + + p := NewBatchProcessor(a, nil) + a.In.Subscribe(BatchMessageType, p, 0) + defer p.Stop() + + a.Out.Subscribe(BatchMessageType, ts, 5) + batch := &protocol.BatchSnapshots{} + buf, _ := batch.Encode() + m1 := &Message{ + Kind: BatchMessageType, + From: nil, + TTL: 0, 
+ Payload: buf, + } + + a.In.Publish(m1) + a.In.Publish(m1) + // give time for the scheduler to route all the messages + time.Sleep(1 * time.Second) + + // only one message must be in the output channel as one must be + // dropped by the wasProcessed function + require.Equal(t, 1, len(ts.ch), "Output queue must be 1, duplicate event must be dropped by processor") +} + +type fakeTaskFactory struct{} + +func (f fakeTaskFactory) Metrics() []prometheus.Collector { + return []prometheus.Collector{ + prometheus.NewCounter(prometheus.CounterOpts{Name: "fakeCounterMetric"}), + } +} + +func (f fakeTaskFactory) New(c context.Context) Task { + return func() error { + return nil + } +} + +func TestBatchProcessorRegisterMetrics(t *testing.T) { + + conf := DefaultConfig() + conf.NodeName = "testNode" + conf.Role = "auditor" + conf.BindAddr = "127.0.0.1:12345" + conf.MetricsAddr = "127.0.0.1:12346" + + a, err := NewAgentFromConfig(conf) + require.NoError(t, err, "Error creating agent!") + a.Start() + defer a.Shutdown() + // wait for agent to start + // all services + time.Sleep(3 * time.Second) + + p := NewBatchProcessor(a, []TaskFactory{&fakeTaskFactory{}}) + a.In.Subscribe(BatchMessageType, p, 0) + defer p.Stop() + + resp, err := http.Get("http://" + conf.MetricsAddr + "/metrics") + if err != nil { + panic(err) + } + defer resp.Body.Close() + body, err := ioutil.ReadAll(resp.Body) + found := strings.Index(string(body), "fakeCounterMetric") + + require.True(t, found > 0, "Metric not found!") +} diff --git a/gossip/publisher/publisher.go b/gossip/publisher/publisher.go deleted file mode 100644 index 021b87aff..000000000 --- a/gossip/publisher/publisher.go +++ /dev/null @@ -1,213 +0,0 @@ -/* - Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -package publisher - -import ( - "bytes" - "io" - "io/ioutil" - "net/http" - "time" - - "github.com/bbva/qed/log" - "github.com/bbva/qed/metrics" - "github.com/bbva/qed/protocol" - "github.com/coocood/freecache" - - "github.com/prometheus/client_golang/prometheus" -) - -var ( - QedPublisherInstancesCount = prometheus.NewGauge( - prometheus.GaugeOpts{ - Name: "qed_publisher_instances_count", - Help: "Number of publisher agents running.", - }, - ) - - QedPublisherBatchesReceivedTotal = prometheus.NewCounter( - prometheus.CounterOpts{ - Name: "qed_publisher_batches_received_total", - Help: "Number of batches received by publishers.", - }, - ) - - QedPublisherBatchesProcessSeconds = prometheus.NewSummary( - prometheus.SummaryOpts{ - Name: "qed_publisher_batches_process_seconds", - Help: "Duration of Publisher batch processing", - }, - ) -) - -type Config struct { - PubUrls []string - AlertsUrls []string - TaskExecutionInterval time.Duration - MaxInFlightTasks int - MetricsAddr string -} - -func DefaultConfig() *Config { - return &Config{ - TaskExecutionInterval: 200 * time.Millisecond, - MaxInFlightTasks: 10, - } -} - -func NewConfig(urls []string) *Config { - cfg := DefaultConfig() - cfg.PubUrls = urls - return cfg -} - -type Publisher struct { - store *http.Client - conf Config - - processed *freecache.Cache - - taskCh chan Task - quitCh chan bool - executionTicker *time.Ticker -} - -type Task interface { - Do() -} - -func NewPublisher(conf Config) (*Publisher, error) { - QedPublisherInstancesCount.Inc() - publisher := Publisher{ - store: &http.Client{}, - conf: 
conf, - processed: freecache.NewCache(1 << 20), - taskCh: make(chan Task, 100), - quitCh: make(chan bool), - } - - publisher.executionTicker = time.NewTicker(conf.TaskExecutionInterval) - go publisher.runTaskDispatcher() - - return &publisher, nil -} - -func (p Publisher) RegisterMetrics(srv *metrics.Server) { - metrics := []prometheus.Collector{ - QedPublisherInstancesCount, - QedPublisherBatchesReceivedTotal, - QedPublisherBatchesProcessSeconds, - } - srv.MustRegister(metrics...) -} - -func (p *Publisher) Process(b *protocol.BatchSnapshots) { - QedPublisherBatchesReceivedTotal.Inc() - timer := prometheus.NewTimer(QedPublisherBatchesProcessSeconds) - defer timer.ObserveDuration() - var batch protocol.BatchSnapshots - - for _, signedSnap := range b.Snapshots { - _, err := p.processed.Get(signedSnap.Signature) - if err != nil { - _ = p.processed.Set(signedSnap.Signature, []byte{0x0}, 0) - batch.Snapshots = append(batch.Snapshots, signedSnap) - } - } - - if len(batch.Snapshots) < 1 { - return - } - - batch.From = b.From - batch.TTL = b.TTL - - task := &PublishTask{ - store: p.store, - pubUrl: p.conf.PubUrls[0], - taskCh: p.taskCh, - batch: &batch, - } - p.taskCh <- task -} - -func (p Publisher) runTaskDispatcher() { - for { - select { - case <-p.executionTicker.C: - go p.dispatchTasks() - case <-p.quitCh: - p.executionTicker.Stop() - return - } - } -} - -func (p *Publisher) Shutdown() { - QedPublisherInstancesCount.Dec() - - p.executionTicker.Stop() - p.quitCh <- true - close(p.quitCh) - close(p.taskCh) - log.Debugf("Publisher stopped.") -} - -func (p Publisher) dispatchTasks() { - count := 0 - var task Task - - for { - select { - case task = <-p.taskCh: - go task.Do() - count++ - default: - return - } - if count >= p.conf.MaxInFlightTasks { - return - } - } -} - -type PublishTask struct { - store *http.Client - pubUrl string - batch *protocol.BatchSnapshots - taskCh chan Task -} - -func (t PublishTask) Do() { - log.Debugf("Publisher is going to execute task: %+v", 
t) - buf, err := t.batch.Encode() - if err != nil { - log.Debug("Publisher had an error marshalling: %s\n", err.Error()) - return - } - resp, err := t.store.Post(t.pubUrl+"/batch", "application/json", bytes.NewBuffer(buf)) - if err != nil { - log.Infof("Publisher had an error saving batch in snapStore: %v", err) - t.taskCh <- t - return - } - defer resp.Body.Close() - _, err = io.Copy(ioutil.Discard, resp.Body) - if err != nil { - log.Infof("Publisher had an error getting response from snapStore saving a batch: %v", err) - } -} diff --git a/gossip/store.go b/gossip/store.go index a75c445f6..b400e028d 100644 --- a/gossip/store.go +++ b/gossip/store.go @@ -2,17 +2,173 @@ package gossip import ( "bytes" + "fmt" + "io" + "io/ioutil" + "math/rand" + "net" + "net/http" + "strconv" + "time" + "github.com/bbva/qed/log" "github.com/bbva/qed/protocol" "github.com/bbva/qed/util" "github.com/google/btree" ) -type LocalStore interface { - Put(version uint64, snapshot protocol.Snapshot) error - GetRange(start, end uint64) ([]protocol.Snapshot, error) +type SnapshotStore interface { + PutBatch(b *protocol.BatchSnapshots) error + PutSnapshot(version uint64, snapshot *protocol.SignedSnapshot) error + GetRange(start, end uint64) ([]protocol.SignedSnapshot, error) + GetSnapshot(version uint64) (*protocol.SignedSnapshot, error) DeleteRange(start, end uint64) error + Count() (uint64, error) +} + +// Implements access to a snapshot store +// in a http rest service. +// The process of sending the notifications is +// asynchronous, so a start and stop method is +type RestSnapshotStore struct { + endpoint []string + client *http.Client +} + +//RestSnapshotStore configuration object used to parse +//cli options and to build the SimpleNotifier instance +type RestSnapshotStoreConfig struct { + Endpoint []string `desc:"REST snapshot store service endpoint list http://ip1:port1/path1,http://ip2:port2/path2... 
"` + DialTimeout time.Duration `desc:"Timeout dialing the REST snapshot store service"` + ReadTimeout time.Duration `desc:"Timeout reading the REST snapshot store service response"` +} + +func NewRestSnapshotStoreFromConfig(c *RestSnapshotStoreConfig) *RestSnapshotStore { + return NewRestSnapshotStore(c.Endpoint, c.DialTimeout, c.ReadTimeout) +} + +func DefaultRestSnapshotStoreConfig() *RestSnapshotStoreConfig { + return &RestSnapshotStoreConfig{ + DialTimeout: 200 * time.Millisecond, + ReadTimeout: 200 * time.Millisecond, + } + +} + +// Returns a new RestSnapshotStore client +func NewRestSnapshotStore(endpoint []string, dialTimeout, readTimeout time.Duration) *RestSnapshotStore { + client := &http.Client{ + Transport: &http.Transport{ + Dial: func(netw, addr string) (net.Conn, error) { + // timeout calling the server + conn, err := net.DialTimeout(netw, addr, dialTimeout) + if err != nil { + return nil, err + } + // timeout reading from the connection + conn.SetDeadline(time.Now().Add(readTimeout)) + return conn, nil + }, + }} + + return &RestSnapshotStore{ + endpoint: endpoint, + client: client, + } +} + +// Stores a batch in the store +func (r *RestSnapshotStore) PutBatch(b *protocol.BatchSnapshots) error { + buf, err := b.Encode() + if err != nil { + return err + } + n := len(r.endpoint) + if n == 0 { + log.Errorf("No endpoint configured for snapshot store!") + } + url := r.endpoint[0] + if n > 1 { + url = r.endpoint[rand.Intn(n)] + } + resp, err := r.client.Post(url+"/batch", "application/json", bytes.NewBuffer(buf)) + if err != nil { + return err + } + defer resp.Body.Close() + _, err = io.Copy(ioutil.Discard, resp.Body) + if err != nil { + return err + } + + return nil +} + +func (r *RestSnapshotStore) PutSnapshot(version uint64, snapshot *protocol.SignedSnapshot) error { + panic("not implemented") +} + +func (r *RestSnapshotStore) GetRange(start uint64, end uint64) ([]protocol.SignedSnapshot, error) { + panic("not implemented") +} + +func (r 
*RestSnapshotStore) GetSnapshot(version uint64) (*protocol.SignedSnapshot, error) { + n := len(r.endpoint) + url := r.endpoint[0] + if n > 1 { + url = r.endpoint[rand.Intn(n)] + } + resp, err := r.client.Get(fmt.Sprintf("%s/snapshot?v=%d", url, version)) + if err != nil { + return nil, fmt.Errorf("Error getting snapshot %d from store because %v", version, err) + } + + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("Error getting snapshot from the store. Status: %d", resp.StatusCode) + } + buf, err := ioutil.ReadAll(resp.Body) + if err != nil { + return nil, err + } + var s protocol.SignedSnapshot + err = s.Decode(buf) + if err != nil { + return nil, fmt.Errorf("Error decoding signed snapshot %d codec", s.Snapshot.Version) + } + return &s, nil +} + +func (r *RestSnapshotStore) DeleteRange(start uint64, end uint64) error { + panic("not implemented") +} + +func (r *RestSnapshotStore) Count() (uint64, error) { + n := len(r.endpoint) + url := r.endpoint[0] + if n > 1 { + url = r.endpoint[rand.Intn(n)] + } + resp, err := r.client.Get(url + "/count") + if err != nil { + return 0, err + } + + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return 0, fmt.Errorf("Error getting snapshot from the store. 
Status: %d", resp.StatusCode) + } + buf, err := ioutil.ReadAll(resp.Body) + if err != nil { + return 0, err + } + count, err := strconv.ParseUint(string(buf), 10, 64) + if err != nil { + return 0, fmt.Errorf("Error parsin store response: got %d", resp.StatusCode) + } + + return count, nil } type BPlusTreeStore struct { @@ -27,7 +183,11 @@ func (p StoreItem) Less(b btree.Item) bool { return bytes.Compare(p.Key, b.(StoreItem).Key) < 0 } -func (s *BPlusTreeStore) Put(version uint64, snapshot protocol.Snapshot) error { +func (s *BPlusTreeStore) Count() (uint64, error) { + panic("not implemented") +} + +func (s *BPlusTreeStore) PutSnapshot(version uint64, snapshot protocol.Snapshot) error { encoded, err := snapshot.Encode() if err != nil { return err diff --git a/gossip/store_test.go b/gossip/store_test.go new file mode 100644 index 000000000..e08b174f1 --- /dev/null +++ b/gossip/store_test.go @@ -0,0 +1,51 @@ +/* + Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package gossip + +import ( + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/bbva/qed/protocol" + "github.com/stretchr/testify/require" +) + +func TestDefaultStore(t *testing.T) { + var called bool + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != "POST" { + w.Header().Set("Allow", "POST") + w.WriteHeader(http.StatusMethodNotAllowed) + return + } + called = true + w.WriteHeader(http.StatusNoContent) + })) + + defer server.Close() + conf := DefaultRestSnapshotStoreConfig() + conf.Endpoint = append(conf.Endpoint, server.URL) + store := NewRestSnapshotStoreFromConfig(conf) + + store.PutBatch(&protocol.BatchSnapshots{}) + + time.Sleep(1 * time.Second) + + require.True(t, called, "Server must be called from store") +} diff --git a/gossip/taskmanager.go b/gossip/taskmanager.go new file mode 100644 index 000000000..7626741d4 --- /dev/null +++ b/gossip/taskmanager.go @@ -0,0 +1,191 @@ +/* + Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +package gossip + +import ( + "context" + "fmt" + "time" + + "github.com/bbva/qed/log" + "github.com/bbva/qed/protocol" + "github.com/prometheus/client_golang/prometheus" +) + +// Task are executed by the TasksManager loop +// calling to their Do method. +// All tasks must have their own context already +// when they are enqueued into the task manager. 
+// +// For example: +// // complies with the Processor interface +// func TaskFactory(ctx context.Context) Task { +// a := ctx.Value("agent").(gossip.Agent) +// b := ctx.Value("batch").(*protocol.BatchSnapshots) +// return func() error { +// fmt.Println(a.Send(b)) +// return nil +// } +// } +type Task func() error + +// A task factory builds tasks with the +// provided context information. +// +// The context contains the agent API +// and the context added by the message +// processor. +// +// The implementer of the task factory must know +// the details of the data included by the processor +// in the context. +type TaskFactory interface { + New(context.Context) Task + Metrics() []prometheus.Collector +} + +// TasksManager executes enqueued tasks. It is in charge +// of applying limits to task execution such as timeouts. +// It only has an API to stop and start the tasks execution +// loop. +type TasksManager interface { + Start() + Stop() + Add(t Task) error + Len() int +} + +//SimpleTasksManager configuration object used to parse +//cli options and to build the SimpleTasksManager instance +type SimpleTasksManagerConfig struct { + Interval time.Duration `desc:"Interval to execute enqueued tasks"` + MaxTasks int `desc:"Maximum number of concurrent tasks"` +} + +// Returns the default configuration for the SimpleTasksManager +func DefaultSimpleTasksManagerConfig() *SimpleTasksManagerConfig { + return &SimpleTasksManagerConfig{ + Interval: 200 * time.Millisecond, + MaxTasks: 10, + } +} + +func NewSimpleTasksManagerFromConfig(c *SimpleTasksManagerConfig) *SimpleTasksManager { + return NewSimpleTasksManager(c.Interval, c.MaxTasks) +} + +// Simple implementation of a task manager used +// by the QED provided agents +type SimpleTasksManager struct { + taskCh chan Task + quitCh chan bool + ticker *time.Ticker + timeout time.Duration + maxTasks int +} + +// NewTasksManager returns a new TasksManager and its task +// channel. 
The execution loop will try to execute up to maxTasks tasks +// each interval. Also the channel has maxTasks capacity. +func NewSimpleTasksManager(i time.Duration, max int) *SimpleTasksManager { + return &SimpleTasksManager{ + taskCh: make(chan Task, max), + quitCh: make(chan bool), + ticker: time.NewTicker(i), + maxTasks: max, + } +} + +// Start activates the task dispatcher +// to execute enqueued tasks +func (t *SimpleTasksManager) Start() { + go func() { + for { + select { + case <-t.ticker.C: + go t.dispatchTasks() + case <-t.quitCh: + return + } + } + }() +} + +// Stop disables the task dispatcher. +// It does not wait to empty the +// task queue nor does it close the task channel. +func (t *SimpleTasksManager) Stop() { + close(t.quitCh) + t.ticker.Stop() +} + +// Add enqueues a task in the task manager queue. No timeout is +// applied: it will block until a task is read if the channel is full. +func (t *SimpleTasksManager) Add(task Task) error { + t.taskCh <- task + return nil +} + +// Len returns the number of pending tasks +// enqueued in the tasks channel +func (t *SimpleTasksManager) Len() int { + return len(t.taskCh) +} + +// dispatchTasks dequeues tasks and +// executes them in different goroutines +// up to maxTasks per call +func (t *SimpleTasksManager) dispatchTasks() { + count := 0 + + for { + select { + case task := <-t.taskCh: + go func() { + err := task() + if err != nil { + log.Infof("Task manager got an error from a task: %v", err) + } + }() + count++ + default: + return + } + if count >= t.maxTasks { + return + } + } +} + +// PrinterFactory creates tasks that print BatchSnapshots +// for testing purposes. 
It is intended to be used with the +// BatchSnapshot processor +type PrinterFactory struct { +} + +func (p PrinterFactory) Metrics() []prometheus.Collector { + return nil +} + +func (p PrinterFactory) New(ctx context.Context) Task { + // a := ctx.Value("agent").(Agent) + fmt.Println("PrinterFactory creating new Task!") + b := ctx.Value("batch").(*protocol.BatchSnapshots) + return func() error { + fmt.Printf("Printer Task: agent received batch: %+v\n", b) + return nil + } +} diff --git a/gossip/taskmanager_test.go b/gossip/taskmanager_test.go new file mode 100644 index 000000000..908023047 --- /dev/null +++ b/gossip/taskmanager_test.go @@ -0,0 +1,37 @@ +/* + Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +package gossip + +import ( + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestRunLen(t *testing.T) { + tm := NewSimpleTasksManager(100*time.Millisecond, 1) + tm.Start() + executions := 0 + tm.Add(func() error { + executions++ + return nil + }) + time.Sleep(1 * time.Second) + require.Equal(t, 0, tm.Len(), "Pending tasks must be 0") + tm.Stop() + require.Equal(t, 1, executions, "Executions must be 1") +} diff --git a/gossip/topology.go b/gossip/topology.go index 86670b84a..66a26b44d 100644 --- a/gossip/topology.go +++ b/gossip/topology.go @@ -16,136 +16,64 @@ package gossip import ( - "math/rand" "sync" - - "github.com/bbva/qed/gossip/member" ) -type PeerList struct { - L []*member.Peer -} - -type Filter func(m *member.Peer) bool - -func (l *PeerList) Filter(f Filter) *PeerList { - var b PeerList - b.L = make([]*member.Peer, 0) - for _, x := range l.L { - if f(x) { - b.L = append(b.L, x) - } - } - - return &b -} - -func (l *PeerList) Exclude(list *PeerList) *PeerList { - if list == nil { - return l - } - return l.Filter(func(p *member.Peer) bool { - for _, x := range list.L { - if x.Name == p.Name { - return false - } - } - return true - }) -} - -func (l PeerList) All() PeerList { - return l -} - -func (l *PeerList) Shuffle() *PeerList { - rand.Shuffle(len(l.L), func(i, j int) { - l.L[i], l.L[j] = l.L[j], l.L[i] - }) - return l -} - -func (l *PeerList) Update(m *member.Peer) { - for i, e := range l.L { - if e.Name == m.Name { - l.L[i] = m - return - } - } - l.L = append(l.L, m) -} - -func (l *PeerList) Delete(m *member.Peer) { - for i, e := range l.L { - if e.Name == m.Name { - copy(l.L[i:], l.L[i+1:]) - l.L[len(l.L)-1] = nil - l.L = l.L[:len(l.L)-1] - return - } - } -} - -func (l PeerList) Size() int { - return len(l.L) -} - +// Hold the gossip network information as this node sees it. +// This information can be used to route messages to other nodes. 
type Topology struct { - m []PeerList + m map[string]*PeerList sync.Mutex } +// Returns a new empty topology func NewTopology() *Topology { - m := make([]PeerList, member.Unknown) - for i := member.Auditor; i < member.Unknown; i++ { - m[i] = PeerList{ - L: make([]*member.Peer, 0), - } - } + m := make(map[string]*PeerList) return &Topology{ m: m, } } -func (t *Topology) Update(p *member.Peer) error { +// Updates the topology with the peer +// information +func (t *Topology) Update(p *Peer) error { t.Lock() defer t.Unlock() - t.m[p.Meta.Role].Update(p) + l, ok := t.m[p.Meta.Role] + if !ok { + t.m[p.Meta.Role] = NewPeerList() + l = t.m[p.Meta.Role] + } + + l.Update(p) return nil } -func (t *Topology) Delete(p *member.Peer) error { +// Deletes a peer from the topology +func (t *Topology) Delete(p *Peer) error { t.Lock() defer t.Unlock() - t.m[p.Meta.Role].Delete(p) + l := t.m[p.Meta.Role] + l.Delete(p) + return nil } -func (t *Topology) Get(kind member.Type) PeerList { +// Returns a list of peers of a given kind +func (t *Topology) Get(kind string) *PeerList { t.Lock() defer t.Unlock() return t.m[kind] } -func (t *Topology) Each(n int, exclude *PeerList) *PeerList { - var b PeerList - - auditors := t.m[member.Auditor].Exclude(exclude).Shuffle() - monitors := t.m[member.Monitor].Exclude(exclude).Shuffle() - publishers := t.m[member.Publisher].Exclude(exclude).Shuffle() +// Returns a peer list of each kind with n elements on each kind, +// Each list is built excluding all the nodes in the list l, shuffling the result, +// and taking the n elements from the head of the list. +func (t *Topology) Each(n int, l *PeerList) *PeerList { + var p PeerList - if len(auditors.L) > n { - auditors.L = auditors.L[:n] + for _, list := range t.m { + p.Append(list.Exclude(l).Shuffle().Take(n)) } - if len(monitors.L) > n { - monitors.L = monitors.L[:n] - } - if len(publishers.L) > n { - publishers.L = publishers.L[:n] - } - b.L = append(b.L, auditors.L...) 
- b.L = append(b.L, monitors.L...) - b.L = append(b.L, publishers.L...) - - return &b + return &p } diff --git a/gossip/topology_test.go b/gossip/topology_test.go index b8bb65e2c..3e6186b5f 100644 --- a/gossip/topology_test.go +++ b/gossip/topology_test.go @@ -18,167 +18,35 @@ package gossip import ( "fmt" - "strconv" "testing" - "github.com/bbva/qed/gossip/member" "github.com/stretchr/testify/require" ) -func setupPeerList(size int) *PeerList { - peers := make([]*member.Peer, 0) - for i := 0; i < size; i++ { - name := fmt.Sprintf("name%d", i) - port, _ := strconv.Atoi(fmt.Sprintf("900%d", i)) - role := member.Type(i % int(member.Unknown)) - peer := member.NewPeer(name, "127.0.0.1", uint16(port), role) - peers = append(peers, peer) - } - return &PeerList{peers} -} - func setupTopology(size int) *Topology { topology := NewTopology() for i := 0; i < size; i++ { name := fmt.Sprintf("name%d", i) - port, _ := strconv.Atoi(fmt.Sprintf("900%d", i)) - role := member.Type(i % int(member.Unknown)) - peer := member.NewPeer(name, "127.0.0.1", uint16(port), role) + port := uint16(9000 + i) + role := roles[i%len(roles)] + peer := NewPeer(name, "127.0.0.1", port, role) topology.Update(peer) } return topology } -func TestFilterPeerList(t *testing.T) { - list := setupPeerList(10) - - // filter auditor types - filtered := list.Filter(func(m *member.Peer) bool { - return m.Meta.Role == member.Auditor - }) - - require.Truef(t, list.Size() > filtered.Size(), "The filtered list should have less elements") - for _, e := range filtered.L { - require.Truef(t, member.Auditor == e.Meta.Role, "The role cannot be different to Auditor") - } -} - -func TestExcludePeerList(t *testing.T) { - list := setupPeerList(10) - - // exclude auditors - filtered := list.Filter(func(m *member.Peer) bool { - return m.Meta.Role == member.Auditor - }) - included := list.Exclude(filtered) - - require.Truef(t, list.Size() > included.Size(), "The included list should have less elements") - for _, e := range 
included.L { - require.Truef(t, member.Auditor != e.Meta.Role, "The role cannot be Auditor") - } -} - -func TestExcludeNonIncludedPeerList(t *testing.T) { - list := setupPeerList(10) - - // exclude unknown - var uknown PeerList - uknown.L = append(uknown.L, member.NewPeer("uknown", "127.0.0.1", 10000, member.Unknown)) - included := list.Exclude(&uknown) - - require.Truef(t, list.Size() == included.Size(), "The included list should have the same size") - for _, e := range included.L { - require.Truef(t, member.Unknown != e.Meta.Role, "The role cannot be Unknown") - } -} - -func TestAllPeerList(t *testing.T) { - list := setupPeerList(10) - - all := list.All() - - require.Equalf(t, list, &all, "The lists should be equal") -} - -func TestSizePeerList(t *testing.T) { - list := setupPeerList(10) - - require.Equalf(t, 10, list.Size(), "The size should match") -} - -func TestUpdatePeerList(t *testing.T) { - var list PeerList - - list.Update(member.NewPeer("auditor1", "127.0.0.1", 9001, member.Auditor)) - require.Equalf(t, 1, list.Size(), "The size should have been incremented by 1") - - list.Update(member.NewPeer("auditor2", "127.0.0.1", 9002, member.Auditor)) - require.Equalf(t, 2, list.Size(), "The size should have been incremented by 2") - - // update the previous one - list.Update(member.NewPeer("auditor2", "127.0.0.1", 9002, member.Auditor)) - require.Equalf(t, 2, list.Size(), "The size should have been incremented by 2") - - // update the previous one changing status - p := member.NewPeer("auditor2", "127.0.0.1", 9002, member.Auditor) - p.Status = member.Leaving - list.Update(p) - require.Equalf(t, 2, list.Size(), "The size should have been incremented by 2") - require.Equalf(t, member.Leaving, list.L[1].Status, "The status should have been updated") -} - -func TestDeletePeerList(t *testing.T) { - list := setupPeerList(10) - list2 := setupPeerList(10) - - // filter auditor types - auditors := list.Filter(func(m *member.Peer) bool { - return m.Meta.Role == 
member.Auditor - }) - - // delete auditors - for _, e := range auditors.L { - list2.Delete(e) - } - - require.Truef(t, 10 > list2.Size(), "The new list should have less elements") - for _, e := range list2.L { - require.Truef(t, member.Auditor != e.Meta.Role, "The role cannot be Auditor") - } -} - -func TestDeleteNotIncludedPeerList(t *testing.T) { - list := setupPeerList(10) - - list.Delete(member.NewPeer("unknown", "127.0.0.1", 10000, member.Unknown)) - - require.Truef(t, 10 == list.Size(), "The new list should have the same size") - -} - -func TestShufflePeerList(t *testing.T) { - list := setupPeerList(10) - - shuffled := list.Shuffle() - - require.Truef(t, 10 == shuffled.Size(), "The new list should have the same size") - for _, e := range list.L { - require.Containsf(t, shuffled.L, e, "The element should remain in the list") - } -} - func TestUpdateAndDeleteTopology(t *testing.T) { topology := NewTopology() - peer := member.NewPeer("auditor", "127.0.0.1", 9000, member.Auditor) + peer := NewPeer("auditor", "127.0.0.1", 9000, "auditor") topology.Update(peer) - auditors := topology.Get(member.Auditor) + auditors := topology.Get("auditor") require.Truef(t, 1 == auditors.Size(), "The topology must include one auditor") topology.Delete(peer) - auditors = topology.Get(member.Auditor) + auditors = topology.Get("auditor") require.Truef(t, 0 == auditors.Size(), "The topology must include zero auditor") } @@ -190,14 +58,14 @@ func TestEachWithoutExclusionsTopology(t *testing.T) { require.Truef(t, 3 == each.Size(), "It must include only 3 elements") - auditors := each.Filter(func(m *member.Peer) bool { - return m.Meta.Role == member.Auditor + auditors := each.Filter(func(m *Peer) bool { + return m.Meta.Role == "auditor" }) - monitors := each.Filter(func(m *member.Peer) bool { - return m.Meta.Role == member.Monitor + monitors := each.Filter(func(m *Peer) bool { + return m.Meta.Role == "monitor" }) - publishers := each.Filter(func(m *member.Peer) bool { - return m.Meta.Role 
== member.Publisher + publishers := each.Filter(func(m *Peer) bool { + return m.Meta.Role == "publisher" }) require.Truef(t, 1 == auditors.Size(), "It must include only one auditor") @@ -208,19 +76,19 @@ func TestEachWithoutExclusionsTopology(t *testing.T) { func TestEachWithExclusionsTopology(t *testing.T) { topology := setupTopology(10) - excluded := topology.Get(member.Auditor) - each := topology.Each(1, &excluded) + excluded := topology.Get("auditor") + each := topology.Each(1, excluded) require.Truef(t, 2 == each.Size(), "It must include only 2 elements") - auditors := each.Filter(func(m *member.Peer) bool { - return m.Meta.Role == member.Auditor + auditors := each.Filter(func(m *Peer) bool { + return m.Meta.Role == "auditor" }) - monitors := each.Filter(func(m *member.Peer) bool { - return m.Meta.Role == member.Monitor + monitors := each.Filter(func(m *Peer) bool { + return m.Meta.Role == "monitor" }) - publishers := each.Filter(func(m *member.Peer) bool { - return m.Meta.Role == member.Publisher + publishers := each.Filter(func(m *Peer) bool { + return m.Meta.Role == "publisher" }) require.Truef(t, 0 == auditors.Size(), "It must not include any auditor") diff --git a/main.go b/main.go index ce6e6465d..445b616ad 100644 --- a/main.go +++ b/main.go @@ -25,8 +25,7 @@ import ( ) func main() { - rootCmd := cmd.NewRootCommand(os.Args[1:]) - if err := rootCmd.Execute(); err != nil { + if err := cmd.Root.Execute(); err != nil { os.Exit(-1) } } diff --git a/metrics/metrics.go b/metrics/metrics.go index 80e57fa33..6d7b989ea 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -74,15 +74,18 @@ func NewServer(addr string) *Server { // Listens on the configured address and blocks until shutdown is called. 
func (m Server) Start() { - if err := m.server.ListenAndServe(); err != http.ErrServerClosed { - log.Errorf("Can't start metrics HTTP server: %s", err) - } + go func() { + if err := m.server.ListenAndServe(); err != http.ErrServerClosed { + log.Errorf("Can't start metrics HTTP server: %s", err) + } + }() } // Shutdown gracefully shutdowns metrics http server waiting 5 seconds for // connections to be closed. func (m Server) Shutdown() { - ctx, _ := context.WithTimeout(context.Background(), 5*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() m.server.Shutdown(ctx) } diff --git a/protocol/events.go b/protocol/events.go index c8d66d907..481404858 100644 --- a/protocol/events.go +++ b/protocol/events.go @@ -23,7 +23,6 @@ import ( "github.com/bbva/qed/balloon" "github.com/bbva/qed/balloon/history" "github.com/bbva/qed/balloon/hyper" - "github.com/bbva/qed/gossip/member" "github.com/bbva/qed/hashing" "github.com/bbva/qed/util" ) @@ -72,8 +71,6 @@ func (b *SignedSnapshot) Decode(msg []byte) error { type BatchSnapshots struct { Snapshots []*SignedSnapshot - TTL int - From *member.Peer } type Source struct { diff --git a/server/config.go b/server/config.go index 4e2c45d3a..9e8e62c96 100644 --- a/server/config.go +++ b/server/config.go @@ -23,6 +23,9 @@ import ( ) type Config struct { + //Log level + Log string + // Unique identifier to allow connections APIKey string @@ -77,6 +80,7 @@ func DefaultConfig() *Config { currentDir := getCurrentDir() return &Config{ + Log: "info", APIKey: "", NodeID: hostname, HTTPAddr: "127.0.0.1:8800", diff --git a/gossip/sender/sender.go b/server/sender.go similarity index 60% rename from gossip/sender/sender.go rename to server/sender.go index 38fd02b56..c4657173c 100644 --- a/gossip/sender/sender.go +++ b/server/sender.go @@ -14,7 +14,7 @@ limitations under the License. 
*/ -package sender +package server import ( "fmt" @@ -46,52 +46,35 @@ var ( ) type Sender struct { - agent *gossip.Agent - config *Config - signer sign.Signer - out chan *protocol.BatchSnapshots - quit chan bool + agent *gossip.Agent + Interval time.Duration + BatchSize int + NumSenders int + TTL int + signer sign.Signer + quitCh chan bool } -type Config struct { - BatchSize int - BatchInterval time.Duration - NumSenders int - TTL int - EachN int - SendTimer time.Duration -} - -func DefaultConfig() *Config { - return &Config{ - BatchSize: 100, - BatchInterval: 1 * time.Second, - NumSenders: 3, - TTL: 1, - EachN: 1, - SendTimer: 1000 * time.Millisecond, - } -} - -func NewSender(a *gossip.Agent, c *Config, s sign.Signer) *Sender { - QedSenderInstancesCount.Inc() +func NewSender(a *gossip.Agent, s sign.Signer, size, ttl, n int) *Sender { return &Sender{ - agent: a, - config: c, - signer: s, - out: make(chan *protocol.BatchSnapshots, 1<<16), - quit: make(chan bool), + agent: a, + Interval: 100 * time.Millisecond, + BatchSize: size, + NumSenders: n, + TTL: ttl, + signer: s, + quitCh: make(chan bool), } } // Start NumSenders concurrent senders and waits for them // to finish func (s Sender) Start(ch chan *protocol.Snapshot) { - for i := 0; i < s.config.NumSenders; i++ { - log.Debugf("starting sender %d", i) - go s.batcherSender(i, ch, s.quit) + QedSenderInstancesCount.Inc() + for i := 0; i < s.NumSenders; i++ { + log.Debugf("Starting sender %d", i) + go s.batcher(i, ch) } - <-s.quit } func (s Sender) RegisterMetrics(srv *metrics.Server) { @@ -104,8 +87,6 @@ func (s Sender) RegisterMetrics(srv *metrics.Server) { func (s Sender) newBatch() *protocol.BatchSnapshots { return &protocol.BatchSnapshots{ - TTL: s.config.TTL, - From: s.agent.Self, Snapshots: make([]*protocol.SignedSnapshot, 0), } } @@ -114,14 +95,26 @@ func (s Sender) newBatch() *protocol.BatchSnapshots { // to other members of the gossip network. 
// If the out queue is full, we drop the current batch and pray other sender will // send the batches to the gossip network. -func (s Sender) batcherSender(id int, ch chan *protocol.Snapshot, quit chan bool) { +func (s Sender) batcher(id int, ch chan *protocol.Snapshot) { batch := s.newBatch() for { select { case snap := <-ch: - if len(batch.Snapshots) == s.config.BatchSize { - s.agent.ChTimedSend(batch, s.out) + if len(batch.Snapshots) == s.BatchSize { + payload, err := batch.Encode() + if err != nil { + log.Infof("Error encoding batch, dropping it") + continue + } + + s.agent.Out.Publish(&gossip.Message{ + Kind: gossip.BatchMessageType, + TTL: s.TTL, + Payload: payload, + }) + QedSenderBatchesSentTotal.Inc() + batch = s.newBatch() } ss, err := s.doSign(snap) @@ -129,45 +122,32 @@ func (s Sender) batcherSender(id int, ch chan *protocol.Snapshot, quit chan bool log.Errorf("Failed signing message: %v", err) } batch.Snapshots = append(batch.Snapshots, ss) - case b := <-s.out: - go s.sender(b) - case <-time.After(s.config.SendTimer): + case <-time.After(s.Interval): // send whatever we have on each tick, do not wait // to have complete batches if len(batch.Snapshots) > 0 { - s.agent.ChTimedSend(batch, s.out) + payload, err := batch.Encode() + if err != nil { + log.Infof("Error encoding batch, dropping it") + continue + } + s.agent.Out.Publish(&gossip.Message{ + Kind: gossip.BatchMessageType, + TTL: s.TTL, + Payload: payload, + }) + QedSenderBatchesSentTotal.Inc() batch = s.newBatch() } - case <-quit: + case <-s.quitCh: return } } } -// Send a batch to the peers it selects based on the gossip -// network topology. -// Do not retry sending to faulty agents, and pray other -// sender will. 
-func (s Sender) sender(batch *protocol.BatchSnapshots) { - msg, _ := batch.Encode() - peers := s.agent.Topology.Each(s.config.EachN, nil) - for _, peer := range peers.L { - QedSenderBatchesSentTotal.Inc() - dst := peer.Node() - - log.Debugf("Sending batch %+v to node %+v\n", batch, dst.Name) - - err := s.agent.Memberlist().SendReliable(dst, msg) - if err != nil { - log.Infof("Failed send message to %+v because: %v", peer, err) - } - } - log.Debugf("Sent batch %+v to nodes %+v\n", batch, peers.L) -} - func (s Sender) Stop() { QedSenderInstancesCount.Dec() - close(s.quit) + close(s.quitCh) } func (s *Sender) doSign(snapshot *protocol.Snapshot) (*protocol.SignedSnapshot, error) { diff --git a/server/server.go b/server/server.go index e56058939..466965502 100644 --- a/server/server.go +++ b/server/server.go @@ -34,15 +34,12 @@ import ( "github.com/bbva/qed/api/apihttp" "github.com/bbva/qed/api/mgmthttp" "github.com/bbva/qed/gossip" - "github.com/bbva/qed/gossip/member" - "github.com/bbva/qed/gossip/sender" "github.com/bbva/qed/log" "github.com/bbva/qed/metrics" "github.com/bbva/qed/protocol" "github.com/bbva/qed/raftwal" "github.com/bbva/qed/sign" "github.com/bbva/qed/storage/rocks" - "github.com/bbva/qed/util" ) // Server encapsulates the data and login to start/stop a QED server @@ -57,7 +54,7 @@ type Server struct { metricsServer *metrics.Server prometheusRegistry *prometheus.Registry signer sign.Signer - sender *sender.Sender + sender *Sender agent *gossip.Agent snapshotsCh chan *protocol.Snapshot } @@ -125,26 +122,19 @@ func NewServer(conf *Config) (*Server, error) { // Create gossip agent config := gossip.DefaultConfig() config.BindAddr = conf.GossipAddr - config.Role = member.Server + config.Role = "server" config.NodeName = conf.NodeID - server.agent, err = gossip.NewAgent(config, nil, server.metricsServer) + server.agent, err = gossip.NewAgentFromConfig(config) if err != nil { return nil, err } - if len(conf.GossipJoinAddr) > 0 { - _, err = 
server.agent.Join(conf.GossipJoinAddr) - if err != nil { - return nil, err - } - } - // TODO: add queue size to config - server.snapshotsCh = make(chan *protocol.Snapshot, 2<<16) + server.snapshotsCh = make(chan *protocol.Snapshot, 1<<16) // Create sender - server.sender = sender.NewSender(server.agent, sender.DefaultConfig(), server.signer) + server.sender = NewSender(server.agent, server.signer, 500, 2, 3) // Create RaftBalloon server.raftBalloon, err = raftwal.NewRaftBalloon(conf.RaftPath, conf.RaftAddr, conf.NodeID, store, server.snapshotsCh) @@ -253,14 +243,9 @@ func (s *Server) Start() error { } } - go func() { - log.Debug(" * Starting QED gossip agent.") - s.sender.Start(s.snapshotsCh) - }() - - util.AwaitTermSignal(s.Stop) + s.sender.Start(s.snapshotsCh) - log.Debug("Stopping server, about to exit...") + s.agent.Start() return nil } @@ -292,8 +277,9 @@ func (s *Server) Stop() error { return err } - log.Debugf("Closing QED sender...") - s.sender.Stop() + /* + log.Debugf("Closing QED sender...") + s.sender.Stop() */ close(s.snapshotsCh) log.Debugf("Stopping QED agent...") diff --git a/tests/add_verify b/tests/add_verify deleted file mode 100755 index 20a9c978e..000000000 --- a/tests/add_verify +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env bash - -# Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -set -e - -QED="go run ../main.go -l info -k path" - -add_event(){ - local event="$1"; shift - $QED client add --key "${event}" -} - - -#Adding key [ test event ] -#test event -#Received snapshot with values: -# Event: test event -# HyperDigest: a45fe00356dfccb20b8bc9a7c8331d5c0f89c4e70e43ea0dc0cb646a4b29e59b -# HistoryDigest: 444f6e7eee66986752983c1d8952e2f0998488a5b038bed013c55528551eaafa -# Version: 0 - -verify_event() { - local snapshot="$1"; shift - echo "${snapshot}" - local event=$(echo "${snapshot}" | grep "Event: " | awk -F': ' '{print $2;}') - local history=$(echo "${snapshot}" | grep "HistoryDigest" | awk -F': ' '{print $2;}') - local hyper=$(echo "${snapshot}" | grep "HyperDigest: " | awk -F': ' '{print $2;}') - local version=$(echo "${snapshot}" | grep "Version: " | awk -F': ' '{print $2;}') - $QED client membership --historyDigest ${history} --hyperDigest ${hyper} --version ${version} --key ${event} --verify -} - -for i in $(seq 1 1000); do - event=$(cat /dev/urandom | xxd -l 120 -ps -c 120) - snapshot=$(add_event "${event}" "42") - verify_event "${snapshot}" -done - diff --git a/tests/attack_add/attack_add.lua b/tests/attack_add/attack_add.lua deleted file mode 100644 index a1aeb068c..000000000 --- a/tests/attack_add/attack_add.lua +++ /dev/null @@ -1,43 +0,0 @@ --- Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. - --- Licensed under the Apache License, Version 2.0 (the "License"); --- you may not use this file except in compliance with the License. --- You may obtain a copy of the License at - --- http://www.apache.org/licenses/LICENSE-2.0 - --- Unless required by applicable law or agreed to in writing, software --- distributed under the License is distributed on an "AS IS" BASIS, --- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. --- See the License for the specific language governing permissions and --- limitations under the License. 
- -counter = 0 - -local b='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/' -- You will need this for encoding/decoding - --- encoding -function enc(data) - return ((data:gsub('.', function(x) - local r,b='',x:byte() - for i=8,1,-1 do r=r..(b%2^i-b%2^(i-1)>0 and '1' or '0') end - return r; - end)..'0000'):gsub('%d%d%d?%d?%d?%d?', function(x) - if (#x < 6) then return '' end - local c=0 - for i=1,6 do c=c+(x:sub(i,i)=='1' and 2^(6-i) or 0) end - return b:sub(c+1,c+1) - end)..({ '', '==', '=' })[#data%3+1]) -end - -request = function() - path = "/events" - wrk.headers["Api-Key"] = "pepe" - - wrk.method = "POST" - wrk.body = '{ "event": "' .. enc("test event number " .. counter ) .. '"}' - wrk.headers["Content-Type"] = "application/json" - - counter = counter + 1 - return wrk.format(nil, path) -end diff --git a/tests/e2e/agents_test.go b/tests/e2e/agents_test.go deleted file mode 100644 index 008ba46f7..000000000 --- a/tests/e2e/agents_test.go +++ /dev/null @@ -1,85 +0,0 @@ -/* - Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ -package e2e - -import ( - "fmt" - "io/ioutil" - "net/http" - "testing" - "time" - - "github.com/bbva/qed/protocol" - "github.com/bbva/qed/testutils/rand" - "github.com/bbva/qed/testutils/scope" - assert "github.com/stretchr/testify/require" -) - -func getSnapshot(version uint64) (*protocol.SignedSnapshot, error) { - resp, err := http.Get(fmt.Sprintf("%s/snapshot?v=%d", StoreURL, version)) - if err != nil { - return nil, fmt.Errorf("Error getting snapshot from the store: %v", err) - } - defer resp.Body.Close() - if resp.StatusCode != http.StatusOK { - return nil, fmt.Errorf("Error getting snapshot from the store. Status: %d", resp.StatusCode) - } - - buf, _ := ioutil.ReadAll(resp.Body) - s := &protocol.SignedSnapshot{} - err = s.Decode(buf) - if err != nil { - return nil, fmt.Errorf("Error decoding signed snapshot %d codec", version) - } - return s, nil -} - -func TestAgents(t *testing.T) { - bStore, aStore := setupStore(t) - bServer, aServer := setupServer(0, "", false, t) - bAuditor, aAuditor := setupAuditor(0, t) - bMonitor, aMonitor := setupMonitor(0, t) - bPublisher, aPublisher := setupPublisher(0, t) - - scenario, let := scope.Scope(t, - merge(bServer, bStore, bPublisher, bAuditor, bMonitor), - merge(aServer, aPublisher, aAuditor, aMonitor, aStore), - ) - - event := rand.RandomString(10) - - scenario("Add one event and check that it has been published without alerts", func() { - var snapshot *protocol.Snapshot - var ss *protocol.SignedSnapshot - var err error - - client := getClient(t, 0) - - let("Add event", func(t *testing.T) { - snapshot, err = client.Add(event) - assert.NoError(t, err) - }) - - let("Get signed snapshot from snapshot public storage", func(t *testing.T) { - retry(3, 1*time.Second, func() error { - ss, err = getSnapshot(0) - return err - }) - assert.NoError(t, err) - assert.Equal(t, snapshot, ss.Snapshot, "Snapshots must be equal") - }) - }) -} diff --git a/tests/e2e/server_test.go b/tests/e2e/server_test.go index 60a71d0e4..d4948483f 
100644 --- a/tests/e2e/server_test.go +++ b/tests/e2e/server_test.go @@ -37,7 +37,7 @@ func TestStart(t *testing.T) { let("Query info endpoint", func(t *testing.T) { var resp *http.Response var err error - retry(3, 1*time.Second, func() error { + retry(3, 2*time.Second, func() error { resp, err = doReq("GET", "http://localhost:8800/info", APIKey, nil) return err }) diff --git a/tests/e2e/setup.go b/tests/e2e/setup.go index ffa1fd0c1..b6ce39b85 100644 --- a/tests/e2e/setup.go +++ b/tests/e2e/setup.go @@ -29,13 +29,8 @@ import ( "time" "github.com/bbva/qed/client" - "github.com/bbva/qed/gossip" - "github.com/bbva/qed/gossip/auditor" - "github.com/bbva/qed/gossip/member" - "github.com/bbva/qed/gossip/monitor" - "github.com/bbva/qed/gossip/publisher" - "github.com/bbva/qed/metrics" "github.com/bbva/qed/server" + "github.com/bbva/qed/testutils/notifierstore" "github.com/bbva/qed/testutils/scope" "github.com/pkg/errors" ) @@ -104,146 +99,10 @@ func doReq(method string, url, apiKey string, payload *strings.Reader) (*http.Re return resp, err } -func newAgent(id int, name string, role member.Type, p gossip.Processor, t *testing.T) *gossip.Agent { - agentConf := gossip.DefaultConfig() - agentConf.NodeName = fmt.Sprintf("%s%d", name, id) - - switch role { - case member.Auditor: - agentConf.BindAddr = fmt.Sprintf("127.0.0.1:810%d", id) - agentConf.MetricsAddr = fmt.Sprintf("127.0.0.1:811%d", id) - case member.Monitor: - agentConf.BindAddr = fmt.Sprintf("127.0.0.1:820%d", id) - agentConf.MetricsAddr = fmt.Sprintf("127.0.0.1:821%d", id) - case member.Publisher: - agentConf.BindAddr = fmt.Sprintf("127.0.0.1:830%d", id) - agentConf.MetricsAddr = fmt.Sprintf("127.0.0.1:831%d", id) - } - - agentConf.StartJoin = []string{QEDGossip} - agentConf.EnableCompression = true - agentConf.AlertsUrls = []string{AlertsURL} - agentConf.Role = role - metricsServer := metrics.NewServer(agentConf.MetricsAddr) - agent, err := gossip.NewAgent(agentConf, []gossip.Processor{p}, metricsServer) - if err 
!= nil { - t.Fatalf("Failed to start AGENT %s: %v", name, err) - } - _, _ = agent.Join([]string{QEDGossip}) - return agent -} - -func setupAuditor(id int, t *testing.T) (scope.TestF, scope.TestF) { - var au *auditor.Auditor - var agent *gossip.Agent - var err error - - before := func(t *testing.T) { - auditorConf := auditor.DefaultConfig() - auditorConf.MetricsAddr = fmt.Sprintf("127.0.0.1:710%d", id) - auditorConf.QEDUrls = []string{QEDUrl} - auditorConf.PubUrls = []string{StoreURL} - auditorConf.AlertsUrls = []string{AlertsURL} - auditorConf.APIKey = APIKey - - au, err = auditor.NewAuditor(*auditorConf) - if err != nil { - t.Fatalf("Unable to create a new auditor: %v", err) - } - - agent = newAgent(id, "auditor", member.Auditor, au, t) - } - - after := func(t *testing.T) { - if au != nil { - au.Shutdown() - } - err := agent.Leave() - if err != nil { - t.Fatalf("Unable to shutdown the auditor: %v", err) - } - err = agent.Shutdown() - if err != nil { - t.Fatalf("Unable to shutdown the auditor: %v", err) - } - } - return before, after -} - -func setupMonitor(id int, t *testing.T) (scope.TestF, scope.TestF) { - var mn *monitor.Monitor - var agent *gossip.Agent - var err error - - before := func(t *testing.T) { - monitorConf := monitor.DefaultConfig() - monitorConf.MetricsAddr = fmt.Sprintf("127.0.0.1:720%d", id) - monitorConf.QEDUrls = []string{QEDUrl} - monitorConf.AlertsUrls = []string{AlertsURL} - monitorConf.APIKey = APIKey - - mn, err = monitor.NewMonitor(monitorConf) - if err != nil { - t.Fatalf("Unable to create a new monitor: %v", err) - } - - agent = newAgent(id, "monitor", member.Monitor, mn, t) - } - - after := func(t *testing.T) { - if mn != nil { - mn.Shutdown() - } - err := agent.Leave() - if err != nil { - t.Fatalf("Unable to shutdown the monitor: %v", err) - } - err = agent.Shutdown() - if err != nil { - t.Fatalf("Unable to shutdown the monitor: %v", err) - } - } - return before, after -} - -func setupPublisher(id int, t *testing.T) (scope.TestF, 
scope.TestF) { - var pu *publisher.Publisher - var agent *gossip.Agent - var err error - - before := func(t *testing.T) { - conf := publisher.DefaultConfig() - conf.MetricsAddr = fmt.Sprintf("127.0.0.1:730%d", id) - conf.PubUrls = []string{StoreURL} - - pu, err = publisher.NewPublisher(*conf) - if err != nil { - t.Fatalf("Unable to create a new publisher: %v", err) - } - - agent = newAgent(id, "publisher", member.Publisher, pu, t) - } - - after := func(t *testing.T) { - if pu != nil { - pu.Shutdown() - } - err := agent.Leave() - if err != nil { - t.Fatalf("Unable to shutdown the publisher: %v", err) - } - err = agent.Shutdown() - if err != nil { - t.Fatalf("Unable to shutdown the publisher: %v", err) - } - } - return before, after -} - func setupStore(t *testing.T) (scope.TestF, scope.TestF) { - var s *Service + var s *notifierstore.Service before := func(t *testing.T) { - s = NewService() + s = notifierstore.NewService() foreground := false s.Start(foreground) } diff --git a/tests/gossip/add.sh b/tests/gossip/add.sh index fd4121705..939cbc923 100644 --- a/tests/gossip/add.sh +++ b/tests/gossip/add.sh @@ -1,3 +1,9 @@ #!/bin/bash -go run $GOPATH/src/github.com/bbva/qed/main.go client add --apikey foo -e http://127.0.0.1:8800 --key key$1 +# client options +CLIENT_CONFIG=() +CLIENT_CONFIG+=("--log debug") +CLIENT_CONFIG+=("--endpoints http://127.0.0.1:8800") +config=$(echo ${CLIENT_CONFIG[@]} | i=0 envsubst ) + +go run $GOPATH/src/github.com/bbva/qed/main.go client add $config --event $1 diff --git a/tests/gossip/membership.sh b/tests/gossip/membership.sh index b1067b775..b0e82cfa4 100644 --- a/tests/gossip/membership.sh +++ b/tests/gossip/membership.sh @@ -1,10 +1,9 @@ #!/bin/bash +# client options +CLIENT_CONFIG=() +CLIENT_CONFIG+=("--log debug") +CLIENT_CONFIG+=("--endpoints http://127.0.0.1:8800") +config=$(echo ${CLIENT_CONFIG[@]} | i=0 envsubst ) -go run $GOPATH/src/github.com/bbva/qed/main.go \ - --apikey my-key \ - client \ - --log info \ - --endpoints 
http://${QED_LEADER}:8800 \ - membership --key key$1 \ - --version $1 --verify +go run $GOPATH/src/github.com/bbva/qed/main.go client membership $config --event $1 diff --git a/tests/gossip/run_gossip.sh b/tests/gossip/run_gossip.sh deleted file mode 100755 index fc254785d..000000000 --- a/tests/gossip/run_gossip.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -# Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -qedGossipEndpoint="127.0.0.1:8400" -snapshotStoreEndpoint="http://127.0.0.1:8888" -alertsStoreEndpoint="http://127.0.0.1:8888" -qedHTTPEndpoint="http://127.0.0.1:8800" -keyFile="/var/tmp/id_ed25519" -QED="go run $GOPATH/src/github.com/bbva/qed/main.go" - -if [ ! -f "$keyFile" ]; then - echo Create id_ed25519 key - echo -e 'y\n' | ssh-keygen -t ed25519 -N '' -f /var/tmp/id_ed25519 -fi - -xterm -hold -e "$QED start -k key -l debug -p $(mktemp -d) --node-id server0 --raft-addr 127.0.0.1:8500 --gossip-addr 127.0.0.1:8400 --mgmt-addr 127.0.0.1:8700 --metrics-addr 127.0.0.1:8600 --http-addr 127.0.0.1:8800 --keypath $keyFile" & -pids[0]=$! - -sleep 3s - -xterm -hold -e "$QED start -k key -l debug -p $(mktemp -d) --node-id server1 --gossip-addr 127.0.0.2:8401 --raft-addr 127.0.0.2:8501 --keypath $keyFile --join-addr 127.0.0.1:8700 --gossip-join-addr 127.0.0.1:8400 --http-addr 127.0.0.2:8801 --mgmt-addr 127.0.0.2:8701 --metrics-addr 127.0.0.2:8601" & -pids+=($!) 
- -sleep 2s - -for i in `seq 1 $1`; -do - xterm -hold -e "$QED agent --metrics 127.0.0.2:1810$i auditor -k key -l debug --bind 127.0.0.1:810$i --join $qedGossipEndpoint --qedUrls $qedHTTPEndpoint --pubUrls $snapshotStoreEndpoint --node auditor$i --alertsUrls $alertsStoreEndpoint" & - pids+=($!) -done - -for i in `seq 1 $2`; -do - xterm -hold -e "$QED agent --metrics 127.0.0.2:1820$i --alertsUrls $alertsStoreEndpoint monitor -k key -l debug --bind 127.0.0.1:820$i --join $qedGossipEndpoint --qedUrls $qedHTTPEndpoint --node monitor$i " & - pids+=($!) -done - -for i in `seq 1 $3`; -do - xterm -hold -e "$QED agent --metrics 127.0.0.2:1830$i --alertsUrls $alertsStoreEndpoint publisher -k key -l debug --bind 127.0.0.1:830$i --join $qedGossipEndpoint --pubUrls $snapshotStoreEndpoint --node publisher$i " & - pids+=($!) -done - -for pid in ${pids[*]}; do - echo waiting for pid $pid - wait $pid -done diff --git a/tests/qedmanager b/tests/qedmanager deleted file mode 100644 index 02487193e..000000000 --- a/tests/qedmanager +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env bash - -# Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -certs="$HOME/.ssh" -qed="go run $GOPATH/src/github.com/bbva/qed/main.go" -service="go run $GOPATH/src/github.com/bbva/qed/tests/gossip/test_service.go" - - -apikey() { echo "--apikey test_key"; } -nodeId() { echo "--node-id ${FUNCNAME[1]}_$1";} -path() { echo "--path $1";} -log() { echo "--log $1";} -keyPath() { echo "--keypath $certs/id_ed25519"; } -httpAddr() { echo "--http-addr 127.0.0.1:88`printf '%02d' $1`"; } -mgmtAddr() { echo "--mgmt-addr 127.0.0.1:87`printf '%02d' $1`"; } -metricsAddr() { echo "--metris-addr 127.0.0.1:86`printf '%02d' $1`";} -raftAddr() { echo "--raft-addr 127.0.0.1:85`printf '%02d' $1`"; } -gossipAddr() { echo "--gossip-addr 127.0.0.1:84`printf '%02d' $1`"; } -joinAddr() { echo "--join-addr 127.0.0.1:8700"; } -gossipJoinAddr() { echo "--gossip-join-addr 127.0.0.1:8400"; } -profiling() { echo "--profiling"; } -cert() { echo "--certificate $certs/server.crt"; } -certKey() { echo "--certificate-key $certs/server.key"; } -tempDir() { mktemp -d -p "/var/tmp"; } - -bindAddr() { echo "--bind 127.0.0.1:8`printf '%d%02d' $1 $2`"; } -alertsUrls() { echo "--alertsUrls http://127.0.0.1:8888"; } -qedUrls() { echo "--qedUrls http://127.0.0.1:8800"; } -pubUrls() { echo "--pubUrl http://127.0.0.1:8888"; } - -qedLeader() { - dir=`tempDir` - echo $qed `apikey` `path $dir` `nodeId 0` `keyPath` `gossipAddr 0` `log error` `cert` `certKey` `profiling` -} - -qedFollower() { - dir=`tempDir` - echo $qed `apikey` `path $dir` `nodeId $1` `keyPath` `gossipAddr $1` `log error` `cert` `certKey` `httpAddr $1` `mgmtAddr $1` `metricsAddr $1` `raftAddr $1` `gossipAddr $1` `joinAddr` `gossipJoinAddr` `cert` `certKey``profiling` -} - - -agent() { - type=$1; shift; - portIndex=$1; shift - id=$1; shift; - echo $qed agent `alertsUrls` $type `apiKey` `log error` `bindAddr $portIndex $id``gossipJoinAddr` `quedUrls` `pubUrls` `nodeId $id` -} - -auditor() { - agent auditor 1 $1 -} - -monitor() { - agent monitor 2 $1 -} - -publisher() { - agent publisher 3 $1 -} - -service() { - 
echo $service -} - -run() { - name=$1; shift; - cmd=$1; shift; - dir=$1; shift; - $cmd 2>&1 > $dir/$name.log & - echo $! -} - -x=$1; shift -qedCluster=$1; shift; -monitors=$1; shift; -auditors=$1; shift; -publisers=$1; shift; - - diff --git a/tests/start_agent b/tests/start_agent new file mode 100755 index 000000000..40432b773 --- /dev/null +++ b/tests/start_agent @@ -0,0 +1,93 @@ +#!/bin/bash + +# Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +CGO_LDFLAGS_ALLOW='.*' +QED="go run $GOPATH/src/github.com/bbva/qed/main.go" + +# Agent options +AGENT_CONFIG=() +AGENT_CONFIG+=('--log debug') +AGENT_CONFIG+=('--bind-addr 127.0.0.1:810${i}') +AGENT_CONFIG+=('--metrics-addr 127.0.0.2:1810${i}') +AGENT_CONFIG+=('--start-join 127.0.0.1:8400') + +# Notifier options +NOTIFIER_CONFIG=() +NOTIFIER_CONFIG+=('--notifier-endpoint http://127.0.0.1:8888/alert') + +# Snapshot store options +STORE_CONFIG=() +STORE_CONFIG+=('--store-endpoint http://127.0.0.1:8888') + +# Task manager options +TASKS_CONFIG=() +TASKS_CONFIG+=("") + +# QED client options +QED_CONFIG=() +QED_CONFIG+=("--qed-endpoints http://127.0.0.1:8800") + + +MONITOR_CONFIG=("${AGENT_CONFIG[@]}" "${NOTIFIER_CONFIG[@]}" "${STORE_CONFIG[@]}" "${TASKS_CONFIG[@]}" "${QED_CONFIG[@]}") +MONITOR_CONFIG+=('--role monitor') +MONITOR_CONFIG+=('--node-name monitor${i}') + +PUBLISHER_CONFIG=("${AGENT_CONFIG[@]}" "${NOTIFIER_CONFIG[@]}" "${STORE_CONFIG[@]}" "${TASKS_CONFIG[@]}" ) +PUBLISHER_CONFIG+=('--role publisher') +PUBLISHER_CONFIG+=('--node-name publisher${i}') + +AUDITOR_CONFIG=("${AGENT_CONFIG[@]}" "${NOTIFIER_CONFIG[@]}" "${STORE_CONFIG[@]}" "${TASKS_CONFIG[@]}" "${QED_CONFIG[@]}") +AUDITOR_CONFIG+=('--role auditor') +AUDITOR_CONFIG+=('--node-name auditor${i}') + +start() { + local type="$1" + local id="$2" + + case "$type" in + "monitor") + conf=$(echo "${MONITOR_CONFIG[@]}"| i=$id envsubst ) + ;; + "publisher") + conf=$(echo "${PUBLISHER_CONFIG[@]}"| i=$id envsubst ) + ;; + "auditor") + conf=$(echo "${AUDITOR_CONFIG[@]}"| i=$id envsubst ) + ;; + *) + echo Unknown agent type. Please choose monitor, publisher or auditor + return -1 + ;; + esac + echo CONFIG $conf + mkdir -p /var/tmp/qed-$type-$id/ + $QED agent $type $conf 2>&1 | tee /var/tmp/qed-$type-$id/agent.log & + pids+=($!) 
+} + +pids=() + +count=0 +for agent in "$@" +do + echo Starting "$agent" "$count" + start "$agent" "$count" + count=$((count+1)) +done + +for pid in ${pids[*]}; do + echo waiting for pid $pid + wait $pid +done diff --git a/tests/start_agents b/tests/start_agents deleted file mode 100755 index cc80f3ebf..000000000 --- a/tests/start_agents +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env sh - -# Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -qedGossipEndpoint="127.0.0.1:8400" -alertsStoreEndpoint="http://127.0.0.1:8888" -snapshotStoreEndpoint="http://127.0.0.1:8888" -qedHTTPEndpoint="http://127.0.0.1:8800" -QED="go run $GOPATH/src/github.com/bbva/qed/main.go" - -for i in `seq 1 $1`; -do - $QED agent \ - --alertsUrls $alertsStoreEndpoint \ - auditor \ - -k test_key \ - -l info \ - --bind 127.0.0.1:810$i \ - --join $qedGossipEndpoint \ - --qedUrls $qedHTTPEndpoint \ - --pubUrls $snapshotStoreEndpoint \ - --metrics localhost:1810$i \ - --node auditor$i & -done - -for i in `seq 1 $2`; -do - $QED agent \ - --alertsUrls $alertsStoreEndpoint \ - monitor \ - -k test_key \ - -l info \ - --bind 127.0.0.1:820$i \ - --join $qedGossipEndpoint \ - --qedUrls $qedHTTPEndpoint \ - --metrics localhost:1820$i \ - --node monitor$i & -done - -for i in `seq 1 $3`; -do - $QED agent \ - --alertsUrls $alertsStoreEndpoint \ - publisher \ - -k test_key \ - -l info \ - --bind 127.0.0.1:830$i \ - --join $qedGossipEndpoint \ - --pubUrls $snapshotStoreEndpoint \ - --metrics localhost:1830$i \ - --node publisher$i & -done diff --git a/tests/start_all_xterm b/tests/start_all_xterm new file mode 100755 index 000000000..35377cf64 --- /dev/null +++ b/tests/start_all_xterm @@ -0,0 +1,26 @@ +#!/bin/bash + +# Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +# Starts a local test setup + +export CGO_LDFLAGS_ALLOW='.*' +xterm -hold -e "go run $GOPATH/src/github.com/bbva/qed/testutils/notifierstore.go" & + +xterm -hold -e "bash start_server" & + +sleep 3s + +xterm -hold -e "bash start_agent monitor publisher auditor" & diff --git a/tests/start_server b/tests/start_server index ca180976b..2a44f0275 100755 --- a/tests/start_server +++ b/tests/start_server @@ -1,90 +1,69 @@ -#!/usr/bin/env sh +#!/bin/bash -# Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. +# Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Deployment options +keyFile="/var/tmp/id_ed25519" -# http://www.apache.org/licenses/LICENSE-2.0 +if [ ! -f "$keyFile" ]; then + echo Create id_ed25519 key + echo -e 'y\n' | ssh-keygen -t ed25519 -N '' -f /var/tmp/id_ed25519 +fi -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+CGO_LDFLAGS_ALLOW='.*' +QED="go run $GOPATH/src/github.com/bbva/qed/main.go" -echo "export CLUSTER_SIZE=N [2|4] env variable to execute the benchmark in replica mode. N number of folowers" -echo "export PROFILING=true env variable to enable go profiling" -echo "export TLS=true to allow https" +# Server options +LEADER_CONFIG=() +LEADER_CONFIG+=('--log debug') +LEADER_CONFIG+=('--api-key key') +LEADER_CONFIG+=('--db-path /var/tmp/qed${i}/db') +LEADER_CONFIG+=('--gossip-addr 127.0.0.1:840${i}') +LEADER_CONFIG+=('--http-addr 127.0.0.1:880${i}') +LEADER_CONFIG+=('--metrics-addr 127.0.0.1:860${i}') +LEADER_CONFIG+=('--mgmt-addr 127.0.0.1:870${i}') +LEADER_CONFIG+=('--node-id server${i}') +LEADER_CONFIG+=('--private-key-path /var/tmp/id_ed25519') +LEADER_CONFIG+=('--raft-addr 127.0.0.1:850${i}') +LEADER_CONFIG+=('--raft-path /var/tmp/qed${i}/wal') -tdir=$(mktemp -d /var/tmp/demo.XXX) -certs="/var/tmp/certs" +FOLLOWER_CONFIG=("${LEADER_CONFIG[@]}") +FOLLOWER_CONFIG+=('--raft-addr-join 127.0.0.1:8700') +FOLLOWER_CONFIG+=('--gossip-join-addr 127.0.0.1:8400') -LOCAL_IP=127.0.0.1 -if [ ! -z "$PROFILING" ]; then - echo "PROFILING=enabled" - mkdir -p results - PROFILING=--profiling -else - echo PROFILING=disabled -fi +pids=() -if [ ! -z "$TLS" ]; then - TLS="--certificate ${certs}/server.crt \ --certificate-key ${certs}/server.key" -else - echo "TLS=disabled" +n="$1" +if [ -z $n ]; then + n=0 fi -leader() { - mkdir -p ${tdir}/0/ - - go run ../main.go start \ - -k test_key \ - -p ${tdir}/0/ \ - --node-id leader \ - --http-addr ${LOCAL_IP}:8800 \ - --keypath ${certs}/id_ed25519 \ - --gossip-addr ${LOCAL_IP}:8400 \ - -l error \ - $TLS \ - $PROFILING \ - & -} +mkdir -p /var/tmp/qed-log-0 +leader=$(echo ${LEADER_CONFIG[@]} | i=0 envsubst ) +$QED server start $leader 2>&1 | tee /var/tmp/qed-log-0/server.log & +pids+=($!) 
+sleep 3s -follower() { - mkdir -p ${tdir}/$1 +for id in $(seq 1 1 $n); do + mkdir -p /var/tmp/qed${id} + follower=$(echo ${FOLLOWER_CONFIG[@]} | i=$id envsubst ) + $QED server start $follower 2>&1 | tee /var/tmp/qed-log-${id}/server.log & + pids+=($!) + sleep 3s +done - go run ../main.go start \ - -k test_key \ - --node-id follower_$1 \ - -p ${tdir}/$1 \ - --keypath ${certs}/id_ed25519 \ - -l error \ - --http-addr ${LOCAL_IP}:880$1 \ - --mgmt-addr ${LOCAL_IP}:870$1 \ - --metrics-addr ${LOCAL_IP}:860$1 \ - --raft-addr ${LOCAL_IP}:850$1 \ - --gossip-addr ${LOCAL_IP}:840$1 \ - --join-addr ${LOCAL_IP}:8700 \ - --gossip-join-addr ${LOCAL_IP}:8400 \ - $TLS \ - $PROFILING \ - & -} - -if [ -z "$CLUSTER_SIZE" ] -then - echo '>>>> Starting single node...' - leader -else - echo '>>>> Starting cluster mode...' - leader - sleep 5 - echo '>>>> Starting followers' - for i in $(seq 1 $CLUSTER_SIZE); do - follower ${i} - done -fi -sleep 10 -echo '>>>> done.' +for pid in ${pids[*]}; do + echo waiting for pid $pid + wait $pid +done diff --git a/tests/gossip/test_service.go b/testutils/notifierstore.go similarity index 88% rename from tests/gossip/test_service.go rename to testutils/notifierstore.go index b52e5444f..ce4f17bde 100644 --- a/tests/gossip/test_service.go +++ b/testutils/notifierstore.go @@ -3,7 +3,9 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -13,10 +15,10 @@ package main -import "github.com/bbva/qed/tests/e2e" +import "github.com/bbva/qed/testutils/notifierstore" func main() { - s := e2e.NewService() + s := notifierstore.NewService() foreground := true s.Start(foreground) } diff --git a/tests/e2e/test_service.go b/testutils/notifierstore/api.go similarity index 92% rename from tests/e2e/test_service.go rename to testutils/notifierstore/api.go index 8de2291be..d6984c7eb 100644 --- a/tests/e2e/test_service.go +++ b/testutils/notifierstore/api.go @@ -14,7 +14,7 @@ limitations under the License. */ -package e2e +package notifierstore import ( "context" @@ -156,6 +156,10 @@ func (s *snapStore) Get(version uint64) (*protocol.SignedSnapshot, error) { return &snap, nil } +func (s *snapStore) Count() uint64 { + return uint64(s.data.EntryCount()) +} + type Service struct { snaps *snapStore alerts *alertStore @@ -196,10 +200,12 @@ func (s *Service) Start(foreground bool) { // Snapshot/alert store server. router := http.NewServeMux() router.HandleFunc("/batch", s.postBatchHandler()) + router.HandleFunc("/count", s.getSnapshotCountHandler()) router.HandleFunc("/snapshot", s.getSnapshotHandler()) router.HandleFunc("/alert", s.alertHandler()) - s.httpServer = &http.Server{Addr: ":8888", Handler: router} + s.httpServer = newHttpServer(":8888", router, log.GetLogger()) + fmt.Println("Starting test service...") go func() { @@ -302,6 +308,22 @@ func (s *Service) getSnapshotHandler() func(http.ResponseWriter, *http.Request) } } +func (s *Service) getSnapshotCountHandler() func(http.ResponseWriter, *http.Request) { + return func(w http.ResponseWriter, r *http.Request) { + if r.Method == "GET" { + QedStoreSnapshotsRetrievedTotal.Inc() + count := s.snaps.Count() + + _, err := w.Write([]byte(fmt.Sprintf("%d", count))) + if err != nil { + fmt.Printf("ERROR: %v", err) + } + return + } + http.Error(w, "Invalid request method", http.StatusMethodNotAllowed) + } +} + func (s *Service) alertHandler() func(http.ResponseWriter, 
*http.Request) { return func(w http.ResponseWriter, r *http.Request) { if r.Method == "GET" { diff --git a/testutils/notifierstore/httplog.go b/testutils/notifierstore/httplog.go new file mode 100644 index 000000000..3099b26b8 --- /dev/null +++ b/testutils/notifierstore/httplog.go @@ -0,0 +1,78 @@ +/* + Copyright 2018-2019 Banco Bilbao Vizcaya Argentaria, S.A. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Derived from https://gist.github.com/enricofoltran/10b4a980cd07cb02836f70a4ab3e72d7 +// unlicensed code + +package notifierstore + +import ( + "context" + "fmt" + "log" + "net/http" + "time" +) + +type k int + +const ( + requestIDKey k = 0 +) + +func newHttpServer(listenAddr string, router *http.ServeMux, logger *log.Logger) *http.Server { + nextRequestID := func() string { + return fmt.Sprintf("%d", time.Now().UnixNano()) + } + + return &http.Server{ + Addr: listenAddr, + Handler: tracing(nextRequestID)(logging(logger)(router)), + ErrorLog: logger, + ReadTimeout: 5 * time.Second, + WriteTimeout: 10 * time.Second, + IdleTimeout: 15 * time.Second, + } +} + +func logging(logger *log.Logger) func(http.Handler) http.Handler { + return func(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + defer func() { + requestID, ok := r.Context().Value(requestIDKey).(string) + if !ok { + requestID = "unknown" + } + logger.Println(requestID, r.Method, r.URL.Path, r.RemoteAddr, r.UserAgent()) + }() + next.ServeHTTP(w, r) + }) + }
+} + +func tracing(nextRequestID func() string) func(http.Handler) http.Handler { + return func(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + requestID := r.Header.Get("X-Request-Id") + if requestID == "" { + requestID = nextRequestID() + } + ctx := context.WithValue(r.Context(), requestIDKey, requestID) + w.Header().Set("X-Request-Id", requestID) + next.ServeHTTP(w, r.WithContext(ctx)) + }) + } +}