SEANK.H.LIAO

tracing spans as logs

a common event system?

spans as logs

In observability, (distributed) tracing with its spans covering a range of time, has quite a significant overlap with logging. This is especially true if you buy into wide events where each span or log line carries a large amount of extra information in structured attributes.

All this got me wondering, why should I log everything two or more times? For example, in Go you pass the error to tracing, to logs, and maybe pass it back up the stack (whether you both log and pass back up is debatable):

 1func foo(ctx context.Context) error {
 2        ctx, span := tracer.Start(ctx, "foo")
 3        defer span.End()
 4
 5        err := bar("fizzz")
 6        if err != nil {
 7                span.RecordError(err, trace.WithAttributes(
 8                        attribute.String("arg", "fizzz")
 9                ))
10                span.SetStatus(codes.Error, "doing bar failed")
11                slog.Debug("doing bar failed", "err", err, slog.String("arg", "fizzz"))
12                return fmt.Errorf("doing bar failed: %w", err)
13        }
14
15        return nil
16}

What I also found to be an inconvenience was the differing attribute systems between tracing with OpenTelemetry and logging with log/slog. So it occurred to me that if I pass the same information to both, I could drive the lower fidelty logging from the information I pass to tracing.

spanlog processor

decisions

The first point of decision is do we make a span exporter or processor? It didn't take too much thinking to conclude that a processor was more appropriate since it was somewhat closer to realtime rather than receiving batched events for export. This was something I considered important with logging as you want logs at the point of a crash.

The second point was do we take a slog.Logger or slog.Handler? I started with Logger, but soon realized I needed use the time recorded from events, so switched to Handler.

And finally, what level do we log at? I went with debug for most, info for span ends, and warn/error when a span was marked for error.

problems

I think the output most works, but I did see some issues. First was the (potentially) out of order output of events in parallel: events are only processed on span end, so they may be delayed by quite a bit before they are output. Second was it was unclear how to log singular events, e.g. I like to log the port when an http server starts, but since an http server blocks until the end, where do I put a span. Plus, all the other systems that expect a simple logger, what do I give them (so far, it ends up being we still need a normal logger).

implementation
  1package spanlog
  2
  3import (
  4        "context"
  5        "log/slog"
  6        "path"
  7
  8        "go.opentelemetry.io/otel/attribute"
  9        "go.opentelemetry.io/otel/codes"
 10        "go.opentelemetry.io/otel/sdk/trace"
 11        semconv "go.opentelemetry.io/otel/semconv/v1.24.0"
 12)
 13
 14var _ trace.SpanProcessor = &Processor{}
 15
 16// Processor is a [go.opentelemetry.io/otel/sdk/trace.SpanProcessor]
 17// that converts span start/end and events into log lines.
 18// Span start is [log/slog.LevelDebug].
 19// Span end is [log/slog.LevelInfo] or [log/slog.LevelError] if the span status is
 20// [go.opentelemetry.io/otel/codes.Error].
 21// Events are normally [log/slog.LevelDebug], and [log/slog.LevelWarn] if the span status is
 22// [go.opentelemetry.io/otel/codes.Error],
 23// and [log/slog.LevelError] if created by [go.opentelemetry.io/otel/trace.Span.RecordError].
 24type Processor struct {
 25        // LogID controls whether span_id and trace_id are logged
 26        LogID bool
 27        // Handler is where the output is sent.
 28        // Span metadata such as trace_id, span_id, span type, duration, etc.
 29        // are grouped under "span".
 30        // Span names are used as the message,
 31        // span attributes grouped under the instrumentation scope base name.
 32        Handler slog.Handler
 33}
 34
 35func (p *Processor) appendCommon(attrs []slog.Attr, s trace.ReadOnlySpan) []slog.Attr {
 36        if p.LogID {
 37                attrs = append(attrs,
 38                        slog.String("trace_id", s.SpanContext().TraceID().String()),
 39                        slog.String("span_id", s.SpanContext().SpanID().String()),
 40                )
 41        }
 42        return attrs
 43}
 44
 45func (p *Processor) OnStart(parent context.Context, s trace.ReadWriteSpan) {
 46        ctx := context.Background()
 47        // span start: debug
 48        if p.Handler.Enabled(ctx, slog.LevelDebug) {
 49                attrs := p.appendCommon(make([]slog.Attr, 0, 10), s)
 50                attrs = append(attrs,
 51                        slog.Group("span", slog.String("type", "start")),
 52                        slog.Any(path.Base(s.InstrumentationScope().Name), slog.GroupValue(attr2attr(s.Attributes())...)),
 53                )
 54                rec := slog.NewRecord(s.StartTime(), slog.LevelDebug, s.Name(), 0)
 55                rec.AddAttrs(attrs...)
 56                p.Handler.Handle(ctx, rec)
 57        }
 58}
 59
 60func (p *Processor) OnEnd(s trace.ReadOnlySpan) {
 61        ctx := context.Background()
 62
 63        // events: debug or warn
 64        level := slog.LevelDebug
 65        if s.Status().Code == codes.Error {
 66                level = slog.LevelWarn
 67        }
 68        if p.Handler.Enabled(ctx, level) {
 69                attrs := p.appendCommon(make([]slog.Attr, 0, 10), s)
 70                attrs = append(attrs, slog.Group("span", slog.String("type", "event")))
 71                attrs = p.appendCommon(attrs, s)
 72                h := p.Handler.WithAttrs(attrs).WithGroup(path.Base(s.InstrumentationScope().Name))
 73                for _, event := range s.Events() {
 74                        level := level
 75                        if event.Name == semconv.ExceptionEventName {
 76                                level = slog.LevelError
 77                        }
 78                        rec := slog.NewRecord(event.Time, level, event.Name, 0)
 79
 80                        rec.AddAttrs(attr2attr(event.Attributes)...)
 81                        h.Handle(ctx, rec)
 82                }
 83        }
 84
 85        // span end: info or error
 86        level = slog.LevelInfo
 87        if s.Status().Code == codes.Error {
 88                level = slog.LevelError
 89        }
 90        if p.Handler.Enabled(ctx, level) {
 91                spanAttrs := make([]slog.Attr, 0, 3)
 92                spanAttrs = append(spanAttrs,
 93                        slog.String("type", "end"),
 94                        slog.Duration("duration", s.EndTime().Sub(s.StartTime())),
 95                )
 96                if s.Status().Code != codes.Unset {
 97                        spanAttrs = append(spanAttrs, slog.Group("status",
 98                                slog.String("code", s.Status().Code.String()),
 99                                slog.String("description", s.Status().Description),
100                        ))
101                }
102
103                attrs := p.appendCommon(make([]slog.Attr, 0, 10), s)
104                attrs = append(attrs,
105                        slog.Any("span", slog.GroupValue(spanAttrs...)),
106                        slog.Any(path.Base(s.InstrumentationScope().Name), slog.GroupValue(attr2attr(s.Attributes())...)),
107                )
108                rec := slog.NewRecord(s.EndTime(), level, s.Name(), 0)
109                rec.AddAttrs(attrs...)
110                p.Handler.Handle(ctx, rec)
111        }
112}
113
114func (p *Processor) Shutdown(ctx context.Context) error   { return nil }
115func (p *Processor) ForceFlush(ctx context.Context) error { return nil }
116
117func attr2attr(attrs []attribute.KeyValue) []slog.Attr {
118        out := make([]slog.Attr, 0, len(attrs))
119        for _, attr := range attrs {
120                switch attr.Value.Type() {
121                case attribute.BOOL:
122                        out = append(out, slog.Bool(string(attr.Key), attr.Value.AsBool()))
123                case attribute.INT64:
124                        out = append(out, slog.Int64(string(attr.Key), attr.Value.AsInt64()))
125                case attribute.FLOAT64:
126                        out = append(out, slog.Float64(string(attr.Key), attr.Value.AsFloat64()))
127                case attribute.STRING:
128                        out = append(out, slog.String(string(attr.Key), attr.Value.AsString()))
129                default:
130                        out = append(out, slog.Any(string(attr.Key), attr.Value.AsInterface()))
131                }
132        }
133        return out
134}