Fluentd is a log collector, processor, and aggregator.
Fluent Bit is a log collector and processor (it doesn’t have strong aggregation features like Fluentd).
Installation
Docker
docker pull fluent/fluent-bit:1.3

# run a quick smoke test which makes Fluent Bit measure CPU usage of the container
docker run -ti fluent/fluent-bit:1.3 /fluent-bit/bin/fluent-bit -i cpu -o stdout -f 1
# output
Fluent-Bit v1.3.x
Copyright (C) Treasure Data
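To run Fluent Bit with a custom configuration such as the [SERVICE] example below, the file can be mounted over the image's default one. A minimal sketch; /fluent-bit/etc/fluent-bit.conf is the default config path inside the official image, and fluent-bit.conf is assumed to sit in the current directory:

docker run -ti \
    -v $(pwd)/fluent-bit.conf:/fluent-bit/etc/fluent-bit.conf \
    fluent/fluent-bit:1.3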
[SERVICE]
    # Flush
    # =====
    # Set the interval (in seconds) before flushing records to a destination
    Flush        5

    # Daemon
    # ======
    # Instruct Fluent Bit to run in foreground or background mode.
    Daemon       Off

    # Log_Level
    # =========
    # Set the verbosity level of the service. Values can be:
    #
    # - error
    # - warning
    # - info
    # - debug
    # - trace
    #
    # By default 'info' is set, which also includes 'error' and 'warning'.
    Log_Level    info
The following configuration enables an optional buffering mechanism whose root path for filesystem data is /var/log/flb-storage/. It uses normal synchronization mode, disables checksums, and allows up to a maximum of 5MB of memory when processing backlog data.
[SERVICE]
    Flush                     1
    Log_Level                 info
    storage.path              /var/log/flb-storage/
    storage.sync              normal
    storage.checksum          off
    storage.backlog.mem_limit 5M
# Specify the buffering mechanism to use per input. It can be memory or filesystem.
[INPUT]
    name         cpu
    storage.type filesystem

[INPUT]
    name         mem
    storage.type memory
Metrics
By default, plugins configured at runtime get an internal name in the format plugin_name.ID. For monitoring purposes this can be confusing if many plugins of the same type are configured. To distinguish them, each configured input or output section can be given an Alias, which is then used as the parent name for its metrics.
[SERVICE]
    HTTP_Server On
    HTTP_Listen 0.0.0.0
    HTTP_PORT   2020

[INPUT]
    Name  cpu
    Alias server1_cpu

[OUTPUT]
    Name  stdout
    Alias raw_output
    Match *
With the built-in HTTP server enabled, metrics are exposed in Prometheus format at the /api/v1/metrics/prometheus endpoint.
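With the HTTP server from the example above listening on port 2020, the endpoints can be queried with curl (assuming Fluent Bit runs on the local host):

# generic metrics in JSON format
curl -s http://127.0.0.1:2020/api/v1/metrics

# metrics in Prometheus format, named after the aliases configured above
curl -s http://127.0.0.1:2020/api/v1/metrics/prometheus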
Upstream Servers
Fluent Bit output plugins commonly connect to external services to deliver logs over the network; this is the case for HTTP, Elasticsearch, and Forward, among others. Connecting to a single node (host) is enough for most use cases, but there are scenarios where balancing across different nodes is required. The Upstream feature provides that capability.
An Upstream defines a set of nodes that will be targeted by an output plugin; by the nature of the implementation, an output plugin must explicitly support the Upstream feature. The following plugin(s) have Upstream support: Forward.
The current balancing mode implemented is round-robin.
# The following example defines an Upstream called forward-balancing, meant
# to be used by the Forward output plugin. It registers three Nodes:
[UPSTREAM]
    name forward-balancing

# node-1: connects to 127.0.0.1:43000
[NODE]
    name node-1
    host 127.0.0.1
    port 43000

# node-2: connects to 127.0.0.1:44000
[NODE]
    name node-2
    host 127.0.0.1
    port 44000

# node-3: connects to 127.0.0.1:45000 using TLS without verification. It also
# sets shared_key, a configuration option required by the Forward output.
[NODE]
    name node-3
    host 127.0.0.1
    port 45000
    tls on
    tls.verify off
    shared_key secret
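The output side then points at the Upstream definition instead of a single Host/Port pair. A minimal sketch for the Forward output, assuming the Upstream above is saved as /path/to/upstream.conf:

[OUTPUT]
    Name     forward
    Match    *
    Upstream /path/to/upstream.conf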
Scheduler
Fluent Bit has an Engine that coordinates data ingestion from input plugins and calls the Scheduler to decide when it is time to flush the data through one or multiple output plugins. The Scheduler flushes new data at a fixed interval of seconds and schedules retries when asked.
# The following example configures two outputs: the HTTP plugin has an
# unlimited number of retries, and the Elasticsearch plugin has a limit of 5:
[OUTPUT]
    Name        http
    Host        192.168.5.6
    Port        8080
    Retry_Limit False

[OUTPUT]
    Name            es
    Host            192.168.5.20
    Port            9200
    Logstash_Format On
    Retry_Limit     5
Input Plugins
Tail
The Tail plugin reads every file matched by the Path pattern, and for every new line found (separated by \n) it generates a new record. Optionally, a database file can be used so the plugin keeps a history of tracked files and their offsets; this is very useful for resuming state when the service is restarted.
[INPUT]
    Name tail
    Path /var/log/syslog

[OUTPUT]
    Name  stdout
    Match *
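To enable the offset database mentioned above, so that tailing resumes where it left off after a restart, the input can set the DB option (a sketch; the database path here is just an example):

[INPUT]
    Name tail
    Path /var/log/syslog
    DB   /var/log/flb_syslog.db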
Parser
By default, Fluent Bit provides a set of pre-configured parsers that cover common use cases, such as logs from Apache, Nginx, Docker, Syslog rfc5424, and Syslog rfc3164.
All parsers must be defined in a parsers.conf file, not in the Fluent Bit global configuration file. The parsers file exposes all available parsers that can be used by the input plugins that are aware of this feature. A parsers file can have multiple entries, like this:
[PARSER]
    Name        docker
    Format      json
    Time_Key    time
    Time_Format %Y-%m-%dT%H:%M:%S.%L
    Time_Keep   On

[PARSER]
    Name        syslog-rfc5424
    Format      regex
    Regex       ^\<(?<pri>[0-9]{1,5})\>1 (?<time>[^ ]+) (?<host>[^ ]+) (?<ident>[^ ]+) (?<pid>[-0-9]+) (?<msgid>[^ ]+) (?<extradata>(\[(.*)\]|-)) (?<message>.+)$
    Time_Key    time
    Time_Format %Y-%m-%dT%H:%M:%S.%L
    Time_Keep   On
    Types       pid:integer
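To put these definitions to work, the main configuration loads the parsers file via the [SERVICE] section, and an input references a parser by name. A sketch, assuming the entries above are saved as parsers.conf and Docker writes JSON logs under the path shown:

[SERVICE]
    Parsers_File parsers.conf

[INPUT]
    Name   tail
    Path   /var/lib/docker/containers/*/*.log
    Parser docker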
# A modify-style filter that drops the Mem.* and Swap.* keys and adds a
# static key. The [FILTER] header was missing from the original fragment;
# Name and Match are reconstructed here:
[FILTER]
    Name            modify
    Match           *
    Remove_wildcard Mem
    Remove_wildcard Swap
    Add             cpustats_more STILL_UNKNOWN

[OUTPUT]
    Name  stdout
    Match *
Routing
There are two important concepts in Routing: Tag and Match.
When data is generated by the input plugins, it comes with a Tag (most of the time the Tag is configured manually). The Tag is a human-readable indicator that helps to identify the data source.
In order to define where the data should be routed, a Match rule must be specified in the output configuration.
# Consider the following configuration example that aims to deliver CPU
# metrics to an Elasticsearch database and Memory metrics to the standard
# output interface:
[INPUT]
    Name cpu
    Tag  my_cpu

[INPUT]
    Name mem
    Tag  my_mem

[OUTPUT]
    Name  es
    Match my_cpu

[OUTPUT]
    Name  stdout
    Match my_mem
Routing works automatically by reading the input Tags and the output Match rules. If some data has a Tag that does not match any rule at routing time, the data is discarded.
Routing with Wildcards: Routing is flexible enough to support wildcards in the Match pattern. The example below defines a common destination for both sources of data:
# The Match rule is set to my_*, which matches any Tag that starts with my_.
[INPUT]
    Name cpu
    Tag  my_cpu

[INPUT]
    Name mem
    Tag  my_mem

[OUTPUT]
    Name  stdout
    Match my_*
Output Plugins
Elasticsearch
[INPUT]
    Name cpu
    Tag  cpu

[OUTPUT]
    Name  es
    Match *
    Host  192.168.2.3
    Port  9200
    Index my_index
    Type  my_type
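Once records are flowing, delivery can be verified by querying Elasticsearch directly, using the host, port, and index configured above (a sketch; it assumes the index has already received data):

curl -s 'http://192.168.2.3:9200/my_index/_search?pretty'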