Skip to content

Commit

Permalink
Implement PoC pipeline for custom LogQL log source
Browse files Browse the repository at this point in the history
Extract re-used code out of the backend, and use a modifier-like approach
to control how the simplified "detection" logic works. Currently
supports string values, ORs as regular expressions, and basic fieldref
functionality.
  • Loading branch information
kelnage committed Jul 3, 2024
1 parent 0309734 commit 0109308
Show file tree
Hide file tree
Showing 6 changed files with 231 additions and 113 deletions.
84 changes: 10 additions & 74 deletions sigma/backends/loki/loki.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@
from warnings import warn
from yaml import dump

from sigma.shared import sanitize_label_key, quote_string_value, join_or_values_re

Conditions = Union[
ConditionItem,
ConditionNOT,
Expand Down Expand Up @@ -150,38 +152,7 @@ def negate(self) -> DeferredQueryExpression:
return self

def finalize_expression(self) -> str:
# This makes the regex case insensitive if any values are SigmaStrings
# or if any of the regexes are case insensitive
# TODO: can we make this more precise?
case_insensitive = any(
(
isinstance(val, SigmaString)
and self.case_insensitive
and not isinstance(val, SigmaCasedString)
)
or (
isinstance(val, SigmaRegularExpression)
and val.regexp.startswith("(?i)")
)
for val in self.exprs
)
or_value = "|".join(
(
(
re.escape(str(val))
if isinstance(val, SigmaString)
else re.sub("^\\(\\?i\\)", "", val.regexp)
)
for val in self.exprs
)
)
if case_insensitive:
or_value = "(?i)" + or_value
if "`" in or_value:
or_value = '"' + SigmaRegularExpression(or_value).escape(('"',)) + '"'
else:
or_value = "`" + or_value + "`"
return f"{self.op} {or_value}"
return f"{self.op} {join_or_values_re(self.exprs, self.case_insensitive)}"


@dataclass
Expand Down Expand Up @@ -440,32 +411,6 @@ def select_log_stream(self, rule: SigmaRule) -> str:
# By default, bring back all log streams
return '{job=~".+"}'

def sanitize_label_key(self, key: str, isprefix: bool = True) -> str:
"""Implements the logic used by Loki to sanitize labels.
See: https://github.com/grafana/loki/blob/main/pkg/logql/log/util.go#L21"""
# pySigma treats null or empty fields as unbound expressions, rather than keys
if key is None or len(key) == 0: # pragma: no cover
return ""
key = key.strip()
if len(key) == 0:
return key
if isprefix and key[0] >= "0" and key[0] <= "9":
key = "_" + key
return "".join(
(
(
r
if (r >= "a" and r <= "z")
or (r >= "A" and r <= "Z")
or r == "_"
or (r >= "0" and r <= "9")
else "_"
)
for r in key
)
)

def partition_rule(
self, condition: Conditions, partitions: int
) -> List[Conditions]:
Expand Down Expand Up @@ -552,8 +497,10 @@ def convert_field_expression_to_line_filter(
# Could include field name if entries are logfmt and doesn't start with wildcard
regexp = expr.value.regexp
anchors = LogQLBackend.anchor_replace_pattern.match(expr.value.regexp)
if anchors and anchors.group("body") and (
anchors.group("start") or anchors.group("end")
if (
anchors
and anchors.group("body")
and (anchors.group("start") or anchors.group("end"))
):
regexp = (
anchors.group("ext") if anchors.group("ext") else ""
Expand Down Expand Up @@ -1109,24 +1056,13 @@ def convert_condition_field_eq_expansion(
# Loki has strict rules about field (label) names, so use their rules
def escape_and_quote_field(self, field_name: str) -> str:
"""Use Loki's sanitize function to ensure the field name is appropriately escaped."""
return self.sanitize_label_key(field_name)
return sanitize_label_key(field_name)

# If a string doesn't contain a backtick character, it is easier to use backticks to
# quote it; otherwise we default to using a double quote character, and escape the
# string appropriately
def convert_value_str(self, s: SigmaString, state: ConversionState) -> str:
"""By default, use the tilde character to quote fields, which needs limited escaping.
If the value contains a tilde character, use double quotes and apply more rigourous
escaping."""
quote = "`"
if any([c == quote for c in str(s)]):
quote = '"'
# If our string doesn't contain any tilde characters
if quote == "`":
converted = s.convert()
else:
converted = s.convert(escape_char="\\", add_escaped='"\\')
return quote + converted + quote
return quote_string_value(s)

# Swapping the meaning of "deferred" expressions so they appear at the start of a query,
# rather than the end (since this is the recommended approach for LogQL), and add in log
Expand Down Expand Up @@ -1192,7 +1128,7 @@ def finalize_query(
state.deferred.clear()
if rule.fields and len(rule.fields) > 0:
line_fmt_fields = " ".join(
"{{." + self.sanitize_label_key(field) + "}}" for field in rule.fields
"{{." + sanitize_label_key(field) + "}}" for field in rule.fields
)
query = query + f' | line_format "{line_fmt_fields}"'
# Select an appropriate source based on the logsource
Expand Down
2 changes: 2 additions & 0 deletions sigma/pipelines/loki/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .loki import (
LokiCustomAttributes,
SetCustomAttributeTransformation,
CustomLogSourceTransformation,
loki_grafana_logfmt,
loki_promtail_sysmon,
loki_okta_system_log,
Expand All @@ -9,6 +10,7 @@
__all__ = (
"LokiCustomAttributes",
"SetCustomAttributeTransformation",
"CustomLogSourceTransformation",
"loki_grafana_logfmt",
"loki_promtail_sysmon",
"loki_okta_system_log",
Expand Down
87 changes: 86 additions & 1 deletion sigma/pipelines/loki/loki.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
from dataclasses import dataclass
from enum import Enum
from typing import Any, Union
from typing import Any, Dict, List, Union

from sigma.types import SigmaString
from sigma.rule import SigmaRule
from sigma.correlations import SigmaCorrelationRule
from sigma.exceptions import (
SigmaConfigurationError,
SigmaFeatureNotSupportedByBackendError,
)
from sigma.processing.conditions import LogsourceCondition
from sigma.processing.pipeline import ProcessingItem, ProcessingPipeline
from sigma.processing.transformations import (
Expand All @@ -12,6 +17,7 @@
AddFieldnamePrefixTransformation,
FieldMappingTransformation,
)
from sigma.shared import sanitize_label_key, quote_string_value, join_or_values_re


class LokiCustomAttributes(Enum):
Expand All @@ -36,9 +42,88 @@ def apply(
rule.custom_attributes[self.attribute] = self.value


def format_log_source_selector(field: str, value: Union[str, List[str]]) -> str:
    """Build one LogQL stream-selector matcher from a label name and its value(s).

    A single string yields an equality matcher (``label=<quoted value>``). A list of
    strings yields a regular-expression matcher (``label=~<alternation>``), built with
    case-insensitivity switched off, i.e. label values are assumed to be
    case-sensitive. The label name is passed through ``sanitize_label_key`` in both
    cases.

    Raises:
        SigmaConfigurationError: if ``value`` is neither a string nor a list.
    """
    # TODO: replace log source placeholders
    if isinstance(value, list):
        alternation = join_or_values_re([SigmaString(item) for item in value], False)
        return sanitize_label_key(field) + "=~" + alternation
    if isinstance(value, str):
        # TODO: support regular expressions?
        sigma_value = SigmaString(value)
        return sanitize_label_key(field) + "=" + quote_string_value(sigma_value)
    raise SigmaConfigurationError(
        f"unable to format selector {value} for field {field}"
    )


@dataclass
class CustomLogSourceTransformation(Transformation):
    """Allow the definition of a log source selector using YAML structured data, including
    referencing log source and/or detection fields from the rule.

    Keys of ``selection`` are label names. A key ending in ``|fieldref`` does not carry
    a literal value: its value names a detection field of the rule, and the values
    found for that field in the rule's detections are used for the label instead.
    """

    # Label name (optionally suffixed with "|fieldref") -> literal value(s), or, for
    # fieldref entries, the name of the detection field to read the value(s) from.
    selection: Dict[str, Union[str, List[str]]]

    def apply(
        self, pipeline: ProcessingPipeline, rule: Union[SigmaRule, SigmaCorrelationRule]
    ):
        """Compute the stream selector and store it as a custom attribute on the rule.

        Raises:
            SigmaConfigurationError: if a ``|fieldref`` entry is given a list.
            SigmaFeatureNotSupportedByBackendError: if ``rule`` is not a ``SigmaRule``.
        """
        super().apply(pipeline, rule)
        if not isinstance(rule, SigmaRule):
            raise SigmaFeatureNotSupportedByBackendError(
                "custom log source transforms are not supported for Correlation rules"
            )

        selectors: List[str] = []
        refs: Dict[str, str] = {}
        for field, value in self.selection.items():
            if not field.endswith("|fieldref"):
                selectors.append(format_log_source_selector(field, value))
                continue
            if isinstance(value, list):
                raise SigmaConfigurationError(
                    f"fieldref custom log source transformation {field} "
                    "can only refer to a single field"
                )
            refs[field.removesuffix("|fieldref")] = value

        if refs:
            # Only dict-shaped detections map field names to values; other shapes
            # returned by to_plain() cannot be looked up by field name and are skipped.
            mappings = [
                plain
                for plain in (
                    detection.to_plain()
                    for detection in rule.detection.detections.values()
                )
                if isinstance(plain, dict)
            ]
            for label, field_name in refs.items():
                values: List[str] = []
                for mapping in mappings:
                    found = mapping.get(field_name)
                    # NOTE(review): to_plain() appears to return a list when a field
                    # is given several values in the rule -- confirm against pySigma.
                    # Flatten it so each value becomes its own alternative rather than
                    # stringifying the whole list into one bogus label value.
                    candidates = found if isinstance(found, list) else [found]
                    values.extend(str(v) for v in candidates if v is not None)
                if len(values) == 1:
                    selectors.append(format_log_source_selector(label, values[0]))
                elif values:
                    selectors.append(format_log_source_selector(label, values))

        rule.custom_attributes[LokiCustomAttributes.LOGSOURCE_SELECTION.value] = (
            "{" + ",".join(selectors) + "}"
        )


# Update pySigma transformations to include the above
# mypy type: ignore required due to incorrect type annotation on the transformations dict
transformations["set_custom_attribute"] = SetCustomAttributeTransformation # type: ignore
transformations["set_custom_log_source"] = CustomLogSourceTransformation # type: ignore


def loki_grafana_logfmt() -> ProcessingPipeline:
Expand Down
79 changes: 79 additions & 0 deletions sigma/shared.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import re
from typing import List, Union
from sigma.types import SigmaCasedString, SigmaString, SigmaRegularExpression


def sanitize_label_key(key: str, isprefix: bool = True) -> str:
    """Sanitize a label name following the logic used by Loki.
    See: https://github.com/grafana/loki/blob/main/pkg/logql/log/util.go#L21

    Surrounding whitespace is stripped, every character outside ``[A-Za-z0-9_]`` is
    replaced by an underscore, and, when ``isprefix`` is true, an underscore is
    prepended to a key starting with an ASCII digit. ``None`` and the empty string
    both give an empty string.
    """
    # pySigma treats null or empty fields as unbound expressions, rather than keys
    if key is None or len(key) == 0:  # pragma: no cover
        return ""
    trimmed = key.strip()
    if not trimmed:
        return trimmed
    if isprefix and "0" <= trimmed[0] <= "9":
        trimmed = "_" + trimmed
    sanitized = []
    for ch in trimmed:
        allowed = (
            "a" <= ch <= "z" or "A" <= ch <= "Z" or ch == "_" or "0" <= ch <= "9"
        )
        sanitized.append(ch if allowed else "_")
    return "".join(sanitized)


def quote_string_value(s: SigmaString) -> str:
    """Quote a Sigma string value for use in a LogQL query.

    Backticks are the preferred delimiter: the value is converted with the default
    settings of ``SigmaString.convert`` and wrapped in backticks. If the value itself
    contains a backtick, double quotes are used instead, with double quotes and
    backslashes in the value escaped by a backslash.
    """
    if "`" in str(s):
        # A backtick inside the value rules out backtick delimiters
        return '"' + s.convert(escape_char="\\", add_escaped='"\\') + '"'
    return "`" + s.convert() + "`"


def join_or_values_re(
    exprs: List[Union[SigmaString, SigmaRegularExpression]], case_insensitive: bool
) -> str:
    """Combine several values into one quoted regular expression alternation.

    Sigma strings are regex-escaped; regular expressions contribute their pattern
    with any leading ``(?i)`` flag removed. The alternatives are joined with ``|``.
    The whole expression is prefixed with ``(?i)`` if ``case_insensitive`` is set and
    at least one value is a non-cased Sigma string, or if any regular expression
    started with ``(?i)``. The result is wrapped in backticks, or in double quotes
    (with double quotes escaped) when the expression contains a backtick.
    """
    # This makes the regex case insensitive if any values are SigmaStrings
    # or if any of the regexes are case insensitive
    # TODO: can we make this more precise?
    ignore_case = False
    for item in exprs:
        if isinstance(item, SigmaString):
            if case_insensitive and not isinstance(item, SigmaCasedString):
                ignore_case = True
                break
        elif isinstance(item, SigmaRegularExpression) and item.regexp.startswith(
            "(?i)"
        ):
            ignore_case = True
            break

    alternatives = []
    for item in exprs:
        if isinstance(item, SigmaString):
            alternatives.append(re.escape(str(item)))
        else:
            # The flag is re-added once for the whole expression below
            alternatives.append(re.sub("^\\(\\?i\\)", "", item.regexp))

    pattern = "|".join(alternatives)
    if ignore_case:
        pattern = "(?i)" + pattern
    if "`" not in pattern:
        return "`" + pattern + "`"
    return '"' + SigmaRegularExpression(pattern).escape(('"',)) + '"'
4 changes: 2 additions & 2 deletions tests/test_backend_loki_fieldref.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,8 @@ def test_loki_field_ref_json_multi_selection(loki_backend: LogQLBackend):
)
== [
'{job=~"eventlog|winlog|windows|fluentbit.*"} | json | field2=~`(?i)^Something$`'
'| label_format match_0=`{{ if eq .field1 .fieldA }}true{{ else }}false{{ end }}` '
'| match_0=`true`'
"| label_format match_0=`{{ if eq .field1 .fieldA }}true{{ else }}false{{ end }}` "
"| match_0=`true`"
]
)

Expand Down
Loading

0 comments on commit 0109308

Please sign in to comment.