python - 使用 PyParsing 解析系统日志

标签 python parsing pyparsing

如何使用 pyparsing 模块解析包含多种日志格式的日志文件?以下是我目前使用的代码。

# -*- coding: utf-8 -*-
"""

"""

import pandas as pd

from pyparsing import Word, alphas, Suppress, Combine, nums, string, Regex

from time import strftime

# Syslog line parser built from pyparsing expressions (version from the question).
# __init__ assembles the grammar; parse() is meant to turn one line into a dict.
class Parser(object):
  def __init__(self):
    # one or more digits
    ints = Word(nums)

    # priority
   # priority = Suppress("<") + ints + Suppress(">")

    # timestamp
    # month: an uppercase letter followed by lowercase letters, exactly 3 characters
    month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3)
    day   = ints
    # Combine joins the pieces into a single "hh:mm:ss" token
    hour  = Combine(ints + ":" + ints + ":" + ints)

    # yields three separate tokens: month, day, hh:mm:ss
    timestamp = month + day + hour

    # hostname
    hostname = Word(alphas + nums + "_" + "-" + ".")

    # appname
    # Two alternatives ('+' binds tighter than '|'):
    #   1. name followed by "[pid]" (brackets suppressed, pid kept as a token);
    #      the ':' after "]" is not consumed here.
    #   2. name followed by ':' — the ':' is kept as a token, so it occupies the
    #      position the pid holds in alternative 1.
    appname = Word(alphas + "/" + "-" + "_" + "." + "(" + ")") + (Suppress("[") + ints + Suppress("]")) | (Word(alphas + "/" + "-" + "_" + ".")  + Word (":")) 

    # message
    # everything left on the line
    message = Regex(".*")

    # pattern build
    # NOTE: the double-underscore attribute is name-mangled to _Parser__pattern
    self.__pattern = timestamp + hostname + appname + message


    # NOTE(review): this def sits at the same indent level as the __init__ body,
    # and its own body below is not indented any further, so Python rejects the
    # block as written; it is presumably meant to be a method of Parser.
    def parse(self, line):
    parsed = self.__pattern.parseString(line)

    payload              = {}
    #payload["priority"]  = parsed[0]
    # NOTE(review): strftime() without a time argument formats the current local
    # time; the parsed month/day/time tokens (indices 0-2) are not used.
    payload["timestamp"] = strftime("%Y-%m-%d %H:%M:%S")
    # positional access: indices 0-2 are the timestamp tokens, 3 is the hostname
    payload["hostname"]  = parsed[3]
    payload["appname"]   = parsed[4]
    # holds the pid for "name[pid]" lines, or the ':' token for "name:" lines
    payload["pid"]       = parsed[5]
    payload["message"]   = parsed[6]


    return payload


def main():
    """Parse every line of ./messages.log and return the list of results.

    Each line of the file is handed to Parser.parse(); the values it returns
    are collected in file order.
    """
    syslog_parser = Parser()

    with open('./messages.log') as syslogFile:
        return [syslog_parser.parse(entry) for entry in syslogFile]


if __name__ == "__main__":
    main()

以下是需要解析的各种日志行示例:

Mar  7 04:02:16 avas clamd[11165]: /var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND 
Mar  7 04:05:55 avas clamd[11240]: /var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND 
Mar  7 09:00:51 avas clamd[27173]: SelfCheck: Database status OK.
Mar  7 05:59:02 avas clamd[27173]: Database correctly reloaded (20400 viruses) 
Mar  7 11:14:35 avas dccd[13284]: 21 requests/sec are too many from anonymous 205.201.1.56,2246
Mar  8 00:22:57 avas dccifd[9933]: write(MTA socket,4): Broken pipe
Mar  7 21:23:22 avas dccifd[6191]: missing message body
Mar  9 16:05:17 avas named[12045]: zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53
Mar 10 00:38:16 avas dccifd[23069]: continue not asking DCC 17 seconds after failure
Mar 10 09:42:11 avas named: client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT
Mar  9 03:48:07 avas dccd[145]: automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err`
Mar  9 11:58:18 avas kernel: i810_audio: Connection 0 with codec id 2
Mar  9 19:41:13 avas dccd[3004]: "packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577
Mar  8 09:01:07 avas sshd(pam_unix)[21839]: session opened for user tom by (uid=35567)
Mar  8 03:52:04 avas dccd[13284]: 1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window
Mar  8 16:05:26 avas arpwatch: listening on eth0
Mar 10 10:00:06 avas named[6986]: zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53
Mar 10 10:00:10 avas named[6986]: client 127.0.0.1#55867: query: mail.canfor.ca IN MX
Mar  8 15:18:40 avas: last message repeated 11 times

请建议我该怎么办?

最佳答案

为了处理这种新的日志行,我使用 pyparsing 的 Optional 类将 appname 部分标记为可选,并把尾随的“:”单独拆分出来。在下面的代码中,我还做了一些其他调整:添加了在解析时进行数据转换的解析动作(parse action),并添加了结果名称(results name),以简化在 parse() 方法中创建结果字典的过程。

from pyparsing import Word, alphas, Suppress, Combine, nums, string, Regex, Optional

from datetime import datetime

class Parser(object):
    """Syslog line parser built from pyparsing expressions.

    The assembled grammar is stored on ``self._pattern``; ``parse()`` runs it
    against a single line and returns the named fields as a dict.
    """

    # The log lines carry no year; without one datetime.strptime falls back
    # to 1900, so a fixed year is prepended before conversion.
    ASSUMED_YEAR = '2016'

    def __init__(self):
        """Assemble the grammar: timestamp, hostname, optional appname[pid], ':' and message."""
        ints = Word(nums)

        # priority (currently disabled)
        # priority = Suppress("<") + ints + Suppress(">")

        def to_datetime(tokens):
            # tokens are month, day and hh:mm:ss; prefix the assumed year so
            # strptime receives a complete date
            stamp = Parser.ASSUMED_YEAR + ' ' + ' '.join(tokens)
            return datetime.strptime(stamp, '%Y %b %d %H:%M:%S')

        # timestamp: three-letter month, day number, hh:mm:ss
        month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3)
        clock = Combine(ints + ":" + ints + ":" + ints)
        timestamp = month + ints + clock
        # converted to a datetime at parse time
        timestamp.setParseAction(to_datetime)

        # hostname
        hostname = Word(alphas + nums + "_-.")

        # appname: either "name[pid]" or a bare "name"
        app_with_pid = Word(alphas + "/-_.()")("appname") + (
            Suppress("[") + ints("pid") + Suppress("]")
        )
        bare_app = Word(alphas + "/-_.")("appname")
        appname = app_with_pid | bare_app
        appname.setName("appname")

        # message: the remainder of the line
        message = Regex(".*")

        # Full line pattern; results names let parse() read fields by key.
        self._pattern = (
            timestamp("timestamp")
            + hostname("hostname")
            + Optional(appname)
            + Suppress(':')
            + message("message")
        )

    def parse(self, line):
        """Parse one log line and return its named fields as a dict.

        'appname' and 'pid' are set to '' when the line does not contain them.
        """
        result = self._pattern.parseString(line)
        # Fill in the optional fields so every returned dict has the same keys.
        for optional_key in ('appname', 'pid'):
            if optional_key not in result:
                result[optional_key] = ''
        return result.asDict()

使用 runTests() 根据特定的测试输入测试解析器:

# Take the assembled grammar from a Parser instance so it can be exercised directly.
pattern = Parser()._pattern

# Sample syslog lines, one test per line. The backslash right after the opening
# quotes keeps the string from starting with an empty line.
tests = """\
Mar  7 04:02:16 avas clamd[11165]: /var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND 
Mar  7 04:05:55 avas clamd[11240]: /var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND 
Mar  7 09:00:51 avas clamd[27173]: SelfCheck: Database status OK.
Mar  7 05:59:02 avas clamd[27173]: Database correctly reloaded (20400 viruses) 
Mar  7 11:14:35 avas dccd[13284]: 21 requests/sec are too many from anonymous 205.201.1.56,2246
Mar  8 00:22:57 avas dccifd[9933]: write(MTA socket,4): Broken pipe
Mar  7 21:23:22 avas dccifd[6191]: missing message body
Mar  9 16:05:17 avas named[12045]: zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53
Mar 10 00:38:16 avas dccifd[23069]: continue not asking DCC 17 seconds after failure
Mar 10 09:42:11 avas named: client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT
Mar  9 03:48:07 avas dccd[145]: automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err`
Mar  9 11:58:18 avas kernel: i810_audio: Connection 0 with codec id 2
Mar  9 19:41:13 avas dccd[3004]: "packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577
Mar  8 09:01:07 avas sshd(pam_unix)[21839]: session opened for user tom by (uid=35567)
Mar  8 03:52:04 avas dccd[13284]: 1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window
Mar  8 16:05:26 avas arpwatch: listening on eth0
Mar 10 10:00:06 avas named[6986]: zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53
Mar 10 10:00:10 avas named[6986]: client 127.0.0.1#55867: query: mail.canfor.ca IN MX
Mar  8 15:18:40 avas: last message repeated 11 times"""

# Run the grammar against each sample line and print the parsed tokens and named results.
pattern.runTests(tests)

输出:

Mar  7 04:02:16 avas clamd[11165]: /var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND
[datetime.datetime(2016, 3, 7, 4, 2, 16), 'avas', 'clamd', '11165', '/var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND']
- appname: 'clamd'
- hostname: 'avas'
- message: '/var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND'
- pid: '11165'
- timestamp: datetime.datetime(2016, 3, 7, 4, 2, 16)


Mar  7 04:05:55 avas clamd[11240]: /var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND
[datetime.datetime(2016, 3, 7, 4, 5, 55), 'avas', 'clamd', '11240', '/var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND']
- appname: 'clamd'
- hostname: 'avas'
- message: '/var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND'
- pid: '11240'
- timestamp: datetime.datetime(2016, 3, 7, 4, 5, 55)


Mar  7 09:00:51 avas clamd[27173]: SelfCheck: Database status OK.
[datetime.datetime(2016, 3, 7, 9, 0, 51), 'avas', 'clamd', '27173', 'SelfCheck: Database status OK.']
- appname: 'clamd'
- hostname: 'avas'
- message: 'SelfCheck: Database status OK.'
- pid: '27173'
- timestamp: datetime.datetime(2016, 3, 7, 9, 0, 51)


Mar  7 05:59:02 avas clamd[27173]: Database correctly reloaded (20400 viruses)
[datetime.datetime(2016, 3, 7, 5, 59, 2), 'avas', 'clamd', '27173', 'Database correctly reloaded (20400 viruses)']
- appname: 'clamd'
- hostname: 'avas'
- message: 'Database correctly reloaded (20400 viruses)'
- pid: '27173'
- timestamp: datetime.datetime(2016, 3, 7, 5, 59, 2)


Mar  7 11:14:35 avas dccd[13284]: 21 requests/sec are too many from anonymous 205.201.1.56,2246
[datetime.datetime(2016, 3, 7, 11, 14, 35), 'avas', 'dccd', '13284', '21 requests/sec are too many from anonymous 205.201.1.56,2246']
- appname: 'dccd'
- hostname: 'avas'
- message: '21 requests/sec are too many from anonymous 205.201.1.56,2246'
- pid: '13284'
- timestamp: datetime.datetime(2016, 3, 7, 11, 14, 35)


Mar  8 00:22:57 avas dccifd[9933]: write(MTA socket,4): Broken pipe
[datetime.datetime(2016, 3, 8, 0, 22, 57), 'avas', 'dccifd', '9933', 'write(MTA socket,4): Broken pipe']
- appname: 'dccifd'
- hostname: 'avas'
- message: 'write(MTA socket,4): Broken pipe'
- pid: '9933'
- timestamp: datetime.datetime(2016, 3, 8, 0, 22, 57)


Mar  7 21:23:22 avas dccifd[6191]: missing message body
[datetime.datetime(2016, 3, 7, 21, 23, 22), 'avas', 'dccifd', '6191', 'missing message body']
- appname: 'dccifd'
- hostname: 'avas'
- message: 'missing message body'
- pid: '6191'
- timestamp: datetime.datetime(2016, 3, 7, 21, 23, 22)


Mar  9 16:05:17 avas named[12045]: zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53
[datetime.datetime(2016, 3, 9, 16, 5, 17), 'avas', 'named', '12045', 'zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53']
- appname: 'named'
- hostname: 'avas'
- message: 'zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53'
- pid: '12045'
- timestamp: datetime.datetime(2016, 3, 9, 16, 5, 17)


Mar 10 00:38:16 avas dccifd[23069]: continue not asking DCC 17 seconds after failure
[datetime.datetime(2016, 3, 10, 0, 38, 16), 'avas', 'dccifd', '23069', 'continue not asking DCC 17 seconds after failure']
- appname: 'dccifd'
- hostname: 'avas'
- message: 'continue not asking DCC 17 seconds after failure'
- pid: '23069'
- timestamp: datetime.datetime(2016, 3, 10, 0, 38, 16)


Mar 10 09:42:11 avas named: client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT
[datetime.datetime(2016, 3, 10, 9, 42, 11), 'avas', 'named', 'client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT']
- appname: 'named'
- hostname: 'avas'
- message: 'client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT'
- timestamp: datetime.datetime(2016, 3, 10, 9, 42, 11)


Mar  9 03:48:07 avas dccd[145]: automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err`
[datetime.datetime(2016, 3, 9, 3, 48, 7), 'avas', 'dccd', '145', 'automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err`']
- appname: 'dccd'
- hostname: 'avas'
- message: 'automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err`'
- pid: '145'
- timestamp: datetime.datetime(2016, 3, 9, 3, 48, 7)


Mar  9 11:58:18 avas kernel: i810_audio: Connection 0 with codec id 2
[datetime.datetime(2016, 3, 9, 11, 58, 18), 'avas', 'kernel', 'i810_audio: Connection 0 with codec id 2']
- appname: 'kernel'
- hostname: 'avas'
- message: 'i810_audio: Connection 0 with codec id 2'
- timestamp: datetime.datetime(2016, 3, 9, 11, 58, 18)


Mar  9 19:41:13 avas dccd[3004]: "packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577
[datetime.datetime(2016, 3, 9, 19, 41, 13), 'avas', 'dccd', '3004', '"packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577']
- appname: 'dccd'
- hostname: 'avas'
- message: '"packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577'
- pid: '3004'
- timestamp: datetime.datetime(2016, 3, 9, 19, 41, 13)


Mar  8 09:01:07 avas sshd(pam_unix)[21839]: session opened for user tom by (uid=35567)
[datetime.datetime(2016, 3, 8, 9, 1, 7), 'avas', 'sshd(pam_unix)', '21839', 'session opened for user tom by (uid=35567)']
- appname: 'sshd(pam_unix)'
- hostname: 'avas'
- message: 'session opened for user tom by (uid=35567)'
- pid: '21839'
- timestamp: datetime.datetime(2016, 3, 8, 9, 1, 7)


Mar  8 03:52:04 avas dccd[13284]: 1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window
[datetime.datetime(2016, 3, 8, 3, 52, 4), 'avas', 'dccd', '13284', '1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window']
- appname: 'dccd'
- hostname: 'avas'
- message: '1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window'
- pid: '13284'
- timestamp: datetime.datetime(2016, 3, 8, 3, 52, 4)


Mar  8 16:05:26 avas arpwatch: listening on eth0
[datetime.datetime(2016, 3, 8, 16, 5, 26), 'avas', 'arpwatch', 'listening on eth0']
- appname: 'arpwatch'
- hostname: 'avas'
- message: 'listening on eth0'
- timestamp: datetime.datetime(2016, 3, 8, 16, 5, 26)


Mar 10 10:00:06 avas named[6986]: zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53
[datetime.datetime(2016, 3, 10, 10, 0, 6), 'avas', 'named', '6986', 'zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53']
- appname: 'named'
- hostname: 'avas'
- message: 'zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53'
- pid: '6986'
- timestamp: datetime.datetime(2016, 3, 10, 10, 0, 6)


Mar 10 10:00:10 avas named[6986]: client 127.0.0.1#55867: query: mail.canfor.ca IN MX
[datetime.datetime(2016, 3, 10, 10, 0, 10), 'avas', 'named', '6986', 'client 127.0.0.1#55867: query: mail.canfor.ca IN MX']
- appname: 'named'
- hostname: 'avas'
- message: 'client 127.0.0.1#55867: query: mail.canfor.ca IN MX'
- pid: '6986'
- timestamp: datetime.datetime(2016, 3, 10, 10, 0, 10)

Mar  8 15:18:40 avas: last message repeated 11 times
[datetime.datetime(2016, 3, 8, 15, 18, 40), 'avas', 'last message repeated 11 times']
- hostname: 'avas'
- message: 'last message repeated 11 times'
- timestamp: datetime.datetime(2016, 3, 8, 15, 18, 40)

或者使用Parser类的parse()方法:

from pprint import pprint

# Build the grammar once and reuse it: constructing a Parser inside the loop
# would rebuild every pyparsing expression for each line.
log_parser = Parser()
for t in tests.splitlines():
    # parse() returns a dict of the named fields for this line
    pprint(log_parser.parse(t))
    print()

给出:

{'appname': 'clamd',
 'hostname': 'avas',
 'message': '/var/amavis/amavis-20040307T033734-10329/parts/part-00003: '
        'Worm.Mydoom.F FOUND ',
 'pid': '11165',
 'timestamp': datetime.datetime(2016, 3, 7, 4, 2, 16)}

{'appname': 'clamd',
 'hostname': 'avas',
 'message': '/var/amavis/amavis-20040307T035901-10615/parts/part-00002: '
        'Worm.SomeFool.Gen-1 FOUND ',
 'pid': '11240',
 'timestamp': datetime.datetime(2016, 3, 7, 4, 5, 55)}

{'appname': 'clamd',
 'hostname': 'avas',
 'message': 'SelfCheck: Database status OK.',
 'pid': '27173',
 'timestamp': datetime.datetime(2016, 3, 7, 9, 0, 51)}

{'appname': 'clamd',
 'hostname': 'avas',
 'message': 'Database correctly reloaded (20400 viruses) ',
 'pid': '27173',
 'timestamp': datetime.datetime(2016, 3, 7, 5, 59, 2)}

{'appname': 'dccd',
 'hostname': 'avas',
 'message': '21 requests/sec are too many from anonymous 205.201.1.56,2246',
 'pid': '13284',
 'timestamp': datetime.datetime(2016, 3, 7, 11, 14, 35)}

{'appname': 'dccifd',
 'hostname': 'avas',
 'message': 'write(MTA socket,4): Broken pipe',
 'pid': '9933',
 'timestamp': datetime.datetime(2016, 3, 8, 0, 22, 57)}

{'appname': 'dccifd',
 'hostname': 'avas',
 'message': 'missing message body',
 'pid': '6191',
 'timestamp': datetime.datetime(2016, 3, 7, 21, 23, 22)}

{'appname': 'named',
 'hostname': 'avas',
 'message': 'zone PLNet/IN: refresh: non-authoritative answer from master '
        '10.0.0.253#53',
 'pid': '12045',
 'timestamp': datetime.datetime(2016, 3, 9, 16, 5, 17)}

{'appname': 'dccifd',
 'hostname': 'avas',
 'message': 'continue not asking DCC 17 seconds after failure',
 'pid': '23069',
 'timestamp': datetime.datetime(2016, 3, 10, 0, 38, 16)}

{'appname': 'named',
 'hostname': 'avas',
 'message': 'client 127.0.0.1#55524: query: '
        '23.68.27.142.sa-trusted.bondedsender.org IN TXT',
 'pid': '',
 'timestamp': datetime.datetime(2016, 3, 10, 9, 42, 11)}

{'appname': 'dccd',
 'hostname': 'avas',
 'message': 'automatic dbclean; starting `dbclean -DPq -i 1189 -L '
        'info,local5.notice -L error,local5.err`',
 'pid': '145',
 'timestamp': datetime.datetime(2016, 3, 9, 3, 48, 7)}

{'appname': 'kernel',
 'hostname': 'avas',
 'message': 'i810_audio: Connection 0 with codec id 2',
 'pid': '',
 'timestamp': datetime.datetime(2016, 3, 9, 11, 58, 18)}

{'appname': 'dccd',
 'hostname': 'avas',
 'message': '"packet length 44 too small for REPORT" sent to client 1 at '
        '194.63.250.215,47577',
 'pid': '3004',
 'timestamp': datetime.datetime(2016, 3, 9, 19, 41, 13)}

{'appname': 'sshd(pam_unix)',
 'hostname': 'avas',
 'message': 'session opened for user tom by (uid=35567)',
 'pid': '21839',
 'timestamp': datetime.datetime(2016, 3, 8, 9, 1, 7)}

{'appname': 'dccd',
 'hostname': 'avas',
 'message': '1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window',
 'pid': '13284',
 'timestamp': datetime.datetime(2016, 3, 8, 3, 52, 4)}

{'appname': 'arpwatch',
 'hostname': 'avas',
 'message': 'listening on eth0',
 'pid': '',
 'timestamp': datetime.datetime(2016, 3, 8, 16, 5, 26)}

{'appname': 'named',
 'hostname': 'avas',
 'message': 'zone PLNet/IN: refresh: non-authoritative answer from master '
        '192.75.26.21#53',
 'pid': '6986',
 'timestamp': datetime.datetime(2016, 3, 10, 10, 0, 6)}

{'appname': 'named',
 'hostname': 'avas',
 'message': 'client 127.0.0.1#55867: query: mail.canfor.ca IN MX',
 'pid': '6986',
 'timestamp': datetime.datetime(2016, 3, 10, 10, 0, 10)}

{'appname': '',
 'hostname': 'avas',
 'message': 'last message repeated 11 times',
 'pid': '',
 'timestamp': datetime.datetime(2016, 3, 8, 15, 18, 40)}

关于python - 使用 PyParsing 解析系统日志,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/41137742/

相关文章:

android - 如何在Android中加载不同国家的特定文件

python - 使用 PyParsing 的两个标记之间的字符串

python - 将一次 "for"循环迭代的结果传递到下一次迭代?

python - plotly :如何设计 plotly 图的样式,以便它不会显示缺失日期的间隙?

python - 将文本文件写入管道

android - 如何在水平 ListView 中获取动态数据

使用解析器组合器解析具有函数应用的表达式语法(左递归)

python - 这种技术对于在 pyparsing 中构造 ParseResults 是否可以接受?

python - pyparsing - 同一条语句的第二次执行抛出异常

python - 使用 describe 和 dtypes 创建自定义函数时出错