从syslog日志文件中快速提取时间范围?

我有一个标准的syslog格式的日志文件。 它看起来像这样,除了每秒数百行:

Jan 11 07:48:46 blahblahblah... Jan 11 07:49:00 blahblahblah... Jan 11 07:50:13 blahblahblah... Jan 11 07:51:22 blahblahblah... Jan 11 07:58:04 blahblahblah... 

它不会在午夜轮转(roll over),但其中的内容永远不会超过两天。

我经常不得不从这个文件中提取一个时间片。 我想写一个通用脚本,我可以这样调用:

 $ timegrep 22:30-02:00 /logs/something.log 

即提取从22:30开始、跨过午夜边界、直到第二天凌晨2点的内容。

有几个警告:

  • 我不想在命令行上费事输入日期,只输入时间。 程序应该足够聪明,能自己推断出日期。
  • 日志格式不包括年份,所以它应该根据当年猜测,但在元旦周围做正确的事情。
  • 我希望它快 – 它应该利用各行按时间顺序排列这一事实,在文件中定位(seek)并使用二分查找。

在我花一大笔时间写这个之前,它已经存在了吗?

更新:我已经用更新后的版本替换了原有的代码,并做了许多改进。 我们称之为(实际?)α质量。

这个版本包括:

  • 命令行选项处理
  • 命令行时间格式验证
  • 一些try块
  • 读取行的逻辑移到了一个函数中

原文:

好吧,你猜怎么着? “寻找,就必寻见”! 这是一个Python程序,它在文件中定位(seek)并使用或多或少算是二分查找的方法。 这比其他人写的AWK脚本快得多。

(预?)α质量。 它还需要try块、输入验证和大量的测试,而且无疑可以写得更Pythonic。 但这里先供您消遣。 哦,它是为Python 2.6编写的。

新代码:

 #!/usr/bin/env python3
# -*- coding: utf-8 -*-

# timegrep.py by Dennis Williamson 20100113
# in response to http://serverfault.com/questions/101744/fast-extraction-of-a-time-range-from-syslog-logfile

# thanks to serverfault user http://serverfault.com/users/1545/mike
# for the inspiration

# Perform a binary search through a log file to find a range of times
# and print the corresponding lines

# originally tested with Python 2.6; this revision targets Python 3

# TODO: Make sure that it works if the seek falls in the middle of
#       the last line (the sync read then hits EOF and the script exits)
# TODO: Make sure it's not blind to a line where the sync read falls
#       exactly at the beginning of the line being searched for and
#       then gets skipped by the second read
# TODO: accept arbitrary date
# NOTE(review): timestamps are compared as "Mon DD HH:MM:SS" strings, so
#       the ordering is wrong when yesterday and today are in different
#       months (e.g. "Jan 31" sorts after "Feb 01"). Not fixed here.
# done: add -l long and -s short options
# done: test time format

import os
import re
import sys
from datetime import date, datetime
from optparse import OptionParser
from stat import ST_SIZE

version = "0.01a"

# Module-level state used by getdata(); main() fills in ``handle``.
handle = None
bufsize = 4096  # handle long lines, but put a limit on them

# hh:mm or hh:mm:ss, 24-hour clock.
TIME_SPEC = re.compile(r'^(?:2[0-3]|[01][0-9]):[0-5][0-9](?::[0-5][0-9])?$')


def fail(message):
    """Write *message* to stderr and terminate with exit status 1."""
    sys.stderr.write(message + "\n")
    sys.exit(1)


def extract_timestamp(line):
    """Return the leading 'Mon DD HH:MM:SS' stamp of *line*, or ''.

    The day is zero-padded so that syslog's space-padded form ('Jan  1')
    compares equal to what strftime('%b %d') produces ('Jan 01').
    The last colon and seconds are optional and not handled specially.
    """
    words = line.split(None, 3)
    if len(words) < 3:
        return ''
    return words[0] + " " + words[1].zfill(2) + " " + words[2]


def read_record(stream, limit):
    """Read one line from *stream* and return ``(timestamp, line)``.

    *stream* may be opened in binary or text mode; bytes are decoded as
    UTF-8 with undecodable bytes replaced.

    Raises:
        IOError: the underlying read failed.
        EOFError: nothing is left to read.
        ValueError: the line has no trailing newline, either because it is
            longer than *limit* or because the file ends without one.
    """
    try:
        raw = stream.readline(limit)
    except (AttributeError, OSError, ValueError) as err:
        raise IOError("File I/O Error") from err
    if not raw:
        raise EOFError("EOF reached")
    # Measure before decoding: the limit applies to what readline() returned.
    hit_limit = len(raw) >= limit
    if isinstance(raw, bytes):
        raw = raw.decode("utf-8", "replace")
    if not raw.endswith("\n"):
        if hit_limit:
            raise ValueError("Line length exceeds buffer size")
        raise ValueError("Missing newline")
    line = raw.rstrip("\n")
    return (extract_timestamp(line), line)


def read_or_exit(stream, limit):
    """Like read_record(), but report any failure and exit with status 1."""
    try:
        return read_record(stream, limit)
    except (EOFError, ValueError, OSError) as err:
        fail(str(err))


# Function to read lines from file and extract the date and time
def getdata():
    """Read a line from the module-level ``handle``.

    Return a tuple containing:
        the date/time in a format such as 'Jan 15 20:14:01'
        the line itself

    Any read problem (including end of file) is reported and the
    process exits with status 1.
    """
    return read_or_exit(handle, bufsize)


def resolve_range(start, end, now, yesterday, today,
                  longout=False, shortout=False):
    """Attach 'yesterday' or 'today' to the start and end times.

    *start*, *end* and *now* are 'hh:mm[:ss]' strings; *yesterday* and
    *today* are the date prefixes to use.  Returns
    ``(searchstart, searchend)`` as 'date time' strings.
    """
    if start > now or start > end or longout or shortout:
        startday = yesterday
    else:
        startday = today
    if (end > start > now and not longout) or shortout:
        endday = yesterday
    else:
        endday = today
    return (startday + " " + start, endday + " " + end)


def extract_range(stream, size, searchstart, searchend, debug=0):
    """Print every line of *stream* stamped within [searchstart, searchend].

    *stream* must be seekable and *size* is its length in bytes.  Lines
    are assumed to be in ascending timestamp order.
    """
    rewind = 100  # arbitrary, the optimal value is highly dependent on the structure of the file
    limit = 75    # arbitrary, allow for a VERY large file, but stop it if it runs away
    count = 0
    beginrange = 0
    midrange = size // 2  # integer division: seek() rejects floats
    oldmidrange = midrange
    endrange = size
    linedate = ''
    line = ''
    pos1 = pos2 = 0

    # Seek using binary search
    while pos1 != endrange and oldmidrange != 0 and linedate != searchstart:
        stream.seek(midrange)
        linedate, line = read_or_exit(stream, bufsize)  # sync to line ending
        pos1 = stream.tell()
        if midrange > 0:  # if not BOF, discard first read
            if debug > 1:
                print("...partial: (len: {0}) '{1}'".format((len(line)), line))
            linedate, line = read_or_exit(stream, bufsize)
        pos2 = stream.tell()
        count += 1
        if debug > 0:
            print("#{0} Beg: {1} Mid: {2} End: {3} P1: {4} P2: {5} Timestamp: '{6}'".format(
                count, beginrange, midrange, endrange, pos1, pos2, linedate))
        if searchstart > linedate:
            beginrange = midrange
        else:
            endrange = midrange
        oldmidrange = midrange
        midrange = (beginrange + endrange) // 2
        if count > limit:
            fail("ERROR: ITERATION LIMIT EXCEEDED")

    if debug > 0:
        print("...stopping: '{0}'".format(line))

    # Rewind a bit to make sure we didn't miss any
    seek = oldmidrange
    while linedate >= searchstart and seek > 0:
        if seek < rewind:
            seek = 0
        else:
            seek = seek - rewind
        if debug > 0:
            print("...rewinding")
        stream.seek(seek)
        if seek > 0:
            # Mid-file we may have landed inside a line: sync to its end.
            # At offset 0 the first read is already a whole line, so it
            # must not be thrown away.
            linedate, line = read_or_exit(stream, bufsize)
            if debug > 1:
                print("...junk: '{0}'".format(line))
        linedate, line = read_or_exit(stream, bufsize)
        if debug > 0:
            print("...comparing: '{0}'".format(linedate))

    # Scan forward
    while linedate < searchstart:
        if debug > 0:
            print("...skipping: '{0}'".format(linedate))
        linedate, line = read_or_exit(stream, bufsize)

    if debug > 0:
        print("...found: '{0}'".format(line))
        print("Beg: {0} Mid: {1} End: {2} P1: {3} P2: {4} Timestamp: '{5}'".format(
            beginrange, midrange, endrange, pos1, pos2, linedate))

    # Now that the preliminaries are out of the way, we just loop,
    # reading lines and printing them until they are
    # beyond the end of the range we want
    while linedate <= searchend:
        print(line)
        try:
            linedate, line = read_record(stream, bufsize)
        except EOFError:
            # The requested range runs to the end of the file: that is a
            # normal way to finish, not an error.
            break
        except (ValueError, OSError) as err:
            fail(str(err))


def main(argv=None):
    """Parse the command line and print the requested time slice."""
    global handle

    # Set up option handling
    parser = OptionParser(version="%prog " + version)
    parser.usage = ("\n\t%prog [options] start-time end-time filename\n\n"
                    "\twhere times are in the form hh:mm[:ss]")
    parser.description = ("Search a log file for a range of times occurring yesterday "
                          "and/or today using the current time to intelligently select the start and end. "
                          "A date may be specified instead. Seconds are optional in time arguments.")
    parser.add_option("-d", "--date", action="store", dest="date", default="",
                      help="NOT YET IMPLEMENTED. Use the supplied date instead of today.")
    parser.add_option("-l", "--long", action="store_true", dest="longout", default=False,
                      help="Span the longest possible time range.")
    parser.add_option("-s", "--short", action="store_true", dest="shortout", default=False,
                      help="Span the shortest possible time range.")
    parser.add_option("-D", "--debug", action="store", dest="debug", default=0, type="int",
                      help="Output debugging information.\t\t\t\t\tNone (default) = %default, Some = 1, More = 2")

    (options, args) = parser.parse_args(argv)

    if not 0 <= options.debug <= 2:
        parser.error("debug level out of range")
    debug = options.debug  # 1 = print some debug output, 2 = print a little more, 0 = none

    if options.longout and options.shortout:
        parser.error("options -l and -s are mutually exclusive")
    if options.date:
        parser.error("date option not yet implemented")
    if len(args) != 3:
        parser.error("invalid number of arguments")

    start, end, path = args

    # test for times to be properly formatted, allow hh:mm or hh:mm:ss
    if not TIME_SPEC.match(start) or not TIME_SPEC.match(end):
        fail("Invalid time specification")

    # Determine Time Range.  Take a single clock reading so that "today",
    # "yesterday" and "now" cannot disagree if midnight passes mid-run.
    right_now = datetime.now()
    today = right_now.strftime("%b %d")
    yesterday = date.fromordinal(right_now.toordinal() - 1).strftime("%b %d")
    now = right_now.strftime("%H:%M")  # portable spelling of %R
    searchstart, searchend = resolve_range(
        start, end, now, yesterday, today, options.longout, options.shortout)

    try:
        # Binary mode: seek()/tell() then work in plain byte offsets.
        handle = open(path, 'rb')
    except (IOError, OSError):
        fail("File Open Error")

    try:
        size = os.stat(path)[ST_SIZE]
        if debug > 0:
            print("File: '{0}' Size: {1} Today: '{2}' Now: {3} Start: '{4}' End: '{5}'".format(
                path, size, today, now, searchstart, searchend))
        extract_range(handle, size, searchstart, searchend, debug)
        if debug > 0:
            print("Start: '{0}' End: '{1}'".format(searchstart, searchend))
    finally:
        handle.close()


if __name__ == "__main__":
    main()

在网上快速搜索了一下,有一些工具是基于关键字的(如FIRE之类 :),但没有找到能从文件中提取日期范围的东西。

实现你所建议的功能似乎并不困难:

  1. search开始时间。
  2. 打印出该行。
  3. 如果结束时间<开始时间,行的date>结束和<开始,然后停止。
  4. 如果结束时间>开始时间,并且行的date>结束,请停止。

看起来很简单,如果你不介意Ruby,我可以为你写。

这将根据它们与当前时间(“现在”)的关系打印开始时间和结束时间之间的条目范围。

用法:

 timegrep [-l] start end filename 

例:

 $ timegrep 18:47 03:22 /some/log/file 

-l (长)选项会导致最长的输出。 如果开始时间的小时和分钟值小于结束时间和现在时间,则开始时间将被解释为昨天。 如果开始时间和结束时间HH:MM值都大于“now”,则结束时间将被解释为今天。

假设“现在”是“1月11日19:00”,这是如何解释各种示例的开始和结束时间(除了注释之外没有-l ):

开始 结束 范围开始 范围结束
 19:01 23:59 1月10日1月10日
 19:01 00:00 1月10日1月11日
 00:00 18:59 1月11日 1月11日
 18:59 18:58 1月10日1月10日
 19:01 23:59 1月10日1月11日#-l
 00:00 18:59 1月10日1月11日#-l
 18:59 19:01 1月10日1月11日#-l

脚本几乎全部是准备工作。 最后两行完成所有实际的工作。

警告:没有做任何参数验证或错误检查。 边界情况还没有经过彻底的测试。 这是用gawk编写的,其他版本的AWK可能会出问题。

 #!/usr/bin/awk -f
# timegrep (gawk): print the syslog lines whose timestamp lies between a
# start and an end time, choosing yesterday or today for each bound from
# the current time.
#
# Usage: timegrep [-l] start end filename
#   start, end   times as HH:MM
#   -l           span the longest possible range
#
# Relies on gawk's strftime() and mktime(); no argument validation is done.
# NOTE(review): bounds and line stamps are compared as "Mon DD HH:MM"
# strings, so ordering is wrong when yesterday and today fall in
# different months.

# Return the current record's "Mon DD HH:MM:SS" stamp with the day
# zero-padded, so syslog's "Jan  1" matches strftime("%b %d") -> "Jan 01".
function stamp() {
    return sprintf("%s %02d %s", $1, $2, $3)
}

BEGIN {
    arg = 1
    if (ARGV[arg] == "-l") {
        long = 1
        ARGV[arg++] = ""    # blank the slot so awk does not treat it as a file
    }
    start = ARGV[arg]
    ARGV[arg++] = ""
    end = ARGV[arg]
    ARGV[arg++] = ""

    # An hour field of -24 makes mktime() normalise to the previous day.
    yesterday = strftime("%b %d", mktime(strftime("%Y %m %d -24 00 00")))
    today = strftime("%b %d")
    now = strftime("%H:%M")

    if (start > now || start > end || long)
        startdate = yesterday
    else
        startdate = today

    if (end > now && end > start && start > now && !long)
        enddate = yesterday
    else
        enddate = today

    startdate = startdate " " start
    enddate = enddate " " end
}

# Lines are in time order: stop at the first one past the end bound.
stamp() > enddate { exit }
stamp() >= startdate { print }

我认为AWK在搜索文件方面非常高效。 我认为在搜索未索引的文本文件时,不会有别的工具比它更快。

一个应用二进制search的C ++程序 – 它需要一些简单的修改(即调用strptime)来处理文本date。

http://gitorious.org/bs_grep/

我以前的版本支持文本date,但是对于我们的日志文件来说,它仍然太慢了。 分析表明,超过90%的时间花在了strptime中,所以我们只是修改了日志格式以包含数字unix时间戳。

尽管这个答案来得太晚,但对一些人来说可能还是有益的。

我已经将@Dennis Williamson的代码转换成了一个可以在其他Python代码中使用的Python类。

我还添加了对多种日期格式的支持。

 # LogFileTimeParser: class-based variant of timegrep.py (targets Python 3).
import os
import re
from datetime import date, datetime, timedelta
from stat import ST_SIZE


# @TODO Support for rotated log files.


class LogFileTimeParser(object):
    """
    Extracts parts of a log file based on a start and end date.

    Uses binary search logic to speed up searching, so the file must be
    sorted by timestamp (which should be true for log files).

    Common usage: validate log files during testing.
    """

    version = "0.01a"

    # Set some initial values
    BUF_SIZE = 4096  # handle long lines, but put a limit to them
    REWIND = 100     # arbitrary, the optimal value is highly dependent on the structure of the file
    LIMIT = 75       # arbitrary, allow for a VERY large file, but stop it if it runs away

    # Accepted timestamp layouts, tried in this order.
    DATE_FORMATS = (
        '%Y-%m-%d %H:%M:%S %f', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M',
        '%b %d %H:%M:%S %f', '%b %d %H:%M', '%b %d %H:%M:%S',
        '%b %d %Y %H:%M:%S %f', '%b %d %Y %H:%M', '%b %d %Y %H:%M:%S',
        '%b %d %Y %I:%M:%S%p', '%b %d %Y %I:%M%p', '%b %d %Y %I:%M:%S%p %f',
    )
    # Layouts that carry no year; the year has to be guessed for these.
    YEARLESS_FORMATS = ('%b %d %H:%M:%S %f', '%b %d %H:%M', '%b %d %H:%M:%S')

    line_date = ''
    line = None
    opened_file = None

    @staticmethod
    def parse_date(text, validate=True):
        """Parse *text* into a datetime using the first matching layout.

        Supports e.g. 'Aug 16 14:59:01', '2016-08-16 09:23:09' and
        'Jun 1 2005 1:33:06PM' (with or without seconds/microseconds).

        For layouts without a year the current year is assumed; if that
        puts the stamp more than one day in the future, the previous
        year is used instead, so 'Dec 31' read on Jan 1 lands in the past.

        Returns:
            The parsed datetime, or ``datetime.min`` when nothing matches
            and *validate* is false.

        Raises:
            ValueError: nothing matches and *validate* is true.
        """
        now = datetime.now()
        for fmt in LogFileTimeParser.DATE_FORMATS:
            try:
                if fmt not in LogFileTimeParser.YEARLESS_FORMATS:
                    return datetime.strptime(text, fmt)
                # Parse with the year included, so Feb 29 is checked against
                # the real year rather than strptime's default of 1900.
                stamp = datetime.strptime(
                    "{0} {1}".format(now.year, text), "%Y " + fmt)
            except ValueError:
                continue
            if stamp > now + timedelta(days=1):
                try:
                    stamp = stamp.replace(year=now.year - 1)
                except ValueError:
                    # Feb 29 has no counterpart last year; keep this year's.
                    pass
            return stamp

        if validate:
            raise ValueError("No valid date format found for '{0}'".format(text))
        # Cannot use NoneType to compare datetimes. Using minimum instead
        return datetime.min

    # Function to read lines from file and extract the date and time
    def read_lines(self):
        """Read one line from ``self.opened_file``.

        The file may be opened in binary or text mode; bytes are decoded
        as UTF-8 with undecodable bytes replaced.

        Returns:
            ``(line_date, line)``: the parsed timestamp (``datetime.min``
            if the line does not start with a recognised one) and the line
            without its trailing newline.  Both are also stored on the
            instance.

        Raises:
            IOError: no file is open or the read failed.
            EOFError: nothing is left to read.
            ValueError: the line has no trailing newline (too long for
                BUF_SIZE, or the file ends without one).
        """
        if self.opened_file is None:
            raise IOError("File I/O Error")
        try:
            raw = self.opened_file.readline(self.BUF_SIZE)
        except (OSError, ValueError) as err:
            raise IOError("File I/O Error") from err

        if not raw:
            raise EOFError("EOF reached")

        # Measure before decoding: the limit applies to what readline() returned.
        hit_limit = len(raw) >= self.BUF_SIZE
        if isinstance(raw, bytes):
            raw = raw.decode("utf-8", "replace")

        if not raw.endswith('\n'):
            if hit_limit:
                raise ValueError("Line length exceeds buffer size")
            raise ValueError("Missing newline")
        self.line = raw.rstrip('\n')

        # Split on any run of whitespace so syslog's space-padded day
        # ("Jan  1") yields 'Jan', '1', '01:01:01'.
        words = self.line.split(None, 3)
        stamp = datetime.min
        # Three words cover 'Jan 1 01:01:01' and '1970-01-01 01:01:01 000000';
        # two words cover a plain '1970-01-01 01:01:01' followed by other text.
        for width in (3, 2):
            if len(words) >= width:
                stamp = self.parse_date(" ".join(words[:width]), False)
                if stamp != datetime.min:
                    break
        self.line_date = stamp

        return self.line_date, self.line

    def get_lines_between_timestamps(self, start, end, path_to_file, debug=False):
        """Print the lines of *path_to_file* stamped within [start, end].

        *start* and *end* are either full timestamps in any layout known
        to parse_date(), or bare 'hh:mm[:ss]' times, in which case
        yesterday/today is chosen from the current time.  Lines without
        a recognisable timestamp are not printed.

        Raises:
            IOError: the file cannot be opened or read.
            ValueError: a bound cannot be parsed, or a line has no newline.
            IndexError: the binary search did not converge within LIMIT steps.
        """
        # Set some initial values
        count = 0
        size = os.stat(path_to_file)[ST_SIZE]
        begin_range = 0
        mid_range = size // 2  # integer division: seek() rejects floats
        old_mid_range = mid_range
        end_range = size
        pos1 = pos2 = 0
        # Start from a clean slate so state left by an earlier call cannot
        # satisfy the loop conditions below.
        self.line_date = datetime.min
        self.line = None

        # If only hours are supplied
        # test for times to be properly formatted, allow hh:mm or hh:mm:ss
        p = re.compile(r'^(?:2[0-3]|[01][0-9]):[0-5][0-9](?::[0-5][0-9])?$')

        if p.match(start) or p.match(end):
            # Determine Time Range from a single clock reading
            right_now = datetime.now()
            yesterday = date.fromordinal(right_now.toordinal() - 1).strftime("%Y-%m-%d")
            today = right_now.strftime("%Y-%m-%d")
            now = right_now.strftime("%H:%M")  # portable spelling of %R
            if start > now or start > end:
                search_start = yesterday
            else:
                search_start = today
            if end > start > now:
                search_end = yesterday
            else:
                search_end = today
            search_start = self.parse_date(search_start + " " + start)
            search_end = self.parse_date(search_end + " " + end)
        else:
            # Set dates
            search_start = self.parse_date(start)
            search_end = self.parse_date(end)

        try:
            # Binary mode: seek()/tell() then work in plain byte offsets.
            self.opened_file = open(path_to_file, 'rb')
        except OSError as err:
            raise IOError("File Open Error") from err

        if debug:
            print("File: '{0}' Size: {1} Start: '{2}' End: '{3}'"
                  .format(path_to_file, size, search_start, search_end))

        # Seek using binary search -- ONLY WORKS ON FILES WHO ARE SORTED BY DATES
        # (should be true for log files)
        try:
            while pos1 != end_range and old_mid_range != 0 and self.line_date != search_start:
                self.opened_file.seek(mid_range)
                # sync to line ending
                self.line_date, self.line = self.read_lines()
                pos1 = self.opened_file.tell()
                # if not beginning of file, discard first read
                if mid_range > 0:
                    if debug:
                        print("...partial: (len: {0}) '{1}'".format((len(self.line)), self.line))
                    self.line_date, self.line = self.read_lines()
                pos2 = self.opened_file.tell()
                count += 1
                if debug:
                    print("#{0} Beginning: {1} Mid: {2} End: {3} P1: {4} P2: {5} Timestamp: '{6}'".
                          format(count, begin_range, mid_range, end_range, pos1, pos2, self.line_date))
                if search_start > self.line_date:
                    begin_range = mid_range
                else:
                    end_range = mid_range
                old_mid_range = mid_range
                mid_range = (begin_range + end_range) // 2
                if count > self.LIMIT:
                    raise IndexError("ERROR: ITERATION LIMIT EXCEEDED")

            if debug:
                print("...stopping: '{0}'".format(self.line))

            # Rewind a bit to make sure we didn't miss any
            seek = old_mid_range
            while self.line_date >= search_start and seek > 0:
                if seek < self.REWIND:
                    seek = 0
                else:
                    seek -= self.REWIND
                if debug:
                    print("...rewinding")
                self.opened_file.seek(seek)
                if seek > 0:
                    # Mid-file we may have landed inside a line: sync to its
                    # end.  At offset 0 the first read is already a whole
                    # line, so it must not be thrown away.
                    self.line_date, self.line = self.read_lines()
                    if debug:
                        print("...junk: '{0}'".format(self.line))
                self.line_date, self.line = self.read_lines()
                if debug:
                    print("...comparing: '{0}'".format(self.line_date))

            # Scan forward
            while self.line_date < search_start:
                if debug:
                    print("...skipping: '{0}'".format(self.line_date))
                self.line_date, self.line = self.read_lines()

            if debug:
                print("...found: '{0}'".format(self.line))
                print("Beginning: {0} Mid: {1} End: {2} P1: {3} P2: {4} Timestamp: '{5}'".
                      format(begin_range, mid_range, end_range, pos1, pos2, self.line_date))

            # Now that the preliminaries are out of the way, we just loop,
            # reading lines and printing them until they are beyond the end
            # of the range we want
            while self.line_date <= search_end:
                # Exclude our 'Nonetype' values
                if self.line_date != datetime.min:
                    print(self.line)
                self.line_date, self.line = self.read_lines()

            if debug:
                print("Start: '{0}' End: '{1}'".format(search_start, search_end))
        except EOFError:
            # Do not display EOFErrors: running off the end of the file
            # simply ends the extraction.
            pass
        finally:
            # Close on every path, including EOF and errors.
            self.opened_file.close()