#!/usr/local/bin/ruby -w

# Log file parsers
# Pairs an exception raised while parsing with the log line that triggered it.
class Error
	# The exception object that was rescued.
	attr_reader :exception
	# The line that was being processed when the exception occurred.
	attr_reader :line

	# e    - the rescued exception
	# line - the line being processed at the time
	def initialize(e, line)
		@exception, @line = e, line
	end
end
# Reads a log file line by line, turning each line into a parser object and
# feeding it to an accumulator that assembles complete queries.
#
# The parser and accumulator are named by class name (Strings) and resolved
# with Object.const_get when #parse is called.
class GenericLogReader
	DEBUG=false
	attr_accessor :includes_duration, :queries, :errors
	attr_reader :time_to_parse

	# filename         - path of the log file to read
	# line_parser_name - name of the class used to parse each line
	# accumulator_name - name of the class that assembles lines into queries
	def initialize(filename, line_parser_name, accumulator_name)
		@filename = filename
		@line_parser_name = line_parser_name
		@accumulator_name = accumulator_name
		@includes_duration = false
		@queries, @errors = [], []
	end

	# Parses the whole file. Afterwards #queries holds whatever the accumulator
	# collected, #errors holds one Error per line that raised a StandardError,
	# #includes_duration tells whether any duration line was seen and
	# #time_to_parse holds the elapsed wall-clock seconds.
	#
	# Raises NameError if either class name cannot be resolved.
	# Returns the collected queries.
	def parse
		start = Time.new
		# Resolve the parser class once instead of once per line.
		parser_class = Object.const_get(@line_parser_name)
		a = Object.const_get(@accumulator_name).new
		puts "Using #{@accumulator_name}" if DEBUG
		File.foreach(@filename) {|text|
			line = nil
			begin
				line = parser_class.new(text)
				next if !line.recognized
				if line.is_new_query
					puts "New query" if DEBUG
					a.new_query_start(line)
				elsif line.is_continuation
					puts "Continuation" if DEBUG
					a.query_continuation(line)
				elsif line.is_duration_line
					puts "Duration" if DEBUG
					@includes_duration = true
					a.set_duration(line)
				end
			rescue StandardError => e
				# Diagnostics go to stderr (and only in DEBUG mode) so that they never
				# end up in the report, which is written to stdout.
				$stderr.puts "Parse error: #{e.message}\n#{e.backtrace.join("\n")}" if DEBUG
				# If the parser constructor itself failed there is no line object yet;
				# keep the raw text so the error can still be attributed to a line.
				@errors << Error.new(e, line || text)
			end
		}
		@time_to_parse = Time.new - start
		@queries = a.queries
	end

	# Calls normalize on every collected query.
	def normalize
		@queries.each {|q| q.normalize }
	end

	# Returns the number of distinct query texts.
	def unique_queries
		@queries.map {|q| q.text }.uniq.size
	end
end

# Parses one line of a MySQL query log.
class MySQLLogLine
	# Lines matching this pattern are not part of any query. Written as a regexp
	# literal: inside a double-quoted string "\d" is just the letter "d", which
	# kept the "<id> Connect" alternative from ever matching.
	DISCARD = /(^Time )|(^Tcp)|( Quit )|( USE )|(\d{1,5} Connect)/
	# Marks the first line of a query: "<id> Query".
	START_QUERY = /\d{1,5} Query/
	attr_reader :text, :is_new_query, :recognized

	# text - one raw line from the log file.
	# Discarded lines get recognized == false and keep a nil text.
	def initialize(text)
		@recognized = true
		@is_new_query = false
		unless DISCARD.match(text).nil?
			@recognized = false
			return
		end
		@text = text
		@is_new_query = !START_QUERY.match(@text).nil?
	end

	# True when the line is recognized and starts with four spaces.
	def is_continuation
		@recognized && !/^    /.match(@text).nil?
	end

	# This parser never reports duration lines.
	def is_duration_line
		false
	end

	# Returns the SQL carried by this line: the text after "<id> Query" for the
	# first line of a query, otherwise the whole line. Surrounding whitespace is
	# removed in both cases.
	#
	# Raises StandardError if the line was flagged as a new query but
	# START_QUERY no longer matches it.
	def parse_query_segment
		if @is_new_query
			tmp = START_QUERY.match(@text.strip)
			raise StandardError.new("PQA identified a line as the start of a new query, but then was unable to match it with the START_QUERY Regex. #{BUG_URL_STRING}") if tmp.nil?
			return tmp.post_match.strip
		end
		@text.strip
	end

	def to_s
		@text
	end
end

# Assembles MySQL log lines into Query objects. A query is considered
# finished when the next one starts.
class MySQLAccumulator
	def initialize
		@current = nil
		@queries = []
	end

	# Returns every query seen so far. The query still being assembled is
	# included; it is otherwise only stored when a following query starts,
	# which would lose the last query of the log.
	def queries
		@current.nil? ? @queries : @queries + [@current]
	end

	# Stores the query in progress (if any) and starts a new one from line.
	def new_query_start(line)
		@queries << @current unless @current.nil?
		@current = Query.new(line.parse_query_segment)
	end

	# Appends line to the query in progress; ignored when there is none.
	def query_continuation(line)
		@current.append(line.parse_query_segment) unless @current.nil?
	end
end


# Parses one line of a PostgreSQL log that was written through syslog, e.g.
#   "... host postgres[123]: [5-1] LOG:  query: select ..."
#
# The patterns are regexp literals: inside a double-quoted string "\s" is a
# plain space, so the former "[\s]*" only matched spaces, not any whitespace.
class SyslogLine
	DURATION_IN_MILLISECONDS = /ms$/
	STARTS_WITH_BRACKET = /^\[/
	POSTGRES = / postgres\[/
	QUERY_STARTER = /(LOG|DEBUG):\s*(query|statement)/
	STATUS_LINE = /(LOG|DEBUG):\s*(connection|received)/
	DURATION_LINE = /:\s*duration:/
	attr_accessor :connection_id, :recognized

	# data - one raw line from the log file.
	#
	# The line is left unrecognized when it does not contain " postgres[",
	# mentions "begin" or "VACUUM" (case-insensitive), is a connection/received
	# status line, or does not carry a "[...]" prefix after the first colon.
	#
	# Raises ArgumentError when the "[" prefix is never closed.
	def initialize(data)
		@recognized = false
		postgres = POSTGRES.match(data)
		return if postgres.nil?
		return if data =~ /begin/i || data =~ /VACUUM/i
		return unless STATUS_LINE.match(data).nil?

		# Drop everything up to the first colon (the rest of "postgres[pid]:").
		segments = postgres.post_match.strip.split(":")
		@text = (segments[1..-1] || []).join(":").strip
		@recognized = !STARTS_WITH_BRACKET.match(@text).nil?
		return unless @recognized

		@right_bracket = @text.index("]")
		raise ArgumentError, "Syslog line has an opening '[' but no closing ']': #{@text}" if @right_bracket.nil?
		# The id is the bracket content up to the first dash: "[5-1]" => "5", "[5]" => "5".
		@connection_id = @text[1...@right_bracket][/\A[^-]*/]
	end

	def is_new_query
		!QUERY_STARTER.match(@text).nil?
	end

	def is_continuation
		@recognized && !is_new_query && !is_duration_line
	end

	def is_duration_line
		@recognized && !DURATION_LINE.match(@text).nil?
	end

	# Returns a MatchData (truthy) when the line ends in "ms", nil otherwise.
	def duration_in_ms
		DURATION_IN_MILLISECONDS.match(@text)
	end

	# Returns the duration as a Float in seconds; a value on a line ending in
	# "ms" is divided by 1000.
	def parse_duration_segment
		value = @text.split(DURATION_LINE)[1].strip.split(" ")[0]
		time = value.strip.to_f
		duration_in_ms ? (time / 1000.0) : time
	end

	# Returns the SQL carried by this line with "^I" markers removed, or ""
	# when there is none. The text is no longer cut off at a fixed length.
	def parse_query_segment
		segment = if is_new_query
			# Skip the ": " that follows "query"/"statement".
			QUERY_STARTER.match(@text).post_match[2..-1]
		else
			# Skip the "] " that closes the bracketed prefix.
			@text[(@right_bracket + 2)..-1]
		end
		return "" if segment.nil?
		segment.gsub(/\^I/, "").chomp
	end

	def to_s
		@text
	end
end

# Assembles syslog lines into Query objects, keeping one query in progress
# per connection id.
class SyslogAccumulator
	DEBUG=false
	def initialize
		@queries = []
		@working = {}
	end

	# Returns the finished queries followed by the ones still in progress.
	# A query is only finished by a duration line or by the next query on the
	# same connection, so without this the tail of every connection was lost.
	def queries
		@queries + @working.values
	end

	# Finishes any query in progress on the line's connection and starts a new one.
	def new_query_start(line)
		if @working.has_key?(line.connection_id)
			close_out(line.connection_id)
			puts "Closed out a query due to a new query starting, so far there are #{@queries.size}" if DEBUG
		end
		@working[line.connection_id] = Query.new(line.parse_query_segment)
		puts "Started a query: #{line.parse_query_segment}" if DEBUG
	end

	# Appends line to the query in progress on its connection, if any.
	def query_continuation(line)
		if @working.has_key?(line.connection_id)
			@working[line.connection_id].append(line.parse_query_segment)
		end
	end

	# Records the duration on the query in progress on the line's connection
	# and finishes that query; ignored when the connection has none.
	def set_duration(line)
		if @working.has_key?(line.connection_id)
			@working[line.connection_id].duration = line.parse_duration_segment
			close_out(line.connection_id)
			puts "Closed out a query due to a duration (#{line.parse_duration_segment}), so far there are #{@queries.size}" if DEBUG
		end
	end

	# Moves the query in progress for connection id to the finished list.
	def close_out(id)
		@queries << @working.delete(id)
	end
end

# Parses one line of a PostgreSQL log file (stderr/file logging).
#
# The patterns are regexp literals: inside a double-quoted string "\s" is a
# plain space, so the former "[\s]*" only matched spaces, not any whitespace.
class PostgresLogLine
	START_QUERY_LINE = /^LOG:\s*(query|statement):/
	DURATION_LINE = /^LOG:\s*duration:/
	STARTS_WITH_DATE = /^[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] /
	# "[pid]" prefix. Any number of digits is accepted because process ids are
	# not limited to five digits on every system.
	# NOTE(review): the pattern is not anchored, so a "[123]" inside the SQL
	# itself is also treated as a prefix -- behaviour kept from the original.
	STARTS_WITH_PID = /\[\d+\]/
	attr_reader :text, :is_new_query, :recognized

	# text - one raw line from the log file.
	# Lines mentioning "begin" or "VACUUM" (case-insensitive) stay unrecognized
	# with a nil text. For other lines a leading date and time and a "[pid]"
	# prefix are removed before the line is classified.
	def initialize(text)
		@is_new_query = false
		@text = nil
		@recognized = false
		return if text =~ /begin/i || text =~ /VACUUM/i

		@recognized = true
		# Drop the first two whitespace-separated fields (date and time).
		@text = STARTS_WITH_DATE.match(text) ? (text.split(" ")[2..-1] || []).join(" ").strip : text
		pid = STARTS_WITH_PID.match(@text)
		@text = pid.post_match.strip if pid
		@is_new_query = !START_QUERY_LINE.match(@text).nil? && @text.split(/LOG:\s*(query|statement): /).size > 1
	end

	# True when the line starts with a tab.
	def is_continuation
		!/^\t/.match(@text).nil?
	end

	def is_duration_line
		!DURATION_LINE.match(@text).nil?
	end

	# Returns the duration as a Float in seconds. A value followed by the unit
	# "ms" is converted from milliseconds, matching what SyslogLine does.
	def parse_duration_segment
		value, unit = @text.split(DURATION_LINE)[1].strip.split(" ")
		time = value.to_f
		unit == "ms" ? (time / 1000.0) : time
	end

	# Returns the SQL carried by this line: the text after "LOG: query:" /
	# "LOG: statement:" for the first line of a query, otherwise the line with
	# its tabs removed.
	def parse_query_segment
		if @is_new_query
			return START_QUERY_LINE.match(@text).post_match.strip
		end
		@text.gsub(/\t/, "").chomp
	end

	def to_s
		@text
	end
end

# this is really a State machine
# Assembles PostgreSQL log lines into Query objects. A query is finished by
# its duration line or by the start of the next query.
class PGLogAccumulator
	def initialize
		@current = nil
		@queries = []
	end

	# Returns every query seen so far, including the one still being
	# assembled. Without this a trailing query that has no duration line (for
	# example the last query of a log without durations) would be lost.
	def queries
		@current.nil? ? @queries : @queries + [@current]
	end

	# Stores the query in progress (if any) and starts a new one from line.
	def new_query_start(line)
		@queries << @current unless @current.nil?
		@current = Query.new(line.parse_query_segment)
	end

	# Appends line to the query in progress; ignored when there is none.
	def query_continuation(line)
		@current.append(line.parse_query_segment) unless @current.nil?
	end

	# Records the duration on the query in progress and finishes it; ignored
	# when there is none.
	def set_duration(line)
		return if @current.nil?
		@current.duration = line.parse_duration_segment
		@queries << @current
		@current = nil
	end
end

# One SQL statement taken from a log, with an optional duration.
class Query
	# A single-quoted string literal.
	REMOVE_TEXT = /'[^']*'/
	# A run of digits of any length (the former 10-digit limit turned longer
	# numbers into "00").
	REMOVE_NUMBERS = /[0-9]+/
	attr_reader :text
	attr_accessor :duration

	# text - the first part of the statement. A copy is stored, so #append and
	# #normalize never modify the caller's string and frozen strings are accepted.
	def initialize(text="")
		@text = text.to_s.dup
	end

	# Adds txt to the statement, separated by a single space.
	def append(txt)
		@text << " " << txt
	end

	# Replaces quoted literals with '' and numbers with 0, collapses repeated
	# spaces and strips surrounding whitespace, so that statements differing
	# only in their values compare equal. Returns the normalized text.
	def normalize
		@text = @text.gsub(REMOVE_TEXT, "''").gsub(REMOVE_NUMBERS, "0").squeeze(" ").strip
	end

	def to_s
		@text
	end

	def is_select
		check(/^SELECT/i)
	end

	def is_delete
		check(/^DELETE/i)
	end

	def is_insert
		check(/^INSERT/i)
	end

	def is_update
		check(/^UPDATE/i)
	end

	# True when regexp matches the text with surrounding whitespace removed.
	def check(regexp)
		!regexp.match(@text.strip).nil?
	end
end

# Reports 
# Joins the plain-text output of every applicable report into one String.
class TextReportAggregator
	# reports - objects responding to applicable and text.
	# Returns the texts of the applicable reports, concatenated in order.
	def create(reports)
		combined = ""
		reports.each do |report|
			combined << report.text if report.applicable
		end
		combined
	end
end

# Combines the HTML output of every applicable report into one HTML page with
# a linked table of contents.
class HTMLReportAggregator
	# reports - objects responding to applicable, title and html.
	# Returns the complete page as a String.
	def create(reports)
		rpt = "<html><head>"
		# Append the style sheet; assigning here used to throw away the
		# "<html><head>" opening above.
		rpt << <<EOS
<style type="text/css">
body { background-color:white; }
h2 { text-align:center; }
h3 { color:blue }
p, td, th { font-family:Courier, Arial, Helvetica, sans-serif; font-size:14px; }
th { color:white; background-color:#7B8CBE; }
span.keyword { color:blue; }
</style>
EOS
		# Disabled style rule: tr { background-color:#E1E8FD; }
		generated = Time.now
		rpt << "<title>SQL Query Analysis (generated #{generated})</title></head><body>\n"
		rpt << "<h2>SQL Query Analysis (generated #{generated})</h2><br>\n"
		rpt << "<hr><center>"
		rpt << "<table><th>Reports</th>"
		# Table of contents; anchors use the report's position in the full list.
		reports.each_with_index {|report, x|
			next if !report.applicable
			link = "<a href=\"#report#{x}\">#{report.title}</a>"
			rpt << "<tr><td>#{link}</td></tr>"
		}
		rpt << "</table>"
		rpt << "<hr></center>"
		reports.each_with_index {|report, x|
			next if !report.applicable
			rpt << "<a name=\"report#{x}\"> </a>"
			rpt << report.html
		}
		rpt << "</body></html>\n"
	end
end

# Base class of all reports: holds the parsed log and a few formatting helpers.
class GenericReport
	# SQL keywords highlighted by #colorize. Matching is case-sensitive: the
	# first list is recognized in upper case, the second in lower case.
	UPPERCASE_KEYWORDS = ["SELECT","UPDATE","INSERT INTO","WHERE","VALUES","FROM","AND","ORDER BY","GROUP BY","LIMIT", "OFFSET", "DESC","ASC","AS","EXPLAIN","DROP"]
	LOWERCASE_KEYWORDS = ["select","update","from","where","explain","drop"]
	# One alternation over all keywords, longest first, on word boundaries.
	KEYWORD_PATTERN = Regexp.new("\\b(?:" + (UPPERCASE_KEYWORDS + LOWERCASE_KEYWORDS).sort_by {|w| -w.size }.map {|w| Regexp.escape(w) }.join("|") + ")\\b")

	# log - the object the report reads its data from.
	def initialize(log)
		@log = log
	end

	# Returns txt as HTML with SQL keywords wrapped in <span class='keyword'>.
	# The text is HTML-escaped first, because it comes straight from a log file.
	# All keywords are replaced in a single pass and only as whole words, so
	# "ASC" is not wrapped a second time for the "AS" it contains and
	# identifiers that merely contain a keyword are left alone.
	def colorize(txt)
		escape_html(txt).gsub(KEYWORD_PATTERN) {|w| "<span class='keyword'>#{w}</span>" }
	end

	def title
		"Unnamed report"
	end

	# Returns a/b rounded to two decimal places, or 0 when a is not positive
	# or b is zero.
	def pctg_of(a,b)
		return 0 if a <= 0 || b.to_f.zero?
		(((a.to_f/b.to_f)*100.0).round)/100.0
	end

	# Returns x rounded to the given number of decimal places.
	def round(x, places)
		factor = 10.0 ** places
		(x * factor).round / factor
	end

	# Whether the report has anything to show for this log.
	def applicable
		true
	end

	private

	# Escapes the characters that are significant in HTML text.
	def escape_html(txt)
		txt.to_s.gsub("&", "&amp;").gsub("<", "&lt;").gsub(">", "&gt;")
	end
end

# Summary figures for the whole log: query counts, total duration, longest
# and shortest query, and parse time.
class OverallStatsReport < GenericReport
	def html
		rpt = "<h3>#{title}</h3>\n"
		rpt << "#{@log.queries.size} queries\n"
		rpt << "<br>#{@log.unique_queries} unique queries\n"
		if @log.includes_duration
			rpt << "<br>Total query duration was #{round(total_duration, 2)} seconds\n"
			longest = find_longest
			rpt << "<br>Longest query (#{colorize(longest.text)}) ran in #{"%2.3f" % longest.duration} seconds\n"
			shortest = find_shortest
			rpt << "<br>Shortest query (#{colorize(shortest.text)}) ran in #{"%2.3f" % shortest.duration} seconds\n"
		end
		rpt << "<br>Log file parsed in #{"%2.1f" % @log.time_to_parse} seconds\n"
	end

	def title
		"Overall statistics"
	end

	def text
		rpt = "######## #{title}\n"
		rpt << "#{@log.queries.size} queries (#{@log.unique_queries} unique)"
		# The parenthesis was already closed after "unique"; no second ")" here.
		rpt << ", longest ran in #{find_longest.duration} seconds," if @log.includes_duration
		rpt << " parsed in #{@log.time_to_parse} seconds\n"
	end

	# Sum of all known durations; queries without a duration count as 0.
	def total_duration
		@log.queries.inject(0.0) {|sum, q| sum + (q.duration || 0) }
	end

	# Returns the query with the smallest known duration, or a placeholder
	# query when no query has a duration.
	def find_shortest
		timed_queries.min_by {|q| q.duration } || placeholder_query
	end

	# Returns the query with the largest known duration, or a placeholder
	# query when no query has a duration.
	def find_longest
		timed_queries.max_by {|q| q.duration } || placeholder_query
	end

	private

	# Queries for which a duration was recorded.
	def timed_queries
		@log.queries.reject {|q| q.duration.nil? }
	end

	# Stand-in used when there is no timed query to report.
	def placeholder_query
		q = Query.new("No queries found")
		q.duration = 0.0
		q
	end
end

# Lists the query texts that occur most often in the log.
class MostFrequentQueriesReport < GenericReport
	# log - the object the report reads its queries from
	# top - maximum number of entries to show
	def initialize(log, top=DEFAULT_TOP)
		super(log)
		@top = top
	end

	def title
		"Most frequent queries"
	end

	# Renders the ranking as an HTML table.
	def html
		ranked = create_report
		out = "<h3>#{title}</h3>\n"
		out << "<table><tr><th>Rank</th><th>Times executed</th><th>Query text</th>\n"
		shown = [ranked.size, @top].min
		0.upto(shown - 1) do |i|
			query_text, count = ranked[i]
			out << "<tr><td>#{i+1}</td><td>#{count}</td><td>#{colorize(query_text)}</td></tr>\n"
		end
		out << "</table>\n"
	end

	# Renders the ranking as plain text, one line per query.
	def text
		ranked = create_report
		out = "######## #{title}\n"
		shown = [ranked.size, @top].min
		0.upto(shown - 1) do |i|
			query_text, count = ranked[i]
			out << "#{count} times: #{query_text}\n"
		end
		out
	end

	# Returns [query text, occurrence count] pairs, most frequent first.
	def create_report
		counts = Hash.new(0)
		@log.queries.each {|query| counts[query.text] += 1 }
		counts.sort {|x, y| y[1] <=> x[1] }
	end
end

# Running total of duration and execution count for one query text.
class LittleWrapper
	attr_accessor :total_duration, :count, :q

	# q - the query this wrapper stands for; totals start at zero.
	def initialize(q)
		@q, @total_duration, @count = q, 0.0, 0
	end

	# Adds q's duration to the total and counts the execution.
	# Queries without a duration are ignored.
	def add(q)
		unless q.duration.nil?
			@total_duration += q.duration
			@count += 1
		end
	end
end

# Ranks query texts by the total time spent executing them.
class QueriesThatTookUpTheMostTimeReport < GenericReport
	# log - the object the report reads its queries from
	# top - maximum number of entries to show
	def initialize(log, top=DEFAULT_TOP)
		super(log)
		@top = top
	end

	def title
		"Queries that took up the most time"
	end

	# Only meaningful when the log contained durations.
	def applicable
		@log.includes_duration
	end

	# Renders the ranking as an HTML table.
	def html
		ranked = create_report
		out = "<h3>#{title}</h3>\n"
		out << "<table><tr><th>Rank</th><th>Total time (seconds)</th><th>Times executed</th><th>Query text</th>\n"
		shown = [ranked.size, @top].min
		0.upto(shown - 1) do |i|
			query_text, totals = ranked[i]
			out << "<tr><td>#{i+1}</td><td>#{"%2.3f" % totals.total_duration}</td><td align=right>#{totals.count}</td><td>#{colorize(query_text)}</td></tr>\n"
		end
		out << "</table>\n"
	end

	# Renders the ranking as plain text, one line per query.
	def text
		ranked = create_report
		out = "######## #{title}\n"
		shown = [ranked.size, @top].min
		0.upto(shown - 1) do |i|
			query_text, totals = ranked[i]
			out << "#{"%2.3f" % totals.total_duration} seconds: #{query_text}\n"
		end
		out
	end

	# Returns [query text, LittleWrapper] pairs, largest total duration first.
	def create_report
		totals_by_text = {}
		@log.queries.each do |query|
			totals_by_text[query.text] ||= LittleWrapper.new(query)
			totals_by_text[query.text].add(query)
		end
		totals_by_text.sort {|x, y| y[1].total_duration <=> x[1].total_duration }
	end
end

# Lists the individual queries with the longest duration.
class SlowestQueriesReport < GenericReport
	# log - the object the report reads its queries from
	# top - maximum number of entries to show
	def initialize(log, top=DEFAULT_TOP)
		super(log)
		@top = top
	end

	# Only meaningful when the log contained durations.
	def applicable
		@log.includes_duration
	end

	def title
		"Slowest queries"
	end

	# Renders the list as plain text, one line per query.
	def text
		rpt = "######## #{title}\n"
		create_report.each {|q|
			rpt << "#{"%2.3f" % q.duration} seconds: #{q.text}\n"
		}
		rpt
	end

	# Renders the list as an HTML table.
	def html
		rpt = "<h3>#{title}</h3>\n"
		rpt << "<table><tr><th>Rank</th><th>Time</th><th>Query text</th>\n"
		create_report.each_with_index {|q, x|
			rpt << "<tr><td>#{x+1}</td><td>#{"%2.3f" % q.duration}</td><td>#{colorize(q.text)}</td></tr>\n"
		}
		rpt << "</table>\n"
	end

	# Returns at most @top queries, slowest first. Queries without a recorded
	# duration are left out: they cannot be ranked, and formatting a nil
	# duration with "%2.3f" raises a TypeError.
	def create_report
		timed = @log.queries.reject {|q| q.duration.nil? }
		timed.sort {|a,b| b.duration.to_f <=> a.duration.to_f }.slice(0, [@top, 0].max)
	end
end

# Lists the errors that were recorded while parsing the log.
class ErrorReport < GenericReport
	def title
		"Parse Errors"
	end

	# Only shown when at least one error was recorded.
	def applicable
		!@log.errors.empty?
	end

	# Renders one "<exception> : <line>" entry per error. Every entry ends in
	# exactly one newline, whether or not the recorded line carried its own.
	def text
		rpt = "######## #{title}\n"
		@log.errors.each {|x|
			rpt << "#{x.exception} : #{x.line.to_s.chomp}\n"
		}
		rpt
	end

	# Renders the errors as an HTML table. Messages and lines are escaped
	# because they come straight from the log file.
	def html
		rpt = "<h3>#{title}</h3>\n"
		rpt << "<table><tr><th>Explanation</th><th>Offending line</th>\n"
		@log.errors.each {|x|
			rpt << "<tr><td>#{escape_markup(x.exception.message)}</td><td>#{escape_markup(x.line)}</td></tr>\n"
		}
		rpt << "</table>\n"
	end

	private

	# Escapes the characters that are significant in HTML text.
	def escape_markup(value)
		value.to_s.gsub("&", "&amp;").gsub("<", "&lt;").gsub(">", "&gt;")
	end
end

# Breaks the queries down into SELECT, INSERT, UPDATE and DELETE statements.
class QueriesByTypeReport < GenericReport
	LABELS = ["SELECT", "INSERT", "UPDATE", "DELETE"]

	def title
		"Queries by type"
	end

	# Renders the breakdown as an HTML table; types with no queries are omitted.
	def html
		rpt = "<h3>#{title}</h3>\n"
		rpt << "<table><tr><th>Type</th><th>Count</th><th>Percentage</th>\n"
		LABELS.zip(create_report).each {|label, count|
			next unless count > 0
			rpt << "<tr><td>#{label}</td><td>#{count}</td><td align=center>#{percent(count)}</td></tr>\n"
		}
		rpt << "</table>\n"
	end

	# Renders the breakdown as plain text; types with no queries are omitted.
	def text
		counts = create_report
		# One common column width so that the percentages line up.
		width = counts.map {|c| c.to_s.size }.max + 1
		rpt = "######## #{title}\n"
		LABELS.zip(counts).each {|label, count|
			next unless count > 0
			rpt << "#{label}s: #{count.to_s.ljust(width)} (#{percent(count)}%)\n"
		}
		rpt
	end

	# Returns the counts as [selects, inserts, updates, deletes].
	def create_report
		sel=ins=del=upd=0
		@log.queries.each {|q|
			if q.is_select
				sel += 1
			elsif q.is_insert
				ins += 1
			elsif q.is_update
				upd += 1
			elsif q.is_delete
				del += 1
			end
		}
		[sel, ins, upd, del]
	end

	private

	# Share of count among all queries as a whole percentage. Rounded rather
	# than truncated: 0.29 * 100 is 28.999... in floating point.
	def percent(count)
		(pctg_of(count, @log.queries.size) * 100).round
	end
end

# Number of entries shown by the "top n" reports when -top is not given.
DEFAULT_TOP = 10
# HTML snippet appended to internal-error messages; links to the bug tracker.
BUG_URL_STRING = 'This is a <a href="http://pgfoundry.org/tracker/?atid=130&group_id=1000008&func=browse">bug</a>.'

# Command-line entry point: parse the log named by -file and print the
# reports to stdout.
if __FILE__ == $0
	usage = "Usage: " + $0 + " [-logtype syslog|pglog|mysql] [-top n] [-normalize] [-format text|html] -file some_log_file_name"
	# Returns the argument that follows flag, or default when flag is absent.
	option = lambda {|flag, default|
		position = ARGV.index(flag)
		position.nil? ? default : ARGV[position + 1]
	}
	filename = option.call("-file", nil)
	# Also covers "-file" given as the last argument, i.e. without a file name.
	raise usage if filename.nil?
	# Anything other than syslog or mysql is read as a plain PostgreSQL log.
	parser_name, accumulator_name = case option.call("-logtype", nil)
		when "syslog" then ["SyslogLine", "SyslogAccumulator"]
		when "mysql" then ["MySQLLogLine", "MySQLAccumulator"]
		else ["PostgresLogLine", "PGLogAccumulator"]
	end
	log = GenericLogReader.new(filename, parser_name, accumulator_name)
	log.parse
	log.normalize if ARGV.include?("-normalize")
	top = option.call("-top", DEFAULT_TOP).to_i
	format = option.call("-format", "text")
	rpts = [OverallStatsReport.new(log), QueriesByTypeReport.new(log), QueriesThatTookUpTheMostTimeReport.new(log, top), SlowestQueriesReport.new(log, top), MostFrequentQueriesReport.new(log, top), ErrorReport.new(log)]
	# Any format other than "text" produces HTML.
	report_aggregator = (format == "text") ? TextReportAggregator.new : HTMLReportAggregator.new
	puts report_aggregator.create(rpts)
end
