Wednesday, October 17, 2012

A Scala script for generating summary statistics and frequency distributions from a log file

import scala.collection.mutable
import _root_.java.text.SimpleDateFormat
import _root_.java.util.{TimeZone, Calendar, Date, Locale}
import _root_.java.io._
import java.util.regex._
import java.text.DecimalFormat
import java.text.NumberFormat
import java.lang.StringBuffer
import scala.collection.mutable._
import scala.io.Source
import org.apache.commons.lang.time.DurationFormatUtils
import org.apache.commons.math.stat.Frequency
import org.apache.commons.math.stat.descriptive.SynchronizedSummaryStatistics
/**
* A Scala script for generating summary statistics and frequency distributions from a log file.
*
* Usage:
*
* scala -cp .:commons-lang-2.5.jar:commons-math-2.0.jar StatisticsGenerator.scala <log file location>
*
* Note:
* The log file must have entries containing the format:
* <any content>context=<some context string> time=<#of milliseconds>ms
* Requires Scala 2.8.0.Beta and above
* Loads the entire file into memory so you will want to increase the heap for large files e.g.
* JAVA_OPTS="-Xmx512m" scala -cp .:commons-lang-2.5.jar:commons-math-2.0.jar StatisticsGenerator.scala ./webtest.log > ./webtest-timings.txt
*
* Neil Figg
*/
// Entry point of the script: exactly one argument (the log file location) starts the workflow,
// any other argument count prints the usage text instead.
if (args.length == 1) {
  println("processing log file " + args(0))
  main
} else {
  println("incorrect number of args were supplied.")
  printUsage
}
// The workflow: parse the log file named by the first argument, group the timings by context,
// build one PerformanceStat per context and print each aggregate report.
def main = {
  val reports = genertateStatistics(reduceTimings(mapTimings(args(0))))
  for (report <- reports) {
    println(report.getAggregateReport())
  }
}
/**
 * Reads the log file line by line and emits one Timing (a <context, time> pair) for every
 * line that contains "context=<context> time=<digits>ms". Lines without that pattern are skipped.
 *
 * The whole file is read into memory, and the underlying source is closed before returning,
 * including when reading fails.
 *
 * @param filename location of the log file to parse
 * @return the timings found, in the order their lines appear in the file
 */
def mapTimings(filename: String): List[Timing] = {
  val entryPattern = Pattern.compile("context=(.*) time=(\\d+)ms")
  val source = Source.fromFile(new java.io.File(filename))
  try {
    // toList materialises every line while the source is still open.
    source.getLines().toList.flatMap { line =>
      val matcher = entryPattern.matcher(line)
      // Timing.time is a Long, so parse as Long rather than Int to avoid failing on
      // values above Int.MaxValue.
      if (matcher.find) List(Timing(matcher.group(1), matcher.group(2).toLong)) else Nil
    }
  } finally {
    source.close()
  }
}
/**
 * Reduces the list by categorising on the Timing context.
 *
 * Each time is prepended to the list of its context, so the times of a context end up in
 * reverse order of appearance in the input.
 *
 * @param timings the individual <context, time> pairs
 * @return a map from each context to all the times recorded for it
 */
def reduceTimings(timings: List[Timing]): Map[String, List[Long]] = {
  // Map is scala.collection.mutable.Map here (file-level wildcard import of
  // scala.collection.mutable._), so one accumulator is updated in place instead of
  // copying the whole map for every timing.
  val grouped = Map[String, List[Long]]()
  for (timing <- timings) {
    grouped(timing.context) = timing.time :: grouped.getOrElse(timing.context, Nil)
  }
  grouped
}
/**
 * Builds one PerformanceStat per context from the grouped timings.
 *
 * @param timings map from context to the times recorded for that context
 * @return one PerformanceStat per map entry, in the map's iteration order
 */
def genertateStatistics(timings: Map[String, List[Long]]): List[PerformanceStat] = {
  for (entry <- timings.toList) yield new PerformanceStat(entry._1, entry._2)
}
// Prints how to invoke the script. The classpath lists the two libraries the script imports
// (commons-lang and commons-math), matching the usage shown in the header comment.
def printUsage = {
  println("Usage:")
  println("scala -cp .:commons-lang-2.5.jar:commons-math-2.0.jar StatisticsGenerator.scala <log file location>")
}
/**
 * A single key-value pair <context, timing> taken from one log entry.
 *
 * @param context the context string of the entry
 * @param time the time recorded for the entry (the log format states it in milliseconds)
 */
case class Timing(
  var context: String,
  var time: Long
)
/**
 * Holds the response times recorded for one context and generates Summary Statistics and
 * Frequency Distributions from them, both as commons-math objects and as text reports.
 *
 * @param name the context the response times belong to
 */
class PerformanceStat(var name: String) {
  // Response times in milliseconds; new values are prepended by addResponseTime.
  private var values = List[Long]()
  private val newLine: String = System.getProperty("line.separator")
  // Upper bounds, in milliseconds, of the cumulative rows of the frequency report.
  private val timePeriods = List(50, 100, 150, 200, 250, 300, 400, 500, 1000, 2000, 3000, 60000)

  /**
   * Creates a stat that already holds the given response times.
   *
   * @param name the context the response times belong to
   * @param times the response times recorded so far
   */
  def this(name: String, times: List[Long]) = {
    this(name)
    this.values = times
  }

  def getName = name

  def setName(value: String): Unit = name = value

  /** Records one more response time. */
  def addResponseTime(responseTime: Long): Unit = {
    values = responseTime :: values
  }

  /** Builds a new Frequency containing every recorded response time. */
  def generateFrequencyDistribution(): Frequency = {
    val frequency = new Frequency()
    values.foreach(frequency.addValue(_))
    frequency
  }

  /** Builds new summary statistics containing every recorded response time. */
  def generateSummaryStatistics(): SynchronizedSummaryStatistics = {
    val summaryStatistics = new SynchronizedSummaryStatistics()
    values.foreach(summaryStatistics.addValue(_))
    summaryStatistics
  }

  /**
   * Renders the full report for this context: name, generation date, number of calls,
   * the summary statistics report and the frequency distribution report.
   */
  def getAggregateReport(): String = {
    val generatedAt = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date())
    val report = new java.lang.StringBuilder()
    report.append(newLine).append(name).append(newLine)
    report.append("----------------------------------------------------").append(newLine).append(newLine)
    report.append("Date: ").append(generatedAt).append(newLine)
    // The number of calls is the number of recorded values; no statistics object is needed for it.
    report.append("# of calls: ").append(values.size).append(newLine).append(newLine)
    report.append(getSummaryStatisticsReport()).append(newLine)
    report.append(getFrequencyDistributionReport()).append(newLine).append(newLine)
    report.toString()
  }

  /**
   * Renders max, min, mean and standard deviation, one per line, each formatted with two
   * decimals and suffixed with "ms".
   */
  def getSummaryStatisticsReport(): String = {
    val summaryStatistics = generateSummaryStatistics()
    // "#,##0.00" always prints at least one integer digit, so 0 is rendered as "0.00"
    // rather than ".00".
    val decimalFormat = new DecimalFormat("#,##0.00")
    def millis(value: Double): String = decimalFormat.format(value) + "ms"
    val lines = List(
      "Summary Statistics",
      "Max: " + millis(summaryStatistics.getMax()),
      "Min: " + millis(summaryStatistics.getMin()),
      "Mean: " + millis(summaryStatistics.getMean()),
      "Standard Deviation: " + millis(summaryStatistics.getStandardDeviation())
    )
    // Every line, including the last one, is terminated by the line separator.
    lines.mkString("", newLine, newLine)
  }

  /**
   * Renders one line per time period with the cumulative count and percentage of response
   * times up to and including that period.
   */
  def getFrequencyDistributionReport(): String = {
    val frequencyDistribution = generateFrequencyDistribution()
    val recordCounter = values.size
    val percentFormatter = NumberFormat.getPercentInstance()
    val rows = timePeriods.map { timePeriod =>
      // getCumFreq/getCumPct include values equal to the bound, hence "<=" rather than "<".
      "<= " + timePeriod + "ms: " + frequencyDistribution.getCumFreq(timePeriod) + "/" + recordCounter +
        " (" + percentFormatter.format(frequencyDistribution.getCumPct(timePeriod)) + ")"
    }
    // Every line, including the last one, is terminated by the line separator.
    ("Frequency Distribution" :: rows).mkString("", newLine, newLine)
  }
}