## Ticket #2687: parsemergedata.2.R
##
## File parsemergedata.2.R, 8.2 KB (added by tomb, 8 years ago)
## Line
## A happy script to parse mergedata from task 2672
##
## usage: R -f parsemergedata.R
library("methods")
library("stringr")
5
## Start R's sampling profiler for the rest of the script run.
## Samples are written to "Rprof.out"; interval is the sampling period
## in seconds (0.002 = one sample every 2 ms).
Rprof(filename = "Rprof.out", interval = 0.002)
7
## parsemergedataMain(ARGV)
##
## Reads the mergedata file "guard-50kb.mergedata" from the current working
## directory, parses every line of whitespace-separated KEY=VALUE tokens into
## a "mergedata" S4 object and writes one comma-separated row per input line
## to "testfile.csv".
##
## takes:   ARGV - currently unused; kept so existing callers keep working
## returns: NULL, invisibly
## side effects: defines the S4 classes "extradataFile" and "mergedata" and
##               an as.character() method for "mergedata"; prints progress
##               to stdout; (over)writes "testfile.csv"
parsemergedataMain <- function(ARGV) {
  ## structures ------------------------------

  ## Description of one input file plus the labels attached to it.
  setClass("extradataFile",
           representation(
             filename = "character",
             guardLabel = "character",
             filesizeLabel = "character",
             filesize = "numeric"
           )
  )

  ## One parsed input line.  Slot names are the lower-cased lvals.
  ## TODO: ensure all possible lvals are represented in this class
  ## NOTE: lvals not present in a row leave their slot empty, which shows
  ##       up as an empty CSV field
  setClass("mergedata",
           representation(
             buildtimes = "numeric",
             circ_id = "numeric",
             launch = "numeric",
             path = "character",
             quantile = "numeric",
             resolvefailed = "character",
             connectsec = "numeric",
             connectusec = "numeric",
             datacompletesec = "numeric",
             datacompleteusec = "numeric",
             datarequestsec = "numeric",
             datarequestusec = "numeric",
             negotiatesec = "numeric",
             negotiateusec = "numeric",
             readbytes = "numeric",
             fail_reasons = "character",
             timeout = "numeric",
             dataresponsesec = "numeric",
             dataresponseusec = "numeric",
             stream_fail_reasons = "character"
           )
  )

  ## methods ---------------------------------

  ## as.character(<mergedata>)
  ## returns: a character vector holding every slot's value(s), in slot
  ##          order, with a "," element after each field and a final "\n";
  ##          meant to be handed straight to cat()
  ## Each slot appears exactly once (buildtimes and circ_id used to be
  ## emitted twice, which shifted every following column by two).
  ## NOTE(review): values are not quoted, so a value that itself contains
  ##               a comma will spill into the neighbouring columns.
  setMethod("as.character",
            signature = "mergedata",
            definition = function(x, ...) {
              c(
                x@buildtimes, ",",
                x@circ_id, ",",
                x@launch, ",",
                x@path, ",",
                x@quantile, ",",
                x@resolvefailed, ",",
                x@connectsec, ",",
                x@connectusec, ",",
                x@datacompletesec, ",",
                x@datacompleteusec, ",",
                x@datarequestsec, ",",
                x@datarequestusec, ",",
                x@negotiatesec, ",",
                x@negotiateusec, ",",
                x@readbytes, ",",
                x@fail_reasons, ",",
                x@timeout, ",",
                x@dataresponsesec, ",",
                x@dataresponseusec, ",",
                x@stream_fail_reasons, "\n"
              )
            }
  )

  ## globals ---------------------------------
  ## TODO: tomb, change to int debuglev
  kDebug <- FALSE # set TRUE for debugging output
  kProgname <- "parsemergedata.R"
  kVersion <- 0.3

  ## helper functions ------------------------

  ## debug(str)
  ## takes: str - a character vector to print
  ## Prints str followed by a newline, but only when kDebug is TRUE.
  ## Pass the pieces with c(), not cat(): cat() would print by itself and
  ## hand NULL to this function.
  debug <- function(str) {
    if (kDebug) {
      cat(str, "\n")
    }
  }

  ## parseLine(row, lineNum)
  ## takes: my.row - a row of text made of KEY=VALUE tokens separated by
  ##                 spaces or tabs,
  ##        my.lineNum - the line number of that row (debug output only)
  ## returns: an object of type mergedata populated with the fields in row
  ## NOTE: unknown lvals and tokens without a key are ignored
  ## NOTE: a value that does not convert with as.numeric() becomes NA and
  ##       raises the usual coercion warning
  ## TODO: tomb, document all assumptions about file format
  parseLine <- function(my.row, my.lineNum) {
    my.lineData <- new("mergedata")

    debug(c("reading line ", my.lineNum, ": ", my.row))

    ## delimiter is one or more spaces or tabs
    cols <- strsplit(my.row, "[ \t]+")[[1]]

    debug("cols: ")
    for (col in cols) {
      ## split at the first "=" only, so the value may contain "="
      eq <- as.integer(regexpr("=", col, fixed = TRUE))
      if (eq < 2L) {
        next # no "=" at all, or nothing in front of it
      }
      lval <- substr(col, 1L, eq - 1L)
      rval <- substr(col, eq + 1L, nchar(col))

      ## Convert per lval so every slot keeps its declared type.  Empty
      ## switch arms fall through to the next arm that has a value.
      value <- switch(lval,
        ## the only lval holding a comma-delimited list
        BUILDTIMES = as.numeric(strsplit(rval, ",", fixed = TRUE)[[1]]),
        CIRC_ID = ,
        LAUNCH = ,
        QUANTILE = ,
        CONNECTSEC = ,
        CONNECTUSEC = ,
        DATACOMPLETESEC = ,
        DATACOMPLETEUSEC = ,
        DATAREQUESTSEC = ,
        DATAREQUESTUSEC = ,
        NEGOTIATESEC = ,
        NEGOTIATEUSEC = ,
        READBYTES = ,
        TIMEOUT = ,
        DATARESPONSESEC = ,
        DATARESPONSEUSEC = as.numeric(rval),
        PATH = ,
        RESOLVEFAILED = ,
        FAIL_REASONS = ,
        STREAM_FAIL_REASONS = as.character(rval),
        NULL
      )

      if (!is.null(value)) {
        ## every slot is named after its lval in lower case
        slot(my.lineData, tolower(lval)) <- value
      }
    } # end for cols
    my.lineData
  } ## end parseLine

  ## main program ----------------------------

  debug(c(kProgname, " version ", kVersion))

  ## stuff dealing with input files
  ## TODO: tomb, genericize arg handling with code from filter.R
  my.file <- new("extradataFile", filename = "guard-50kb.mergedata")
  my.file@guardLabel <- "fooGuard"
  my.file@filesizeLabel <- "fooSizeLabel"
  my.file@filesize <- 1

  debug(c("i will read file: ", my.file@filename, " ",
          my.file@guardLabel, " ",
          my.file@filesizeLabel, " ",
          my.file@filesize))

  rows <- readLines(my.file@filename)
  cat(length(rows), " rows\n")

  outputFile <- file("testfile.csv", "w")
  ## close (and thereby flush) the output even if a row makes us stop early
  on.exit(close(outputFile), add = TRUE)

  ## Parse each line; lineNum lets the user find bad lines in the file
  ## TODO: tomb, add error handling back in :-)
  rowsWritten <- 0
  for (lineNum in seq_along(rows)) {
    if (lineNum %% 1000 == 0) {
      cat("line ", lineNum, " ", date(), "\n")
    }

    my.mergedata <- parseLine(rows[[lineNum]], lineNum)
    ## parseLine never returns NULL today; the guard stays for the day
    ## per-line error handling is added back
    if (!is.null(my.mergedata)) {
      cat(as.character(my.mergedata), file = outputFile)
      rowsWritten <- rowsWritten + 1
    }
  }

  debug(c("data vector length: ", rowsWritten))
  invisible(NULL)
}
220
## Script entry point: run the parser with no arguments, then show the
## warnings collected during the run.
parsemergedataMain()
warnings()