Package fieldpy :: Package core :: Module raw_file_readers
[hide private]
[frames] | [no frames]

Source Code for Module fieldpy.core.raw_file_readers

  1  #!/usr/bin/env python2 
  2  """ 
  3  This file contains a collection of file reader functions which can be 
  4  called when reading in the raw data files.  The datafiles must be in a 
  5  array (1D or 2D) format. 
  6   
  7  The functions must have the following calling signature: 
  8  reader_fun(input_file, other_parameters). 
  9   
 10  The time (in matplotlib format) will has always to be in the first column. 
 11   
 12  And will return a tuple (data, raw_data, metadata).  The data will 
 13  have the format: time (in matplolib format), other (non-time related) 
 14  data columns. 
 15  """ 
 16   
 17  from __future__ import division 
 18   
 19  import numpy as np 
 20  import pylab as plt 
 21  import datetime 
 22  import copy 
 23  import matplotlib.mlab as mlab 
 24  from distutils.version import LooseVersion 
 25   
 26  import pdb 
 27   
 28  # my modules 
 29  from fieldpy.core.extra_classes import Metadata 
 30   
 31  ################################ 
 32  # first a few helper functions 
 33  ################################ 
 34   
def readfile_raw(input_file, separator=None, comment=None,
                 start=0, stop=-1, ignore_empty=False):
    """Reads a text file line by line and returns the raw data in a
    nested list.  Trailing empty lines are dropped.

    @type input_file: string
    @param input_file: The file to read.

    @type separator: string
    @param separator: Column separator in file; if equal to 'no_split'
                      columns will not be split.

    @type comment: string
    @param comment: a string which comments the rest of the line

    @type start: int
    @param start: which line to start on (default 0)

    @type stop: int
    @param stop: which line to stop on (default -1, i.e. to the end)

    @type ignore_empty: boolean
    @param ignore_empty: if C{True}, ignore empty lines

    @rtype: list
    @return: Returns a nested list containing the split raw file lines
             (as strings).

    >>> readfile_raw('test_files/maw_file_test.maw', separator=',', comment='#')
    [['2010-07-13 08:49:00', '0', '0.3030', '5', 'asdd asdlkj asl'], ['2010-07-13 08:56:00', '15', '0.2320', '8866', 'asdd asdlkj asl'], ['2010-07-13 08:58:00', '25', '0.2055', '5', '7'], ['2010-07-13 09:03:00', '50', '0.1620', '5', '']]
    """
    raw_data = []
    line_no = 0
    # mode 'rU' was used previously; the 'U' flag is a no-op on python 3
    # and was removed in 3.11 -- plain 'r' is equivalent here since every
    # kept line is strip()ed below anyway
    with open(input_file, 'r') as fil:
        # skip the first `start` lines
        for _ in range(start):
            fil.readline()
            line_no += 1
        for line in fil:
            if line_no == stop:
                break
            if comment is not None:
                # strip middle-of-line comments
                line = line.split(comment)[0].strip()
                if line == '':
                    continue
            if separator != 'no_split':
                tmp = line.strip()
                if ignore_empty and tmp == '':
                    continue
                tmp = [el.strip() for el in tmp.split(separator)]  # parse
            else:
                tmp = line.strip()
            raw_data.append(tmp)
            # note: skipped (continue'd) lines do not count towards `stop`
            line_no += 1
    # remove empty lines at the end of the file; the guard on raw_data
    # avoids the IndexError the old version raised when the file reduced
    # to nothing (e.g. an all-comment file)
    while raw_data and raw_data[-1] == '':
        raw_data.pop()
    return raw_data
97
def campbell2num_date(campbell_date):
    """Convert our standard Campbell date notation into pylab date numbers.

    A Campbell date row is [year, julian day, hhmm(, seconds)].

    @type campbell_date: list of lists or numpy array
    @param campbell_date: A 2-d array-like with Campbell dates

    @returns: numpy array of matplotlib date numbers

    >>> cd1 = [[2006, 139, 1245]]
    >>> cd2 = [[2006, 139, 1245, 34]]
    >>> campbell2num_date(cd1)
    array([ 732450.53125])
    >>> campbell2num_date(cd2)
    array([ 732450.53164352])
    """
    # isinstance instead of type() comparison: also accepts ndarray
    # subclasses
    if not isinstance(campbell_date, np.ndarray):
        campbell_date = np.array(campbell_date)

    # anything but a 2-d array is an error (the old check let 0-d arrays
    # through and failed later with an IndexError)
    if campbell_date.ndim != 2:
        raise TypeError('2-d numpy array or list of lists expected.')

    if campbell_date.shape[1] == 3:  # no seconds column: pad with zeros
        campbell_date = np.hstack(
            (campbell_date, np.zeros((campbell_date.shape[0], 1))))

    out = np.zeros(campbell_date.shape[0])
    # not vectorised because of datetime.date
    for n, (year, doy, hhmm, secs) in enumerate(campbell_date):
        # toordinal() equals matplotlib's classic date2num epoch (days
        # since 0001-01-01), which the rest of this module (see
        # iso_time_to_date) and the doctests above assume
        year_start = datetime.date(int(year), 1, 1).toordinal()
        hours = np.floor(hhmm / 100.0)
        minutes = hhmm - hours * 100.0
        out[n] = (year_start + doy - 1.0            # days
                  + hours / 24.0                    # hours
                  + minutes / (24.0 * 60.0)         # minutes
                  + secs / (24.0 * 3600.0))         # seconds
    return out
140
def num_date2campbell(num_date, secs=False):
    """Convert numerical (pylab) dates to our standard Campbell notation.

    @type num_date: numpy array or list
    @param num_date: vector of pylab dates

    @type secs: boolean
    @param secs: if C{True} a seconds column is appended

    @return: numpy int array with rows [year, julian day, hhmm(, seconds)]

    >>> nd = datetime.datetime(2006,6,6).toordinal() + (12*3600+37*60+25)/86400.
    >>> num_date2campbell([nd])
    array([[2006,  157, 1237]])
    >>> num_date2campbell([nd], secs=True)
    array([[2006,  157, 1237,   25]])
    """
    if not isinstance(num_date, np.ndarray):
        num_date = np.array(num_date)

    # check for non 1-D arrays (the old raise never formatted its
    # message; use proper %-formatting)
    if len(np.squeeze(num_date).shape) > 1:
        raise TypeError('1-d numpy array or a list expected, shape is %r'
                        % (num_date.shape,))

    out = []
    for nd in num_date:
        # 0.45 s offset guards against float rounding just below a
        # minute/second boundary
        nd = nd + 0.45 / 24. / 60. / 60.
        day = int(np.floor(nd))
        # fromordinal matches matplotlib's classic date2num epoch (days
        # since 0001-01-01), consistent with campbell2num_date
        dt = (datetime.datetime.fromordinal(day)
              + datetime.timedelta(days=nd - day))
        # tm_yday is the julian day, consistent with the year/day pair
        # even right at a year boundary
        row = [dt.year, dt.timetuple().tm_yday]
        if secs:
            row.append(dt.hour * 100 + dt.minute)
            row.append(dt.second)
        else:
            # round to the nearest minute, carrying into the hour: the
            # old code could emit invalid times like 1260 for 12:59:45
            # (midnight rounds to Campbell-style 2400)
            total_min = dt.hour * 60 + dt.minute + int(round(dt.second / 60.))
            row.append((total_min // 60) * 100 + total_min % 60)
        out.append(row)
    return np.array(out, dtype=int)
184
def iso_time_to_date(isostrings, method='hash'):
    """
    Converts ISO 8601 date & time strings (well, slightly perverted:
    "yyyy-mm-dd HH:MM:SS(.FF)", i.e. no 'T') into matplotlib date
    numbers.  Note that this implementation is not particularly fast as
    it uses several try/except blocks.  If efficiency is a concern,
    hard-code it.

    @type isostrings: list of strings
    @param isostrings: ISO 8601 date & time strings

    @type method: string
    @param method: Switch to use different algorithm.
                   In order of decreasing speed:
                    - magic 40x
                    - hash 3x
                    - fast 4x (needs numpy>1.5)
                    - '' 1x
                   Set to '' to get good error checking/reporting.

    @rtype: np.array of floats
    @return: matplotlib date numbers

    >>> iso_time_to_date(["2010-07-07 00:00:00"])
    array([ 733960.])
    >>> iso_time_to_date(["2010-07-07 00:00:00","2010-07-07 00:01:00","2010-09-07 03:01:00"])
    array([ 733960.        ,  733960.00069444,  734022.12569444])
    """
    if not np.iterable(isostrings):
        raise TypeError('isostrings is not a string nor iterable!')

    if method == 'fast':
        # parse all strings into an int array
        # [year, month, day, hour, min, sec, musec], convert in bulk below
        tmpar = np.zeros((len(isostrings), 7), dtype=int)
        for ii, isostr in enumerate(isostrings):
            li = isostr.split('.')
            if len(li) == 2:  # convert fractional seconds to microseconds
                tmpar[ii, -1] = int(float('0.' + li.pop()) * 1e6)
            li = li[0].split(':')
            tmpar[ii, -2] = int(li.pop())
            tmpar[ii, -3] = int(li.pop())
            li = li[0].split(' ')
            tmpar[ii, -4] = int(li.pop())
            li = li[0].split('-')
            tmpar[ii, -5] = int(li.pop())
            tmpar[ii, -6] = int(li.pop())
            tmpar[ii, -7] = int(li.pop())

        def convert_fast(arr):
            # converts the array above into a vector of datenums
            out = np.ndarray(arr.shape[0])
            # find all the unique year+months ...
            [yearmonths, ymind] = np.unique(arr[:, 0] + arr[:, 1]/100.,
                                            return_inverse=True)
            # ... and turn them into datenums
            for ii, ym in enumerate(yearmonths):
                year = int(np.floor(ym))
                month = int(round((ym - year)*100))
                yearmonths[ii] = datetime.date(year, month, 1).toordinal()
            # scatter it again
            out = yearmonths[ymind]
            # add days
            out += arr[:, 2] - 1
            # add times (copied from matplotlib's dates.py)
            HOURS_PER_DAY = 24.
            MINUTES_PER_DAY = 60.*HOURS_PER_DAY
            SECONDS_PER_DAY = 60.*MINUTES_PER_DAY
            MUSECONDS_PER_DAY = 1e6*SECONDS_PER_DAY
            out += (arr[:, 3]/HOURS_PER_DAY + arr[:, 4]/MINUTES_PER_DAY +
                    arr[:, 5]/SECONDS_PER_DAY + arr[:, 6]/MUSECONDS_PER_DAY)
            return out

        return convert_fast(tmpar)
    elif method == 'hash':
        # a dict caches the year-month -> ordinal conversions to avoid
        # calling datetime.date so often
        out = np.empty(len(isostrings))
        year_month_hash = {}
        for ii, isostr in enumerate(isostrings):
            # parse
            li = isostr.split(':')
            secs = float(li.pop())
            mins = int(li.pop())
            li = li[0].split(' ')
            hours = int(li.pop())
            li = li[0].split('-')
            days = int(li.pop())
            months = int(li.pop())
            years = int(li.pop())
            # year-month hash ('in' instead of the py3-removed has_key)
            yearmonth = years + months/100.
            if yearmonth not in year_month_hash:
                year_month_hash[yearmonth] = datetime.date(years, months, 1).toordinal()
            # year
            out[ii] = year_month_hash[yearmonth]
            # add days
            out[ii] += days - 1
            # add times (copied from matplotlib's dates.py)
            HOURS_PER_DAY = 24.
            MINUTES_PER_DAY = 60.*HOURS_PER_DAY
            SECONDS_PER_DAY = 60.*MINUTES_PER_DAY
            out[ii] += (hours/HOURS_PER_DAY + mins/MINUTES_PER_DAY +
                        secs/SECONDS_PER_DAY)
        return out
    elif method == 'magic':  # vectorised via numpy byte-twiddling
        str_len = 30
        # convert to an array of fixed-length strings
        if type(isostrings) == np.ndarray and np.issubdtype(isostrings.dtype, np.dtype('O')):
            isostrings = isostrings.astype('S' + str(str_len))
        elif type(isostrings) == list:
            isostrings = np.array(isostrings, 'S' + str(str_len))
        # string array viewed as bytes (characters)
        isobytes = isostrings.view(np.byte)
        isobytes = isobytes.reshape((isostrings.shape[0], str_len))
        # subtracting 48 from the ASCII digits gives their integer values
        isoints = isobytes - 48
        isoints[isoints == -48] = 0  # set padding bytes to zero
        years = np.sum(isoints[:, 0:4]*np.array([1000, 100, 10, 1]), 1)
        months = np.sum(isoints[:, 5:7]*np.array([10, 1]), 1)
        years_months = years + months/100.
        # make a hash for all possible year-months between years[0] and
        # years[-1] -> this should be efficient for time series which
        # have many datapoints per month
        year_month_hash = {}
        for year in range(years[0], years[-1]+1):
            for month in range(1, 13):
                year_month = year + month/100.
                year_month_hash[year_month] = datetime.date(year, month, 1).toordinal()
        # convert into days (in matplotlib date-format)
        days = np.empty(len(isostrings))
        for k in year_month_hash:
            days[years_months == k] = year_month_hash[k] - 1
        # and the rest is easy
        HOURS_PER_DAY = 24.
        MINUTES_PER_DAY = 60.*HOURS_PER_DAY
        SECONDS_PER_DAY = 60.*MINUTES_PER_DAY
        days += np.sum(isoints[:, 8:10]*np.array([10, 1]), 1)
        days += 1/HOURS_PER_DAY * np.sum(isoints[:, 11:13]*np.array([10, 1]), 1)
        days += 1/MINUTES_PER_DAY * np.sum(isoints[:, 14:16]*np.array([10, 1]), 1)
        days += 1/SECONDS_PER_DAY * np.sum(isoints[:, 17:19]*np.array([10, 1]), 1)
        if str_len > 19:  # have fractional seconds too
            days += 1/SECONDS_PER_DAY * np.sum(isoints[:, 20:]*np.logspace(-1, -(str_len-20), 10), 1)
        return days
    else:  # slow, but will catch errors better
        out = []
        for isostr in isostrings:
            try:
                out.append(plt.date2num(datetime.datetime.strptime(isostr, '%Y-%m-%d %H:%M:%S')))
                continue
            except ValueError:
                pass
            try:
                out.append(plt.date2num(datetime.datetime.strptime(isostr, '%Y-%m-%d %H:%M:%S.%f')))
                continue
            except ValueError:
                raise
        return np.array(out)

################################
## READER FUNCTIONS
################################
def read_campbell_cr10x(input_file, headers=None, secs=False, year=None):
    """
    Reads a file in standard Campbell CR10X dataformat:

        number, year, julian day, time, data, ...

    or, if C{year} is not None:

        number, julian day, time, data, ...

    @type input_file: string
    @param input_file: input file name

    @type headers: [string]
    @param headers: a list of headers to be given to the variable columns
                    (default is [var0, var1, ...]); must cover ALL columns,
                    including the time-related ones

    @type secs: boolean
    @param secs: If C{True} the fifth column is interpreted as seconds,
                 otherwise as data

    @type year: integer
    @param year: If not None, it is assumed that the datafile contains
                 no year column and the value of 'year' is used instead.

    @rtype: tuple
    @return: tuple (data, raw_data, metadata); data is a record array
             with fields metadata.headers, time as matplotlib datenums
             in the first field.
    """
    # initialise metadata
    metadata = Metadata()
    metadata.__dict__['input_file'] = input_file
    metadata.secs = secs
    metadata.year = year

    first_line = readfile_raw(input_file, stop=1, separator=',')[0]
    header_len = len(first_line)
    if headers is None:
        # build the default header names
        if secs:
            head = (['station number', 'year', 'julian day', 'time', 'secs']
                    if year is None else
                    ['station number', 'julian day', 'time', 'secs'])
        else:
            head = (['station number', 'year', 'julian day', 'time']
                    if year is None else
                    ['station number', 'julian day', 'time'])
        std_head_len = len(head)
        headers = head + ['var' + str(ii)
                          for ii in range(header_len - std_head_len)]
    else:  # otherwise calculate std_head_len
        if len(headers) != len(first_line):
            raise TypeError('Given header does not have the same length as the first row of the file.')
        if secs:
            std_head_len = 5 if year is None else 4
        else:
            std_head_len = 4 if year is None else 3
    metadata.raw_headers = headers
    metadata.headers = ['time'] + headers[std_head_len:]

    # read the file
    raw_data = np.genfromtxt(input_file, delimiter=',')
    tmp_dat = copy.deepcopy(raw_data)

    # insert a constant year column if the file has none
    if year is not None:
        # BUGFIX: all pieces have to stay 2-d -- the old code hstack-ed
        # a 1-d column with 2-d arrays, which raises a ValueError
        tmp_dat = np.hstack((tmp_dat[:, :1],
                             year * np.ones((tmp_dat.shape[0], 1)),
                             tmp_dat[:, 1:]))
    last_ind_time = 5 if secs else 4
    # convert [year, julian day, time(, secs)] into matplotlib datenums
    tmp_t = campbell2num_date(tmp_dat[:, 1:last_ind_time])
    tmp_dat = np.hstack((tmp_t[:, np.newaxis], tmp_dat[:, last_ind_time:]))

    # figure out the 'data' dtypes (a bit awkward due to numpy)
    dtype_data = np.dtype([(head, np.float64) for head in metadata.headers])

    # create data with a 'view': note that there are some issues with
    # strides, thus the ascontiguousarray
    data = np.ascontiguousarray(tmp_dat).view(dtype_data).squeeze()

    return data, raw_data, metadata
469
def read_campbell_TAO5(input_file, given_headers=None):
    """
    Reads a file in TOA5 Campbell dataformat as used by the CR1000.

    Resources:
    http://www.campbellsci.com/documents/manuals/loggernet_3-1.pdf
    Section B.1.4

    Header format::
      file format, station, logger type, serial number, OS version,
          logger-program file name, logger-program file checksum, table name
      "TIMESTAMP","RECORD",fieldname,fieldname,...
      "TS","RN", field-units, field-units,...
      "","",field recording method,field recording method,...

    If the fieldname is not specified then a header of format 'varN'
    will be given, except if specified in the list L{given_headers}.

    @type input_file: string
    @param input_file: input file name

    @type given_headers: list or None
    @param given_headers: list of header names to give in the record
                          array data; an entry of None keeps the
                          default.  Note that the field 'RECORD' is
                          ignored in the data and thus does not feature
                          in this list.

    @rtype: tuple
    @return: tuple (data, raw_data, metadata)

    @note: It is assumed that any string-like thing is a date+time string
    """
    if given_headers is None:  # avoid a mutable default argument
        given_headers = []
    metadata = Metadata()
    TOA5_info = {}
    TOA5_info_fields_line1 = ['file_format', 'station', 'logger_type',
                              'serial_number', 'OS_version',
                              'logger-program_file_name',
                              'logger-program_file_checksum', 'table_name']

    header_lines = readfile_raw(input_file, stop=5, separator=',')
    ## process header
    for ii, key in enumerate(TOA5_info_fields_line1):
        try:
            TOA5_info[key] = header_lines[0][ii].strip('"')
        except Exception:  # best effort: keep the raw value (was a bare except)
            TOA5_info[key] = header_lines[0][ii]
    TOA5_info['fields'] = [st.strip('"') for st in header_lines[1]]
    TOA5_info['units'] = [st.strip('"') for st in header_lines[2]]
    TOA5_info['recording_type'] = [st.strip('"') for st in header_lines[3]]
    metadata.__dict__['TOA5_info'] = TOA5_info
    metadata.__dict__['input_file'] = input_file
    metadata.raw_units = TOA5_info['units']
    # drop the RECORD units (column 1)
    metadata.units = metadata.raw_units[0:1] + metadata.raw_units[2:]
    # fill metadata headers
    metadata.headers = []
    metadata.raw_headers = []
    ind = -1
    # check consistency of given_headers if passed in
    if len(given_headers) > 0 and len(given_headers) != (len(TOA5_info['fields']) - 1):
        raise ValueError('Variable given_headers is not of right length. It is %i but should be %i'
                         % (len(given_headers), len(TOA5_info['fields']) - 1))
    for ii, hd in enumerate(TOA5_info['fields']):
        if hd == "RECORD":
            metadata.raw_headers.append(hd)
            continue  # skip that field for data
        ind += 1
        if len(given_headers) > 0 and given_headers[ind] is not None:
            metadata.raw_headers.append(given_headers[ind])
            metadata.headers.append(given_headers[ind])
            continue
        if hd != '':
            metadata.raw_headers.append(hd)
            metadata.headers.append(hd)
        else:
            head = 'var' + str(ind)
            metadata.raw_headers.append(head)
            metadata.headers.append(head)
    # figure out the datatypes of the data from the first data line
    first_line = header_lines[-1]
    raw_dtypes = []
    type_dict = {int: 'int',
                 str: 'str',
                 float: 'float'}
    # eval does the magic thanks to TOA5 quoting all strings --
    # NOTE: only use this reader on trusted files!
    for field in first_line:
        raw_dtypes.append(type_dict[type(eval(field))])
    metadata.raw_dtypes = raw_dtypes

    ## read the data
    # 1) we want variable length strings in the numpy array, so use the
    #    'object' datatype for strings
    dt = [rdt if rdt != 'str' else 'O' for rdt in metadata.raw_dtypes]
    # add the header to the dtype (list() needed on python 3, where zip
    # returns an iterator)
    dtypes_raw = np.dtype(list(zip(metadata.raw_headers, dt)))
    # 2) make a converter to remove double quotes
    remove_double_quotes = lambda str_: str_.replace('"', '')
    converters = {}
    for ii, _ in enumerate(metadata.raw_dtypes):
        converters[ii] = remove_double_quotes
    # 3) finally read the file
    # (there was an API change in genfromtxt, thus this if/else)
    if LooseVersion(np.__version__) < LooseVersion('1.5'):
        raw_data = mlab.csv2rec(input_file, names=metadata.raw_headers,
                                skiprows=4, delimiter=',').view(np.ndarray)

        ## figure out the 'data' dtypes (forget about the record number)
        dtype_data = [raw_data.dtype[ii]
                      for ii in [0] + list(range(2, len(raw_data.dtype)))]
        # we assume that all 'object's are datetimes and will be
        # converted, thus object->float (np.object/np.float are removed
        # aliases in new numpy, hence the explicit dtypes)
        dtype_data = [dtt if dtt != np.dtype('O') else np.dtype(np.float64)
                      for dtt in dtype_data]

        # initialise 'data'
        data = np.zeros(len(raw_data), dtype=list(zip(metadata.headers, dtype_data)))

        # convert time strings into matplotlib date-numbers
        for head in metadata.headers:
            if raw_data.dtype[head] == np.dtype('object'):
                # csv2rec already parsed the dates into datetime objects
                data[head] = plt.date2num(raw_data[head])
            else:
                data[head] = raw_data[head]
    else:
        raw_data = np.genfromtxt(input_file, delimiter=',', skip_header=4,
                                 dtype=dtypes_raw, converters=converters)

        ## figure out the 'data' dtypes (forget about the record number)
        dtype_data = [raw_data.dtype[ii]
                      for ii in [0] + list(range(2, len(raw_data.dtype)))]
        # we assume that all strings are datetimes and will be
        # converted, thus object->float
        dtype_data = [dtt if dtt != np.dtype('O') else np.dtype(np.float64)
                      for dtt in dtype_data]

        # initialise 'data'
        data = np.zeros(len(raw_data), dtype=list(zip(metadata.headers, dtype_data)))
        # convert time strings into matplotlib date-numbers
        for head in metadata.headers:
            if raw_data.dtype[head] == np.dtype('object'):
                data[head] = iso_time_to_date(raw_data[head])
            else:
                data[head] = raw_data[head]

    return data, raw_data, metadata
618
def read_maw_file(input_file):
    """
    Reads a standard MAW file (as only used by me, Mauro A Werder)
    with format::

      #maw name of dataset
      # comment line
      #metadata.eg = 'asdf'
      #  -> creates an attribute metadata.eg with value 'asdf'
      #
      # the last comment line gives headers, units and datatypes which
      # are put into metadata['headers'], metadata['units'] and used as
      # the datatype:
      # name0 (units) [dtype], name1 (units) [dtype], ...
      val0, val1, val2 ...

    dtype is one of the following: int, float, str, time_str

    Time is represented as an ISO 8601 string "yyyy-mm-dd HH:MM:SS(.FF)"
    excluding the 'T', without time zone information (which should be
    given in the units as e.g. (UTC-7)).

    The idea is to have an easy to parse text representation of (a
    subset of) what can be contained in a netcdf3 file.

    @type input_file: string
    @param input_file: input file name

    @rtype: tuple
    @return: tuple (data, raw_data, metadata)
    """
    # INITIALISE
    raw_data = readfile_raw(input_file, separator='no_split')
    data = []
    comment = []
    metadata = Metadata()

    # PRE-PARSE FILE: separate comment lines from data lines
    for line in raw_data:
        if line.startswith('#'):
            comment.append(line.strip())
        else:
            fields = [field.strip(' "') for field in line.split(',')]
            data.append(tuple(fields))

    # NOW MAKE THE METADATA STRUCTURE
    # (the `not comment` guard avoids an IndexError on comment-less files)
    if not comment or not comment[0].startswith('#maw'):
        raise TypeError("File does not start with '#maw'")
    # BUGFIX: str.strip('#maw ') strips any of the characters #,m,a,w
    # from both ends and so mangles titles starting/ending with them;
    # slice off the '#maw' marker instead
    metadata.title = comment[0][len('#maw'):].strip()
    for line in comment:
        if line.startswith('#metadata.'):
            # partition (not split) so values containing '=' survive
            key, _, value = line.split('#metadata.')[1].partition('=')
            key = key.strip()
            value = value.strip()
            # eval, so basically anything python handles can be written
            # after the '=' -- NOTE: only use this reader on trusted files!
            if value != 'nan':
                metadata.__dict__[key] = eval(value)
            else:
                metadata.__dict__[key] = float('nan')

    # PARSE LAST COMMENT LINE: "name (units) [dtype], ..."
    last_line = [st.strip() for st in comment[-1].strip('#').split(',')]
    headers = []
    units = []
    dtypes = []
    for head in last_line:
        name, _, rest = head.partition('(')
        headers.append(name.strip())
        unit, _, dtype_str = rest.partition(')')
        units.append(unit.strip())
        dtypes.append(dtype_str.strip(' []'))
        if dtypes[-1] == '':
            raise ValueError('No datatype given in file')
    metadata.__dict__['headers'] = headers
    metadata.__dict__['raw_headers'] = headers
    metadata.__dict__['units'] = units
    metadata.__dict__['dtypes'] = dtypes
    # time strings and plain strings are stored as dtype 'O' (object) to
    # allow variable length strings in raw_data
    dtypes_raw = [np.dtype('O') if dt in ('time_str', 'str') else np.dtype(dt)
                  for dt in dtypes]
    # but once converted into a matplotlib datenumber a time is a float;
    # normal strings stay objects
    dtypes_processed = [np.dtype(float) if dt == 'time_str'
                        else np.dtype('O') if dt == 'str'
                        else np.dtype(dt)
                        for dt in dtypes]

    # PARSE DATA
    # raw_data contains everything as it was in the file; this converts
    # the strings as specified in dtypes_raw (list() needed on python 3)
    raw_data = np.array(data, list(zip(headers, dtypes_raw)))
    # initialise
    data = np.zeros(len(raw_data), dtype=list(zip(headers, dtypes_processed)))
    # now make the data, i.e. convert time strings into matplotlib
    # date-numbers
    for ii, head in enumerate(headers):
        if dtypes[ii] == 'time_str':
            data[head] = iso_time_to_date(raw_data[head])
        else:
            data[head] = raw_data[head]

    return data, raw_data, metadata
748 749 750 ##################################### 751 ## add your own reader functions here 752 ##################################### 753 754 755 # if __name__=='__main__': 756 # import doctest 757 # doctest.testmod() 758