Here are the topics we touched, with some weblinks to tutorials or discussions:
Possible considerations:
Python lists (and nested lists) are not usually suitable for mathematical work; they do not understand arithmetics natively, they are not efficient (because they require levels of indirection), and they require looping in Python, which is again not efficient.
By contrast, working with mathematical vectors and arrays requires fast operations on large memory blocks that contain homogeneous types (all ints, doubles, etc.). This is also the memory layout that's expected by C/Fortran programs, so interacting with external legacy or performance codes requires this same "buffer" paradigm.
This is what's provided by numpy, which is a building block and a basic infrastructure for most scientific applications in Python.
See Johansson's numpy tutorial; also on scipy, Python's "ecosystem" open-source scientific software, and on matplotlib.
import numpy as np
import matplotlib.pyplot as pp
%matplotlib inline
import scipy
We'll represent the grid with a numpy array of integers. Let's start by initializing it randomly. numpy.random.rand
yields reals in [0,1], so we round them to 0 (dead) or 1 (alive).
nsize = 32
init = np.floor(np.random.rand(nsize,nsize) + 0.5)
Or we could convert a boolean array (which will be True/False with 50% probability) to integers...
(np.random.rand(nsize,nsize) > 0.5) * 1
By the way, here's how you can count true elements in a boolean array:
np.sum(np.random.rand(nsize,nsize) > 0.5)
Now we'd like to plot the array. imshow
from matplotlib.pyplot
seems to work well.
def plotgrid(x):
# turn off interpolation to get hard cell edges; put (0,0) at bottom left;
# use a gray colormap that will yield black/white
pp.imshow(x,interpolation='none',origin='bottom',cmap=pp.cm.gray)
plotgrid(init)
Time to implement the evolution in the Game of Life; how to compute the number of neighbors? We could loop brute force, using modulo arithmetic to implement wrap-around boundary conditions...
def lifestep_brute(x):
newx = x.copy()
m, n = x.shape
for i in range(m):
for j in range(n):
neighbors = (x[(i+1) % m,(j-1) % n] + x[(i+1) % m,j] + x[(i+1) % m,(j+1) % n] +
x[i, (j-1) % n] + x[i, (j+1) % n] +
x[(i-1) % m,(j-1) % n] + x[(i-1) % m,j] + x[(i-1) % m,(j+1) % n])
if x[i,j] == 1 and (neighbors < 2 or neighbors > 3):
newx[i,j] = 0
elif x[i,j] == 0 and neighbors == 3:
newx[i,j] = 1
return newx
This works OK, although it's not especially fast...
# let's make a function to run some steps with a choice of "stepper"
def runiter(stepper,iters=100,nsize=64):
x = np.floor(np.random.rand(nsize,nsize) + 0.5)
for i in range(iters):
x = stepper(x)
%timeit runiter(lifestep_brute,100)
We can plot a few frames using matplotlib.pyplot.subplot
, which works just like matlab's.
pp.figure(figsize=(12,12))
x = np.floor(np.random.rand(64,64) + 0.5)
for i in range(16):
pp.subplot(4,4,i+1)
plotgrid(x)
x = lifestep(x)
In numpy
, it's always more efficient to work with full arrays; however in this case this means that we need to do something special on the boundaries to get the wraparound effect. That's a lot of special cases to write in our code (the four edges, plus the four corners, all require different indexing).
We could make an extended copy of the array by pasting the last column to the left of the first, the first column to the right of last, etc. Or, we look in the numpy
manual and find the function roll
that yields a rolled-with-wraparound copy of a matrix. (numpy.lookfor
, and in this case numpy.lookfor('shift')
is a good way to find these things!)
def lifestep_roll(x):
newx = x.copy()
neighbors = 0
for di in [(1,1),(1,0),(1,-1),(0,1),(0,-1),(-1,1),(-1,0),(-1,-1)]:
neighbors += np.roll(np.roll(x,di[0],axis=0),di[1],axis=1)
# notice the numpy *fancy indexing* to work on selected items of an array
# based on some conditions
newx[(x == 1) & (neighbors < 2)] = 0
newx[(x == 1) & (neighbors > 3)] = 0
newx[(x == 0) & (neighbors == 3)] = 1
return newx
It's 60 times faster!
%timeit runiter(lifestep_roll,100)
...and somebody who's mathematically inclined may realize that the neighbor counting operation is a 2D convolution, which must certainly be implemented in one of these libraries. We do need to make sure to pass the right options to get back a matrix of the right size, and to impose the periodic boundary conditions.
import scipy.signal
kernel = np.array([[1,1,1],[1,0,1],[1,1,1]])
def lifestep_conv(x):
newx = x.copy()
neighbors = scipy.signal.convolve2d(x,kernel,boundary='wrap',mode='same')
newx[(x == 1) & (neighbors < 2)] = 0
newx[(x == 1) & (neighbors > 3)] = 0
newx[(x == 0) & (neighbors == 3)] = 1
return newx
For this size of matrix, the timing is actually comparable to the rolling version.
%timeit runiter(lifestep_conv,100)
Last, somebody who's very mathematically inclined may realize that the convolution can be performed efficiently with FFTs. It takes some trial and error to figure out some necessary adjustments.
import numpy.fft as npf
def fft_convolve2d(x,y):
"""2D convolution using FFT, adapted from Tristan Hearn's example."""
fr = npf.fft2(x)
fr2 = npf.fft2(y,s=fr.shape)
convolve = np.real(npf.ifft2(fr * fr2))
# the result needs to be shifted around
m, n = y.shape
convolve = np.roll(convolve,-int((m-1)/2),axis=0)
convolve = np.roll(convolve,-int((n-1)/2),axis=1)
return convolve.round()
def lifestep_fftconv(x):
newx = x.copy()
neighbors = fft_convolve2d(x,kernel)
newx[(x == 1) & (neighbors < 2)] = 0
newx[(x == 1) & (neighbors > 3)] = 0
newx[(x == 0) & (neighbors == 3)] = 1
return newx
This is slower!
%timeit runiter(lifestep_fftconv,100)
We should really make an animation! matplotlib
offers the tools to do so, but the interface is not the best. It requires that we make a figure, then provide a function to modify it for each frame. See for instance this blog post.
frames = []
x = np.floor(np.random.rand(256,256) + 0.5)
# we collect the frames in a list
for i in range(250):
frames.append(x)
x = lifestep_conv(x)
This will take a few seconds...
import matplotlib.animation as ma
# default resolution is 72 dpi
figure = pp.figure(figsize=(5,5))
image = pp.imshow(frames[i],interpolation='none',cmap=pp.cm.gray)
# the animate function needs to change the elements of the figure,
# and return an iterable object containing the parts of the figure that have changed
def animate(i):
image.set_array(frames[i])
return [image]
# the API is not the best though...
anim = ma.FuncAnimation(figure,animate,frames=len(frames),
interval=20,blit=True)
anim.save('life.mp4',fps=30);
You'll have to open the animation and watch it outside the notebook. The current version of iPython notebook has trouble embedding movies. Can you see some gliders rolling by?
!open life.mp4
Modify our code to implement Stephan Rafler's SmoothLife floating-point generalization of the Game of Life. See his paper.
We went over this very quickly in class, so I will just paste in the code from last year. If you don't have h5py, try conda
or pip
.
It's HDF5, a "data model, library, and file format", "portable and extensible", that "supports an unlimited variety of datatypes, and is designed for flexible and efficient I/O and for high volume and complex data". h5py is the Python library to read/write HDF5; it can be used to get efficient numpy views on very large datafiles.
LIGO has recently released its flagship "S5" data: https://losc.ligo.org
Following the tracks of the webinar at https://dcc.ligo.org/public/0113/G1400509/004/losc_webinar.pdf. We'll look for "H1" data starting at time 843894784 and ending at 843898880.
!wget https://losc.ligo.org/archive/data/S5/843055104/H-H1_LOSC_4_V1-843894784-4096.hdf5
!pip install h5py
import h5py
data = h5py.File('H-H1_LOSC_4_V1-843894784-4096.hdf5','r')
data
HDF5 file objects have a tree-like structure, and in the Python API tree nodes at every level behave as Python dictionaries. Remember the Python dict
methods, which return iterators (technically, dictionary views) that you can loop over:
keys()
, yielding the keys indexing the dictionary,values()
, yielding the values in the same order,items()
, yielding (key,value) pairs.If you need a proper list of any of these, you can always feed the iterators to the list()
constructor.
list(data.items())
h5py implements the somewhat peculiar visit
method that walks down the HDF tree in full.
def printname(name):
print("%-35s %s" % (name,data[name]))
data.visit(printname)
There's useful information in "meta
":
data['meta/GPSstart'].value, data['meta/Duration'].value
"strain
" seems to be the main structure containing data...
strain = data['strain/Strain']
...which looks like a numpy array...
type(strain.value), strain.value
But it has also attributes:
list(strain.attrs.items())
We use these to make an array of timestamps.
tstart = strain.attrs['Xstart']
dt = strain.attrs['Xspacing']
tend = tstart + strain.attrs['Npoints'] * dt
# according to C convention, "tend" is not actually included in this array
ts = np.arange(tstart,tend,dt)
len(ts), len(strain.value)
Let's plot the very beginning of the data.
pp.plot(ts[:10000],
strain.value[:10000])
There's also "quality
", which (we learn from https://losc.ligo.org/archive/dataset/S5) describes the quality of the data with a 1-Hz bitmask array.
quality = data['quality/simple']
list(quality.items())
These are the meanings of the bits:
for bit,(name,desc) in enumerate(zip(quality['DQShortnames'].value,quality['DQDescriptions'].value)):
print('%2d %-16s %s' % (bit, name.decode(), desc.decode()))
A note on the decode
, which is needed in Python 3 to avoid displaying strings as b'...'
. The issue is that in Python 3 native strings are unicode, but the LOSC files contain simple ASCII strings (where each character is 8 bits), which in Python 3 correspond to bytes
object, not quite a string. The bytes
method decode()
, with the implied argument 'UTF-8'
, uses a unicode encoding to make a unicode string out of a sequence of bytes. Its inverse is encode()
.
Back to LIGO. How much data at "CBCHIGH" categories 1 through 4? CBCHIGH denotes the search for high-mass binary inspirals. As for the categories,
A LIGO search would use CAT2 data; an upper-limit analysis would use CAT3.
dqarray = quality['DQmask'].value
for cbchigh_cat in [1,2,3,4]:
bitmask = 1 << cbchigh_cat
print("%s seconds at CAT %d" % (sum(dqarray & bitmask > 0),cbchigh_cat))
We are going to work with CAT2 data. Let's plot which data is CAT2 in "raster" fashion, reshaping a vector to a 2D matrix. We'll use matplotlib.pyplot.matshow
the latter. We want a discrete rather than continuous colormap (i.e., 1 = red, 2 = yellow, 3 = green, 4 = blue). Some quick googling shows that this can be achieved with ListedColormap
and BoundaryNorm
.
dqplot = np.zeros(dqarray.shape)
for cbchigh_cat in [1,2,3,4]:
bitmask = 1 << cbchigh_cat
dqplot[dqarray & bitmask > 0] = cbchigh_cat
import matplotlib.colors as mpc
cmap = mpc.ListedColormap(['red','yellow','green','blue'])
norm = mpc.BoundaryNorm([0,1,2,3],cmap.N)
pp.matshow(dqplot.reshape((16,256)),cmap=cmap,norm=norm,aspect=1.5);
We use numpy masked arrays to obtain the segments of CAT2 data. * http://docs.scipy.org/doc/numpy/reference/maskedarray.generic.html
dq2 = (dqarray & (1 << 2)) > 0
masked = np.ma.masked_array(np.zeros(4096),~dq2)
segs = np.ma.flatnotmasked_contiguous(masked)
print(segs)
This will get us the strain data for segment n
.
def segdata(n):
start, end = segs[n].start * 4096, segs[n].stop * 4096
return ts[start:end], strain.value[start:end]
Let's look at the PSD for the first few segment, using psd
from matplotlib.mlab
.
import matplotlib.mlab as mp
def plotpsd(i):
segt, segd = segdata(i)
psd, freqs = mp.psd(segd,NFFT=2048,Fs=1.0/dt)
pp.loglog(freqs[1:],np.sqrt(psd[1:]))
pp.xlabel('$f/\mathrm{Hz}$'); pp.ylabel('$S(f)/\mathrm{Hz}^{1/2}$')
pp.axis([10,1000,1e-24,1e-14]);
pp.title('t = [%s,%s]' % (segt[0],segt[-1]))
print(len(segs))
pp.figure(figsize=(12,16))
for s in range(8):
pp.subplot(4,2,s+1)
plotpsd(s)
pp.tight_layout()
To see if something goes "ping", we obtain a spectrogram using matplotlib.mlab.specgram
. We examing segments of 4096 points = 1 sec.
We pick a prettier colormap than the default at http://matplotlib.org/examples/color/colormaps_reference.html
def plotspec(i):
segt, segd = segdata(i)
pxx, freqs, t = mp.specgram(segd,NFFT=4096,Fs=1.0/dt)
# normalize each frequency row by its median (effectively to whiten data)
pxx = np.array([row / np.median(row) for row in pxx])
pp.pcolormesh(t,freqs,np.log10(pxx),cmap=pp.get_cmap('Oranges'))
pp.xlabel('$t/\mathrm{s}$'); pp.ylabel('$S(f)/\mathrm{Hz}^{1/2}$')
pp.axis([0,t[-1],freqs[0],freqs[-1]])
pp.title('t = [%s,%s]' % (segt[0],segt[-1]))
pp.figure(figsize=(12,16))
for s in range(16,24):
pp.subplot(4,2,s-16+1)
plotspec(s)
pp.tight_layout()
Ta-dah! There's something going on in segment 18, between GPS times...
t0, t1 = segdata(18)[0][0], segdata(18)[0][-1]
print(t0,t1)
So we download a list of injections from https://losc.ligo.org/s/injections/burst/H1_cleanlog_burst.txt...
!wget https://losc.ligo.org/s/injections/burst/H1_cleanlog_burst.txt
!head -140 H1_cleanlog_burst.txt
Make a dictionary of injections indexed by their recorded time.
injections = {float(line.split()[0]): line for line in open('H1_cleanlog_burst.txt','r') if line[0] != '#'}
See if there are any in our window?
for t,line in injections.items():
if (t0 < t < t1):
print(line,end='')
Plot them on top of the spectrogram, with their nominal frequency (which must be the 3-4 digits in the string that begins wfsg
... sg
stands for sine-Gaussian, a simple burst/wavelet).
pp.figure(figsize=(12,8))
plotspec(18)
for t,line in injections.items():
if (t0 < t < t1):
pp.plot([t - t0,t - t0],[0,2048],'k',alpha=0.25)
freq = float(line.split()[2][4:-2])
pp.plot([t - t0 - 2,t - t0 + 2],[freq,freq],'k')
Oh yes!