#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys, getopt, os
from numpy import *
from numpy.matlib import repmat
from matplotlib.pyplot import *
from sklearn import svm, grid_search, metrics
from sklearn.cross_validation import StratifiedKFold
from copy import deepcopy
from scipy.stats.mstats import mquantiles
import re
from data_object import Data
set_printoptions(precision=4,linewidth=500)
# possible agruments
ARGS = array([
#'Tb',
'pressure',
'Tcd',
'Ub',
'Ucd'])
"""
PREDICTION OF BREAKDOWN FOR TOKAMAK GOLEM
Example of use
./predict_breakdown.py --train --plot
./predict_breakdown.py --test --Ub=400 --Ucd=500 --pressure=27.84 --Tcd=0.01
./predict_breakdown.py --outliers
"""
def use():
print 'availible parameters:'
print ARGS
print "./predict_breakdown.py --train --plot"
print "./predict_breakdown.py --test --Ub=400 --Ucd=500 --pressure=27.84 --Tcd=0.01"
print "./predict_breakdown.py --outliers"
#--gas_filling=1 --Ubd=100 --Ust=0 --Tbd=0.01 --Tst=0 --PreIonization=1
def main():
""" load user setting from command line """
params = dict()
for i in ARGS:
params[i] = ""
vals=[(i+'=') for i in ARGS]
for i in ['train', 'test', 'outliers', 'plot', 'help']:
vals.append(i)
try:
opts, args = getopt.getopt(sys.argv[1:], "h", vals )
except Exception, err:
# print help information and exit:
print str(err)
use()
exit()
for o, a in opts:
for i in range(len(ARGS)):
if o in ('--'+ARGS[i]):
params[ARGS[i]] = a
for o, a in opts:
if o in ("--help"):
use()
return
if o in ("--train"):
data_train = load_data('data_breakdown.npz', balance_classed = True)
data_test = load_data('data_breakdown.npz', balance_classed = False)
train(data_train,data_test )
elif o in ("--test"):
machine = load('model.npy').item()
data = load_data('data_breakdown.npz')
if 'gas_filling' in params and int(params['gas_filling']) == 0:
print "probability ",0
return 0
X = array([ params[name] for name in data.names ]) # ignore the first value (gas_filling)
if X.dtype != float:
X[X == ""] = data.get_norm()[0,X == ""] # replace missing values by median in the dimension
X =reshape(double(X), (1,-1))
print "testing"
###############################################
proba = test(X, data, machine)[0]
###############################################
elif o in ("--outliers"):
machine = load('model.npy').item()
data = load_data('data_breakdown.npz')
outliers(data, machine)
elif o in ("--plot"):
machine = load('model.npy').item()
data = load_data('data_breakdown.npz')
plot_data(data, machine)
if len(opts) == 0:
use()
exit()
def load_data(path, balance_classed = True):
""" load the data from disk and remove problematic shots """
d = load(path)
shots = d['shots']
data_dict = d['data'].item()
Y = data_dict['plasma']
Ucd = data_dict['Ucd']
# use only keys allowed in ARGS
remove_keys = array(data_dict.keys())[~in1d(data_dict.keys(), ARGS)]
keys = [ i for i in data_dict.keys() if i not in remove_keys ]
X = array([ data_dict[i] for i in keys ]).T
N = len(X)
# remove damaged signal
ind = array( [(True if re.match( 'OK', data_dict['plasma_status'][i]) else False) for i in range(N)] )
ind &= ~isnan(Y) & ~any(isnan(X) , axis = 1)
# remove outling points (mistakes)
ind &= all(abs(X) <= mquantiles(abs(X[ind,:]), 0.99, axis=0), axis=1)
# remove outling points (mistakes)
ind &= (data_dict['loop_voltage_max'] > 5) & (data_dict['loop_voltage_max'] < 40)
# remove vacuum shots
ind &= data_dict['gas_filling'] == 1
# too low Ucd or Ubd => again some error
ind &= (data_dict['Ucd'] > 10) & (data_dict['Ub'] > 10)
# too long plasma
ind &= (data_dict['plasma_life'] < 17e-3 ) | isnan(data_dict['plasma_life'])
# too saturated trafo => probably false detect
ind &= ((data_dict['transformator_saturation'] < 0.8) & (data_dict['plasma'] == 1)) \
| (data_dict['plasma'] != 1) | isnan(data_dict['transformator_saturation'])
# remove problematic sessions
ind &= array( [ False if re.match('^Technological/(DAS|Technological|Problems|Software|Repairs)',
data_dict['session_name'][i]) else True for i in range(N)] )
data = Data(X[ind,:],Y[ind],shots[ind], keys , [])
# remove too many succefull breakdowns => now there should by only 3x more breakdowns than failures !!
if balance_classed:
ind = (data.Y == 1) & (mod( arange(data.N) , int((0.3/(1-mean(data.Y))))) != 0)
data.remove_data(ind)
return data
def artif_data(data):
""" add artificial data to the clear areas without breakdown => apriory knowledge """
Nvals = 100 # number of the artificial points
Ndim = data.Ndim
MIN = zeros(Ndim)
MAX = zeros(Ndim)
for i in range(Ndim):
MIN[i] = mquantiles(data.X[:,i], 0.01)
MAX[i] = mquantiles(data.X[:,i], 0.99)
def random_data():
R = random.rand(Nvals, Ndim)
R *= (MAX-MIN)
R += MIN
return R
RAN_B = random_data()
RAN_B[:,data.ind('Bfield')] = 0
RAN_P_min = random_data()
# no filling gas
RAN_P_min[:,data.ind('pressure')] = random.random(Nvals)*1
RAN_CD = random_data()
RAN_CD[:,data.ind('Ucd')] = random.random(Nvals)*10
X = vstack([RAN_B, RAN_P_min, RAN_CD])
Y = zeros(len(X)) # no breakdown
return data
def prepare_data(data, allow_artif_data):
""" add some more apriory knowledge, => remove strange shots, recalculate the variables (approximately), remove unomportant dimensions, ... """
ShotNo = data.shots
names = data.names
#data.X[:,data.ind('Tcd')] = data['Tbd']-data['Tcd'] #the most important is the difference CD a BD
# perfrom the logaritmic substitution !! => less than 2.5 is useless (vaccum)
ind = data.ind('pressure')
data.X[data['pressure'] < 2.5, ind] = 3
data.X[:,ind] = log(data['pressure']-2) #tlak
#============== relative calibration of the capacitors for different shots =================== % 0.02 0.08
C = ones(data.N) * 3.9
try:
C[(ShotNo <= 1468) ] = 0.6
C[((ShotNo > 1468) & (ShotNo<= 2876)) ] = 2
C[((ShotNo > 2876) & (ShotNo<= 2918)) ] = 3
C[((ShotNo > 2918) & (ShotNo<= 3305)) ] = 4.2
C[((ShotNo > 3305) & (ShotNo<= 9836)) ] = 3.9
C[((ShotNo > 3305) & (ShotNo<= 9836)) ] = 3.9
C[(ShotNo > 9836) ] = 16
except:
pass
# approximate the magnetic field in the time of maximal breakdown field
data.X[:,data.ind('Ub')] = data['Ub'] *sqrt(C)* sin(255*1e-6*1/sqrt(C) * data['Tcd'])
data.names[data.ind('Ub')] = 'Bfield' # renames the variable ...
if allow_artif_data: # create artificial data in certain areas
data = artif_data(data)
if len(data.norm) == 0:
data.norm = data.get_norm()
data.norm[[1,2], data.ind('Bfield')] *= 3 # allow fasted changes in this dimension
data.normalize(data.norm)
return data
def train(data, data_test):
"""
Call SVM algorithm to make probability prediction of breakdown
"""
print 'Learning'
data = prepare_data(data ,True)
data_test = prepare_data(data_test ,False)
# range of the training prameters for SVM
crange = linspace(10,15,4)
grange = linspace(-15,-7,4)
parameters = {'C':2.0**crange, 'gamma':2.0**grange}
print 'start grid'
print "N points", data.N
print "N dim", data.Ndim
#print data.X, data.Y
machine = svm.SVC(kernel = 'rbf', class_weight ="auto",
cache_size = 800, verbose=False, tol=1e-2 , shrinking=True ) # gaussion kernel
machine = grid_search.GridSearchCV( machine , parameters, n_jobs=-1)
print "Solving ...."
machine.fit( data.X, data.Y, cv=StratifiedKFold(data.Y, 5)) # find the optimal parameters
#save('model_all', machine)
best = svm.SVC(probability=True,C = machine.best_estimator_.C , kernel = 'rbf', gamma = machine.best_estimator_.gamma)
print "=====BEST PARAMETERS========= "
print "C", machine.best_estimator_.C
print "gamma", machine.best_estimator_.gamma
print "=============================="
best.fit(data.X, data.Y)
####################x report #######################
print 'all data'
y_pred = best.predict(data_test.X)
print metrics.classification_report(data_test.Y, y_pred)
save('model', best)
# percents of wrong predict
print "zero-one loss function - train set - %2.1f%%" % (mean(data.Y != best.predict(data.X))*100)
print "zero-one loss function - test set - %2.1f%%" % (mean(data_test.Y != best.predict(data_test.X))*100)
print 'number of SV ', shape(best.support_vectors_)
def test(X, data, machine):
"""
get breakdown probability for the input vector X
class data is used only for normalization, machine contains class of the SVM learning predictor
"""
names = data.names
data = prepare_data(deepcopy(data) ,False) # preprocess data => to get norm of final dataset
data_new = Data(X,[nan],[nan], names, data.norm)
data_new = prepare_data(data_new ,False) # preprocess userselected variables with the same norm
print 'predict'
Y = machine.predict_proba(data_new.X)
dist = machine.decision_function(data_new.X)
probability = int(Y[0,1]*100)
print "breakdown ", bool(machine.predict(data_new.X)),'<= probability of breakdown' , probability, '%'
#print "distance from boundary", double(dist)
return probability, dist
def outliers(data, machine):
data = prepare_data(data ,False)
proba = machine.predict_proba(data.X)[:,1]
print 'expected breakdown but failed #shots:'
print int_(data.shots[(proba > 0.90) & (data.Y == 0)])
print 'unexpected breakdown #shots:'
print int_(data.shots[(proba < 0.2) & (data.Y == 1)])
def plot_data(data, machine):
"""
Plot 2-dimensinal cut through 4-imensional space
"""
med_orig = median(data.X,0)
vars = [ 'pressure', 'Ucd','Ub' ]
const = ARGS[~in1d(vars, ARGS)] # the rest of variables will lbe constant
for i in linspace(amin(data[vars[2]]), amax(data[vars[2]]), 20):
plotting(data,const,vars, i, machine )
def plotting(data, const,vars,zvar, machine):
""" plot cuts through the mutlidimensional space
vars are names of variables in x,y,z axis
zvar is value of variable in Z axis
other variables (hidden) are selected to be equal to mean values
machine - machine learning object
const = variable
"""
print "plotting", vars, zvar
max_distance = 0.5 # number changing maximal distance to plot the points
X = deepcopy(data.X)
MAD = mean(abs(X - mean(X,0)),0) # mean absolute deviation
MEAN = mean(data.X,0)
MEAN[data.ind(vars[2])] = zvar #
Xaxe = linspace(amin(data[vars[0]]), amax(data[vars[0]]), 100)
Yaxe = linspace(amin(data[vars[1]]), amax(data[vars[1]]), 100)
xx, yy = meshgrid(Xaxe, Yaxe)
data_grid = deepcopy(data)
data_norm = prepare_data( deepcopy(data) ,False) # data need to be normalized to the same scale as trainning set
N_grid = size(xx)
data_grid.X = repmat(MEAN,N_grid,1)
data_grid.Y = ones(N_grid)*nan
data_grid.shots = ones(N_grid)*nan
data_grid.N = N_grid
data_grid.norm = data_norm.norm
# inplace variable in X and Y axis !!
data_grid.X[:, data_grid.ind(vars[0])] = xx.ravel()
data_grid.X[:, data_grid.ind(vars[1])] = yy.ravel()
data_grid = prepare_data(data_grid, False)
proba = machine.predict_proba(data_grid.X)[:,1]
proba = proba.reshape(xx.shape)
N = data.N
Ndim = data_grid.Ndim
# find shots close to the cuts
ind = bool_(N)
const_dim = where(in1d(data.names, const) | in1d(data.names, [vars[2]]))[0]
for i in const_dim:
ind = ind*((abs(data.X[:,i] - MEAN[i])/MAD[i])<max_distance)
sim_data = deepcopy(data)
sim_data.remove_data(~ind)
# plot the probability
proba[0,0] = 1
proba[-1,-1] = 0
contourf(xx, yy, proba, 200)
cb = colorbar()
cb.ax.set_ylabel('Probability of breakdown')
contour(xx, yy, proba-0.8, 1, linestyles='dotted')
contour(xx, yy, proba-0.2, 1, linestyles='dotted')
contour(xx, yy, proba-0.5, 1, linewidths=2)
axis([amin(Xaxe), amax(Xaxe), amin(Yaxe), amax(Yaxe)])
if sim_data.N > 0:
X = sim_data.X[:, [data.ind(vars[0]),data.ind(vars[1])]]
i = 0
for i in range(2):
X[:,i] += 0.005*(amax(X[:,i]) - amin(X[:,i]))*squeeze(random.randn(sim_data.N,1))
Y = sim_data.Y
Y = Y[:,None].repeat(3, axis=1) # represents now RGB color
scatter(X[:,0], X[:,1], 20 , Y, edgecolors='none', alpha=0.8 )
title('Probability prediction of breakdown %s = %g' % (vars[2], zvar) )
xlabel(vars[0])
ylabel(vars[1])
savefig('graf_%s_%05d.png' % (vars[2], zvar))
clf()
if __name__ == "__main__":
main()