Handling/DataMining/data_mining/Closest_Shot.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-


"""
===================================================================
Hledání nejbližšího výstřelu
===================================================================
"""

from scipy.stats.mstats import mquantiles

from numpy import *
#from matplotlib.pyplot import *
import sys, getopt, os, re
set_printoptions(precision=4,linewidth=400)
import urllib
from  scipy.stats import nanmedian
from pygolem_lite.sql import load_data
from pygolem_lite import get_current_shot

#./Closest_Shot.py --Ub=500 --Ucd=200 --Ubd=50 --Tbd=0  --Tcd=0.003 --gas_filling=1 --preionization=1 --pressure_request=20



def search_closest(user_vals, names, shots, data, N ):

    #print shape(user_vals), shape(shots), shape(data)

    # přenormovní dat
    M = mean(data,0)
    S = std(data,0)
    user_vals = (user_vals - M)/S
    data -= M
    data /= S
    
    # výpočet vzdálenosti
    distance = data.copy()
    distance -= user_vals
    dist = sqrt(nansum(distance**2,1))

    ind = argsort(dist)
    ind = ind[:N]   # vrátí první N
    dist =  dist[ind]

    return ind, dist

def download_data(shot):
    try:
	return loadtxt('loop_voltage_' + str(shot) )
    except:
	address="http://golem.fjfi.cvut.cz/utils/data/"+str(shot)+"/loop_voltage"
	print "stahuji", address
	data = urllib.urlopen(address)
	data = loadtxt(data)
	savetxt('loop_voltage_' + str(shot), data )
	return data



def prepare_data(diagns):
    """ load the data from disk and remove problematic shots """

    diagns_tmp =  diagns + ['plasma_status', 'plasma','plasma_life', 'loop_voltage_max','transformator_saturation','session_name','Ucd','Ucd','Ub']
    data_dict = load_data(diagns_tmp, arange(5000, get_current_shot()) )

    shots = data_dict['shot']

    # use only keys allowed in Vars
    names = array(diagns)
    X = array([ data_dict[i] for i in names ]).T
    N = len(X)


    # remove damaged  signal
    ind = array( [(True if re.match( 'OK',  data_dict['plasma_status'][i]) else False) for i in range(N)] )
    ind &= ~isnan(data_dict['plasma'])  & ~any(isnan(X) , axis = 1)
    # remove outling points (mistakes)
    ind &=  all(abs(X) <=  mquantiles(abs(X[ind,:]), 0.99, axis=0), axis=1)
    # remove outling points (mistakes)
    ind &= (data_dict['loop_voltage_max'] > 5) & (data_dict['loop_voltage_max'] < 40)
    # remove vacuum shots
    # too low Ucd or Ubd => again some error
    ind &= (data_dict['ucd'] > 10) & (data_dict['ub'] > 10)
    # too long plasma
    ind &= (data_dict['plasma_life'] < 17e-3 ) | isnan(data_dict['plasma_life'])
    # too saturated trafo => probably false detect
    ind &= ((data_dict['transformator_saturation'] < 0.8) & (data_dict['plasma'] == 1)) \
	    | (data_dict['plasma'] != 1) | isnan(data_dict['transformator_saturation'])
    # remove problematic sessions
    ind &= array( [ False if re.match('^Technological/(DAS|Technological|Problems|Software|Repairs)',
		    data_dict['session_name'][i]) else True  for i in range(N)] )

    return shots[ind], X[ind,:], names

def main():

    params = dict()
    for a in sys.argv[1:]:
	params[re.sub("(.+)=(.+)", r"\1", a).lower()] = double(re.sub("(.+)=(.+)", r"\2", a))
    close_shots = get_nearest(params)
    savetxt('closest_shots', close_shots, fmt="%i")


def get_nearest(params):

    shots, data, names = prepare_data( params.keys() )

    user_vals =  array([ d for d in params.itervalues() ]) 

    # ==================  set apriori knowlenge
    if not user_vals[names == 'gas_filling']:
	user_vals[names == 'pressure_request'] = nan
	user_vals[names == 'preionization'] = nan
	
    if user_vals[names == 'Ubd'] == 0:
	user_vals[names == 'Tbd'] = nan
	
    print "============================="
    for i in range(len(names)):
	print "Var: %s = %g " % ( names[i], user_vals[i])
    print "============================="


    # ============ find closest shot
    closest_ind, dist = search_closest(user_vals, names,  shots,  data.copy(), 5 )
    close_shots = shots[closest_ind]
    #close_dist = dist[closest_ind]

    # ================================


    print " closest shots ============================="
    for k,j in enumerate(closest_ind):
	print  "shots: %i    distance %g"% ( shots[j], dist[k])
	for i in range(len(names)):
	    print "     Var: %s = %g " % ( names[i], data[j,i])
    print "============================="

    return close_shots


if __name__ == "__main__":
    main()