SW/pygolem/download.py

#!/usr/bin/env python
import Queue
import threading
import urllib2          # only needed for the optional proxy setup below
import time
import sys
import re

from numpy import load, savez_compressed, DataSource

# Optional: route all downloads through an HTTP proxy
#proxy_handler = urllib2.ProxyHandler({'http': '192.168.250.134:8888'})
#opener = urllib2.build_opener(proxy_handler)
#urllib2.install_opener(opener)

def main():
    """ A very simple data downloader that uses multithreading and direct access
    to the stored data files in order to reach the maximum download speed.
    """

    t = time.time()

    diagn = 'plasma_current'
    shots = range(10000, 11000)
    results = ParallelDownload(diagn, shots, n_threads=10)

    print "saving data"
    for shot, d in results.iteritems():
        print shot
        savez_compressed(diagn + '_' + str(shot), **d)

    print "mean time", (time.time() - t) / len(results)
    
    
    
    
class ThreadUrl(threading.Thread):
    """Worker thread that downloads data files from a queue of URLs."""
    def __init__(self, queue, output_queue, N):
        threading.Thread.__init__(self)
        self.queue = queue              # input queue of (url, shot) pairs
        self.queue_out = output_queue   # output queue of (shot, data) pairs
        self.N = N                      # total number of requested downloads

    def run(self):
        while True:
            # rough progress indicator based on the number of finished downloads
            n = self.queue_out.qsize()
            sys.stdout.write('\rdownloaded: %3.0f %%' % (100.0 * n / self.N))
            sys.stdout.flush()

            # grab the next url from the queue (blocks until one is available)
            host, shot = self.queue.get()
            d = DataSource()
            try:
                # DataSource fetches the remote file into a local cache and
                # returns an open file object that numpy.load can read directly
                f = load(d.open(host))
                self.queue_out.put((shot, f))
            except IOError:
                print host + ' was not found'
            finally:
                self.queue.task_done()


def ParallelDownload(diagn, shots, n_threads=5):
    """Download the diagnostic `diagn` for all `shots` in parallel and return
    a dict mapping shot number to the loaded data. Shots that cannot be
    downloaded are reported and skipped.
    """

    url_0 = get_address(diagn)

    start = time.time()

    links = [url_0 % s for s in shots]

    queue = Queue.Queue()
    output_queue = Queue.Queue()

    # be polite to the server - do not open too many parallel connections
    if n_threads > 10:
        print 'n_threads decreased to 10 to avoid hammering the server'
        n_threads = 10

    # spawn a pool of daemon threads and pass them the queue instances
    for i in range(n_threads):
        t = ThreadUrl(queue, output_queue, len(links))
        t.setDaemon(True)
        t.start()

    # populate the queue with (url, shot) pairs
    for host, shot in zip(links, shots):
        queue.put((host, shot))

    # wait on the queue until everything has been processed
    queue.join()
    print "\nmean time: %2.3fs" % ((time.time() - start) / len(shots))

    # collect the downloaded data
    results = {}
    while not output_queue.empty():
        shot, data = output_queue.get()
        results[shot] = data

    return results
    

def get_address(diagnostics):
    """Build the URL template of the given diagnostics, with the shot number
    replaced by a '%d' placeholder.
    """
    # read the full URL of the diagnostics data file from its '.link' file
    address = DataSource().open('http://golem.fjfi.cvut.cz/utils/data/0/' + diagnostics + '.link').read()
    # read the corresponding shot number and replace it by a '%d' placeholder
    ShotNo = int(DataSource().open('http://golem.fjfi.cvut.cz/shots/0/ShotNo').read())
    address = re.sub(r'/' + str(ShotNo) + '/', r'/%d/', address)
    return address
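
# Example usage from another script (a minimal sketch: the module name
# `download`, the diagnostic 'loop_voltage' and the shot range are only
# illustrative assumptions; each downloaded file is expected to be an .npz
# archive, as in main() above):
#
#   from download import ParallelDownload
#
#   data = ParallelDownload('loop_voltage', range(9000, 9050), n_threads=5)
#   for shot, arrays in data.iteritems():
#       print shot, arrays.files    # NpzFile lists the names of the stored arrays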




if __name__ == '__main__':
    main()