#!/usr/bin/env python
# coding: utf-8

# # Level 3 Data Conversion

# In[2]:


import pandas as pd
import time
import pathlib
import numpy as np


# In[3]:


def get_entries(startyear, rangeyear, startdoy, rangedoy):
    """Build the level-1 .pha/.sci file paths for a range of years and days.

    Parameters
    ----------
    startyear : int
        First year to convert (4-digit).
    rangeyear : int
        Number of consecutive years (1 = just the startyear).
    startdoy : int
        First day of year.
    rangedoy : int
        Number of consecutive days (days past the year's end simply yield
        paths that fail to open later and are skipped).

    Returns
    -------
    (list[pathlib.Path], list[pathlib.Path])
        The pha paths and the sci paths, in matching order.
    """
    # Use local lists: the previous version appended to module-level
    # pha_paths/sci_paths, which raised NameError when called standalone and
    # accumulated entries across repeated calls.
    pha_paths = []
    sci_paths = []
    for year in range(startyear, startyear + rangeyear):
        yy = str(int(year) % 100).zfill(2)
        # Files from years 1995-1999 carry the 'eph' prefix, later ones 'epi'.
        prefix = 'eph' if 95 <= int(year) % 100 <= 99 else 'epi'
        for doy in range(startdoy, startdoy + rangedoy):
            name = yy + str(doy).zfill(3)
            pha_paths.append(pathlib.Path('/data/missions/soho/costep/level1/pha/' + str(year) + '/' + prefix + name + '.pha'))
            sci_paths.append(pathlib.Path('/data/missions/soho/costep/level1/sci/' + str(year) + '/' + prefix + name + '.sci'))

    return pha_paths, sci_paths


# In[29]:


def load_data_local():
    """Read the level-1 pha and sci files for the current entry.

    Relies on module-level state set by the main loop: `i` (index into the
    path lists), `name` (the 'YYDDD' file tag), and `pha_paths`/`sci_paths`.

    Returns (data_pha, data_sci), both sorted ascending by msod.

    Error/skip behaviour (intentional, relied upon by the caller):
    - For the two special days 96305 and 17277 the bare `return` yields None;
      the caller must handle that case separately.
    - If a file cannot be opened, the IOError handler only prints a message,
      so the final return raises UnboundLocalError -- the main loop catches
      exactly that exception to skip the day.
    """
    #GET DATA LOCALLY
    names_pha = ['year', 'doy', 'msod', 'min since launch', 'coinc', 'a_seg', 'b_seg', 'A', 'A_flag', 'B', 'B_flag', 'C', 'C_flag', 'D','D_flag','E','E_flag','prio','status']
    names_sci = ['year','doy','msod','B0', 'P4GM', 'P4GR', 'P4S', 'P8GM', 'P8GR', 'P8S', 'H4GM', 'H4GR', 'H4S1', 'H4S23', 'H8GM', 'H8GR', 'H8S1', 'H8S23', 'E150', 'E300', 'E1300', 'E3000', 'INT', 'P25GM', 'P25GR', 'P25S', 'P41GM', 'P41GR', 'P41S', 'H25GM', 'H25GR', 'H25S1', 'H25S23', 'H41GM', 'H41GR', 'H41S1', 'H41S23'] 

    if int(name) == 96305 or int(name) == 17277: #do those separately (because FME and FMD occur there for the first time)
        return
    try:
        data_pha = pd.read_csv(pha_paths[i],sep=' ', names=names_pha).sort_values(['msod'],ascending=True)
        # only a subset of the sci columns is needed; usecols picks them out
        data_sci = pd.read_csv(sci_paths[i],sep=' ', usecols=[0,1,2,12, *range(22,55)], names=names_sci).sort_values(['msod'],ascending=True)
    except IOError:
            print('There was an error opening the file for ' + name)
    return data_pha, data_sci


# In[5]:


def complete_conversion(name, year, doy, data_pha, data_sci):
    """Run the full level-1 -> level-3 conversion chain for one day of data.

    Parameters: `name` is the 'YYDDD' file tag (a string with the current
    caller), `year` and `doy` are strings taken from the file path, and
    `data_pha`/`data_sci` are the msod-sorted level-1 dataframes from
    load_data_local().  Writes the level-3 file (and, outside test mode, the
    overflow file) as a side effect; prints a message for empty input.
    """
    if len(data_pha) != 0:
        #SET OVERFLOW FLAG
        data_pha = set_overflow_flag(data_pha)

        #REMOVE CONSECUTIVE DUPLICATES (MORE THAN 3 IN A ROW)
        data_pha = remove_duplicates(data_pha)
        
        #DELETE UNREALISTIC SEGMENTS IN DETECTORS A AND B (>5)
        data_pha = delete_segments(data_pha)
        
        #CALCULATE ENERGIES FROM PHAs
        data_pha = calc_energies(data_pha)

        #DAY, MONTH
        data_pha = data_pha.reset_index(drop=True)
        day, month = get_date(data_pha)
        data_pha['month'] = month
        data_pha['day'] = day

        #RING ON/OFF FLAG
        data_pha = ring_set(data_pha)

        #NUMBER COINCIDENCES IN TIMEFRAME + NUMBER PARTICLES WITH SAID COINCIDENCE IN SCI FILE
        # NOTE(review): `name` is a string with the current caller, so this
        # comparison against ints is always False and the branch never runs
        # (presumably `int(name)` was intended).  If it did run, `channels`
        # would be undefined for the coinc_counts() call below -- confirm the
        # intended control flow before enabling it.
        if name == 96305 or name == 17277:
            data_pha = coinc_special(data_pha, data_sci)
        elif int(year) < 1996 or (int(year) == 1996 and int(doy) < 305): #NO FAILURE MODES!
            channels = ['E150', 'E300', 'E1300', 'E3000', 'P4', 'P8', 'P25', 'P41', 'H4', 'H8', 'H25', 'H41', 'INT'] 
        elif (int(year) == 1996 and int(doy) > 305) or 1996 < int(year) < 2017 or (int(year) == 2017 and int(doy) < 277): #FME
            # after failure mode E, the highest E/P/H channels collapse into the next-lower ones
            channels = ['E150', 'E300', 'E1300', 'E1300', 'P4', 'P8', 'P25', 'P25', 'H4', 'H8', 'H25', 'H25', 'INT'] 
        else: #FMD and FME
            channels = ['E150', 'E1300', 'E1300', 'E1300', 'P4', 'P25', 'P25', 'P25', 'H4', 'H25', 'H25', 'H25', 'INT']
            nopha = [1,2,5,6,9,10] #coincidences where no pha words should occur
            data_pha = data_pha.drop(data_pha[data_pha['coinc'].isin(nopha)].index)
              
        data_pha = coinc_counts(data_pha, data_sci, channels)
        
        #ADD B0 COUNTS FROM SCI FILE
        data_pha = add_b0(data_pha, data_sci)
        
        #WRITE IN DATA FILES
        data_pha = write_data(name, year, doy, data_pha) #data_hsk
        write_overflow_file(name, year, data_pha)
    else:
        print("EMPTY FOR " + name)

# In[6]:


def get_date(data_pha):
    """Return (day, month) of the first record, derived from its year and
    day-of-year columns."""
    origin_year = str(data_pha.at[0, 'year'])
    # doy is 1-based while the origin is Jan 1st, hence the -1
    stamp = pd.to_datetime(data_pha.at[0, 'doy'] - 1, unit='D', origin=origin_year)
    return stamp.day, stamp.month


# In[7]:


def delete_segments(data_pha):
    """Drop every record sharing a timestamp (msod) with a record that has an
    unrealistic segment number (>5) in detector A or B."""
    unrealistic = (data_pha['a_seg'] > 5) | (data_pha['b_seg'] > 5)
    bad_times = set(data_pha.loc[unrealistic, 'msod'])
    keep = ~data_pha['msod'].isin(bad_times)
    return data_pha[keep]


# In[8]:


def remove_duplicates(data_pha):
    """Remove runs of more than 3 identical consecutive PHA records.

    Rows repeated more than 3 times in a row are treated as artefacts: every
    record sharing the timestamp (msod) of such a run is dropped -- unless the
    run carries a non-zero overflow flag, in which case it is kept (saturated
    channels can legitimately produce identical repeats).

    Fixes over the previous version:
    - every qualifying run is removed, not only the first one found
      (the old code returned inside the loop);
    - a dataframe is always returned (the old code fell off the end and
      returned None when the only long runs had a non-zero overflow flag).
    """
    names_pha = ['year', 'doy', 'msod', 'min since launch', 'coinc', 'a_seg', 'b_seg', 'A', 'A_flag', 'B', 'B_flag', 'C', 'C_flag', 'D', 'D_flag', 'E', 'E_flag', 'prio', 'status', 'overflow']
    # Label runs of consecutive identical values per column; rows of a run of
    # fully identical records share the same label in every column.
    run_id = data_pha.diff().ne(0).cumsum()
    run_sizes = run_id.groupby(names_pha).size()
    long_runs = run_sizes[run_sizes > 3]

    drop_times = []
    for key in long_runs.index:
        key_series = pd.Series(key, index=names_pha)
        members = run_id[(run_id[names_pha] == key_series).all(axis=1)]
        first = members.index[0]
        # Overflow runs are kept deliberately.
        if data_pha.loc[first, 'overflow'] == 0:
            drop_times.append(data_pha.loc[first, 'msod'])

    if drop_times:
        data_pha = data_pha.drop(data_pha[data_pha['msod'].isin(drop_times)].index)
    return data_pha


# In[9]:


def set_overflow_flag(data_pha):
    """Add an integer 'overflow' column encoding which detectors saturated.

    A raw PHA value of 1023 marks saturation; each detector contributes a
    distinct digit value so the flags can be decoded from the sum:
    A=10000, B=2000, C=300, D=40, E=5.
    """
    contribution = {'A': 10000, 'B': 2000, 'C': 300, 'D': 40, 'E': 5}
    total = pd.Series(0, index=data_pha.index)
    for detector, value in contribution.items():
        total = total + (data_pha[detector] == 1023) * value
    data_pha['overflow'] = total.astype(int)
    return data_pha


# In[10]:


def calc_energies(data_pha):
	"""Convert raw PHA channel values of detectors A-E to energies in place.

	For each detector the raw value is multiplied by the low-gain or
	high-gain scale (chosen by the corresponding *_flag column: 0 = low,
	1 = high) divided by 1.023, and the flag column is replaced by a
	'FS_<det>' column holding the full-scale value (scale * 1000).

	Uses the module-level `test` flag: when True the Sierks conversion
	scales are applied instead of the calibrated ones (see the
	configuration cell).  Returns the modified dataframe.
	"""
	#ENERGIES
	det = ['A','B','C','D','E']
	if test:
		# [low-gain scales per detector], [high-gain scales per detector]
		scales = [[3,3,16.07,20,20],[30,45,166.7,219.7,219.7]]
	else:
		scales = [[3.153,3.067,15.321,20.762,22.387],[31.53,46.01,158.93,233.57,251.85]]
	
	for i in range(len(det)):
		
		# Disjoint row subsets: flag==0 rows get the low scale, flag==1 the high one.
		low = data_pha[data_pha[det[i]+'_flag']==0][det[i]]*scales[0][i]/1.023
		high = data_pha[data_pha[det[i]+'_flag']==1][det[i]]*scales[1][i]/1.023
		data_pha['low'] =low
		data_pha['high']=high
		# Each row is NaN in exactly one of the two helper columns, so the
		# row-wise sum merges them back into a single energy column.
		data_pha[det[i]]=data_pha[['low','high']].sum(1).round(decimals=2) 
		del data_pha['low']
		del data_pha['high']
	
		FS = data_pha[det[i]+'_flag']==1
		# NOTE(review): pd.Series(<generator>) gets a fresh 0..n-1 RangeIndex,
		# and column assignment aligns on index.  data_pha was sorted by msod
		# upstream without a reset_index before this call, so if its index is
		# not 0..n-1 the full-scale values may be misplaced or become NaN --
		# confirm against real data.
		data_pha[det[i]+'_flag'] = pd.Series(scales[j][i]*1000 for j in FS.astype(int))
		data_pha.rename(columns = {det[i]+'_flag':'FS_'+det[i]}, inplace = True)
	#print(data_pha)
	
	return data_pha


# In[11]:


def ring_set(data_pha):
    """Add an integer 'ring' column: 1 when bit 1 of the status word is set
    (ring on), else 0."""
    ring_bit = (data_pha['status'] & 2) // 2
    data_pha['ring'] = ring_bit.astype(int)
    return data_pha


# In[12]:


def coinc_counts(data_pha, data_sci, channels):
    """Attach per-record coincidence counts from both the pha and sci data.

    Adds to `data_pha`:
    - 'coinc_pha': how many pha words share this record's (msod, coinc);
    - 'coinc_sci': the sci-file counter of the channel mapped to this
      record's coincidence id via `channels` (index = coincidence id);
    plus the merged sci channel columns themselves.

    Side effect: the combined directional P*/H* sum columns are added to
    `data_sci` in place.  Rows with any NaN (times missing from the sci file)
    are dropped.  Returns the enriched dataframe.

    Cleanup vs. the previous version: two no-op renames were removed --
    rename(columns={'0': ...}) never matched (the groupby size column is the
    int 0, not the string '0') and rename(columns={'size': ...}) was dead
    after the .columns assignment.
    """
    #COMBINE DIRECTIONS PROTON AND HELIUM CHANNELS TO ONE CHANNEL
    data_sci['P4'] = data_sci['P4GM'] + data_sci['P4GR'] + data_sci['P4S']
    data_sci['H4'] = data_sci['H4GM'] + data_sci['H4GR'] + data_sci['H4S1'] + data_sci['H4S23']
    data_sci['P8'] = data_sci['P8GM'] + data_sci['P8GR'] + data_sci['P8S']
    data_sci['H8'] = data_sci['H8GM'] + data_sci['H8GR'] + data_sci['H8S1'] + data_sci['H8S23']
    data_sci['P25'] = data_sci['P25GM'] + data_sci['P25GR'] + data_sci['P25S']
    data_sci['H25'] = data_sci['H25GM'] + data_sci['H25GR'] + data_sci['H25S1'] + data_sci['H25S23']
    data_sci['P41'] = data_sci['P41GM'] + data_sci['P41GR'] + data_sci['P41S']
    data_sci['H41'] = data_sci['H41GM'] + data_sci['H41GR'] + data_sci['H41S1'] + data_sci['H41S23']

    #FOR COUNTS OF PHA-WORDS: GROUPBY MSOD AND COINC, CALCULATE SIZE, WRITE IN ORIGINAL DATAFRAME
    data_grouped = data_pha.groupby(['msod', 'coinc']).size().reset_index(name='coinc_pha')
    data_pha = pd.merge(data_grouped, data_pha, on=['msod', 'coinc'], how='right')

    #FOR COUNTS IN SCI-FILE: MERGE SCI FILES WITH PHA FILES, WRITE CORRESPONDING COINCIDENCE COUNT IN 'COINC_SCI' COLUMN IN ORIGINAL DATAFRAME
    sci_columns = ['year','doy','msod','E150','E300','E1300','E3000','P4','P8','P25','P41','H4','H8','H25','H41','INT']
    data_pha = pd.merge(data_sci[sci_columns], data_pha, on=['year','doy','msod'], how='right')
    data_pha['coinc_sci'] = 0
    for coinc_id, channel in enumerate(channels):
        data_pha.loc[data_pha.coinc == coinc_id, 'coinc_sci'] = data_pha[channel]

    #IN CASE THERE ARE MISSING TIMES IN SCI-FILE -> NaN WILL BE PUT INTO PLACE (MUST BE DELETED)
    data_pha = data_pha.dropna(how='any')

    #IN CASE FOR SOME REASON, FLOATS ARE CREATED
    data_pha['coinc_sci'] = data_pha['coinc_sci'].astype(int)
    data_pha['coinc_pha'] = data_pha['coinc_pha'].astype(int)

    return data_pha


# In[23]:


def add_b0(data_pha, data_sci):
    """Attach the B0 counter from the sci file to every pha record, matched on
    (year, doy, msod)."""
    b0_columns = data_sci[['year', 'doy', 'msod', 'B0']]
    merged = pd.merge(b0_columns, data_pha, on=['year', 'doy', 'msod'], how='right')
    return merged


# In[14]:


def coinc_special(data_pha, data_sci):
    """Placeholder for the special-case coincidence handling of days 96305 and
    17277 (first occurrence of the failure modes).  Not implemented yet: the
    pha data is returned untouched."""
    print("SPECIAL")
    return data_pha


# In[15]:


def write_data(name, year, doy, data_pha):
    """Write the converted day of data as a level-3 file and return it.

    Reorders the columns into the fixed level-3 layout, then writes the file
    (space-separated, no header/index).  The output location depends on the
    module-level `test` flag.  Returns the reindexed dataframe so the caller
    can pass it on to write_overflow_file().
    """
    #REINDEX: fixed column order of the level-3 product
    data_pha = data_pha.reindex(columns = ['year','month','day','doy','msod','coinc','a_seg','b_seg','prio','A','FS_A','B','FS_B','C','FS_C','D','FS_D','E','FS_E','status','ring','coinc_sci','coinc_pha','overflow'])

    #WRITE IN FILE
    dataname = 'eph' + str(name) + '.l3pha'
    if test:
        #ENTER YOUR WANTED PATH HERE
        path = '/home/asterix/martens/Desktop/EPHIN/data_val/' + dataname
        #data_pha = data_pha.drop(data_pha[data_pha['coinc'] == 12].index)
    else:
        # str(year): the current caller passes a string, but this also keeps
        # the concatenation from raising TypeError for an int year.
        path = '/data/projects/soho/ephin/level3/janna/' + str(year) + '/' + dataname

    data_pha.to_csv(path, index = None, header = None, sep = ' ')
    print("Done with " + str(name))

    return data_pha


# In[16]:


def write_overflow_file(name, year, data_pha):
    """Append every record with a non-zero overflow flag to the shared
    overflow file.

    Skipped entirely in test mode: the file is opened in append mode ('a'),
    so test runs must not write to it.
    """
    #ONLY WRITE OVERFLOW FILE IF IT IS NOT A TEST (BC MODE = 'A')
    if test:
        return
    overflow_rows = data_pha.loc[data_pha['overflow'] != 0]
    overflow_path = '/data/projects/soho/ephin/level3/janna/overflow.l3pha'
    overflow_rows.to_csv(overflow_path, index = None, header = None, sep = ' ', mode = 'a')


# ### Enter your wanted range of data here

# In[25]:


# --- Conversion configuration ---
startyear = 1995  # first year to convert (4-digit)
rangeyear = 1 #range of 1 equals just the startyear
startdoy = 1  # first day of year to convert
rangedoy = 370  # number of days; days past the year's end yield files that fail to open and are skipped

#if test: Sierks conversion scales, no overflow file written, other path
test = False


# ### Run this cell to convert data

# In[30]:


#start measuring computing time
startTime = time.time()

# Module-level path lists: get_entries() builds the pha/sci file paths for
# the configured year/day range.
pha_paths = []
sci_paths = []
pha_paths, sci_paths = get_entries(startyear, rangeyear, startdoy, rangedoy)

# NOTE: load_data_local() reads the loop index `i` and the file tag `name`
# as module-level globals, so the index-based loop must stay as-is.
for i in range(len(pha_paths)):
    year = pha_paths[i].parent.parts[7]  # path layout: /data/missions/soho/costep/level1/pha/<year>/...
    name = pha_paths[i].stem[3:]         # strip the 'eph'/'epi' prefix -> 'YYDDD'
    doy = name[2:5]
    try:
        data_pha, data_sci = load_data_local()
    except UnboundLocalError:
        # file missing/unreadable -- load_data_local() already printed a message
        continue
    except TypeError:
        # load_data_local() returned None for the special days 96305/17277;
        # the bare unpack raises TypeError, which previously crashed the run.
        continue

    complete_conversion(name, year, doy, data_pha, data_sci)

#check how long that took
executionTime = (time.time() - startTime)
print('Execution time in seconds: ' + str(executionTime)) 


# In[ ]:




