import pandas as pd
import json
import sys

# Expect exactly four arguments: config file, offload target, model type, data split.
if len(sys.argv) != 5:
    print('Invalid arguments. Usage:')
    print('python data_convertor.py config.json [edge|cloud_rsu|cloud_gsm] [classifier|regression] [train|test]')
    sys.exit(1)
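
# Example invocation (illustrative; assumes this script is saved as data_convertor.py):
#   python data_convertor.py config.json edge classifier train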

# Load the simulation settings from the JSON config file.
with open(sys.argv[1]) as json_data_file:
    data = json.load(json_data_file)

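# Illustrative config.json matching the keys read below (all values made up):
# {
#     "sim_result_folder": "sim_results",
#     "num_iterations": 10,
#     "train_data_ratio": 80,
#     "min_vehicle": 100,
#     "max_vehicle": 500,
#     "vehicle_step_size": 100
# }
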
target = sys.argv[2]    # edge | cloud_rsu | cloud_gsm
method = sys.argv[3]    # classifier | regression
datatype = sys.argv[4]  # train | test

print("conversion started with args " + target + ", " + method + ", " + datatype)

sim_result_folder = data["sim_result_folder"]
num_iterations = data["num_iterations"]
train_data_ratio = data["train_data_ratio"]  # percentage of iterations used for training
min_vehicle = data["min_vehicle"]
max_vehicle = data["max_vehicle"]
vehicle_step_size = data["vehicle_step_size"]

# Map the offload target to the decision label used in the simulation output.
def getDecisionColumnName(target):
    if target == "edge":
        COLUMN_NAME = "EDGE"
    elif target == "cloud_rsu":
        COLUMN_NAME = "CLOUD_VIA_RSU"
    elif target == "cloud_gsm":
        COLUMN_NAME = "CLOUD_VIA_GSM"
    else:
        raise ValueError("unknown target: " + target)
    return COLUMN_NAME

# Feature columns (plus the class label "Result") used for the classifier data set.
def getClassifierColumns(target):
    if target == "edge":
        result = ["NumOffloadedTask", "TaskLength", "WLANUploadDelay", "WLANDownloadDelay", "AvgEdgeUtilization", "Result"]
    elif target == "cloud_rsu":
        result = ["NumOffloadedTask", "WANUploadDelay", "WANDownloadDelay", "Result"]
    elif target == "cloud_gsm":
        result = ["NumOffloadedTask", "GSMUploadDelay", "GSMDownloadDelay", "Result"]
    else:
        raise ValueError("unknown target: " + target)
    return result

# Feature columns (plus the regression target "ServiceTime") used for regression.
def getRegressionColumns(target):
    if target == "edge":
        result = ["TaskLength", "AvgEdgeUtilization", "ServiceTime"]
    elif target == "cloud_rsu":
        result = ["TaskLength", "WANUploadDelay", "WANDownloadDelay", "ServiceTime"]
    elif target == "cloud_gsm":
        result = ["TaskLength", "GSMUploadDelay", "GSMDownloadDelay", "ServiceTime"]
    else:
        raise ValueError("unknown target: " + target)
    return result

# Z-score normalization: shift to zero mean and scale to unit standard deviation.
def znorm(column):
    column = (column - column.mean()) / column.std()
    return column

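# Illustrative example: znorm(pd.Series([1.0, 2.0, 3.0])) yields [-1.0, 0.0, 1.0],
# since the mean is 2.0 and the (sample) standard deviation is 1.0.
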
data_set = []

# Iterations before this index feed the training set; the rest feed the test set.
testDataStartIndex = (train_data_ratio * num_iterations) / 100

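# With the illustrative values above (train_data_ratio=80, num_iterations=10),
# testDataStartIndex is 8.0, so iterations 0-7 are "train" and 8-9 are "test".
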
# Read one simulation output file per (iteration, vehicle count) pair,
# keeping only the iterations that belong to the requested split.
for ite in range(num_iterations):
    for vehicle in range(min_vehicle, max_vehicle+1, vehicle_step_size):
        if (datatype == "train" and ite < testDataStartIndex) or (datatype == "test" and ite >= testDataStartIndex):
            file_name = sim_result_folder + "/ite" + str(ite + 1) + "/" + str(vehicle) + "_learnerOutputFile.cvs"
            df = pd.read_csv(file_name, na_values="?", comment='\t', sep=",")
            df['VehicleCount'] = vehicle
            #print(file_name)
            data_set.append(df)

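# With the illustrative config above, the first file read for the "train" split
# would be sim_results/ite1/100_learnerOutputFile.cvs (shown for orientation only).
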
data_set = pd.concat(data_set, ignore_index=True)
# Keep only the rows where the orchestrator actually chose the requested target.
data_set = data_set[data_set['Decision'] == getDecisionColumnName(target)]

if method == "classifier":
    targetColumns = getClassifierColumns(target)
else:
    targetColumns = getRegressionColumns(target)

# For training data, print descriptive statistics so the normalization constants
# can be transferred to the Java side.
if datatype == "train":
    print("##############################################################")
    print("Stats for " + target + " - " + method)
    print("Please use the relevant information from the table below on the Java side:")
    train_stats = data_set[targetColumns].describe()
    train_stats = train_stats.transpose()
    print(train_stats)
    print("##############################################################")

#print("balancing " + target + " for " + method)
|
|
|
|
# BALANCE DATA SET
if method == "classifier":
    # Split by class, then cap every VehicleCount group at the same sample size
    # so neither class dominates the training data.
    df0 = data_set[data_set['Result']=="fail"]
    df1 = data_set[data_set['Result']=="success"]

    #size = min(len(df0[df0['VehicleCount']==max_vehicle]), len(df1[df1['VehicleCount']==min_vehicle]))

    size = len(df0[df0['VehicleCount']==max_vehicle]) // 2

    df1 = df1.groupby('VehicleCount').apply(lambda x: x if len(x) < size else x.sample(size))
    df0 = df0.groupby('VehicleCount').apply(lambda x: x if len(x) < size else x.sample(size))

    data_set = pd.concat([df0, df1], ignore_index=True)
else:
    # For regression, only successful tasks have a meaningful service time;
    # cap each VehicleCount group so no vehicle count dominates.
    data_set = data_set[data_set['Result'] == 'success']

    #size = min(len(data_set[data_set['VehicleCount']==min_vehicle]), len(data_set[data_set['VehicleCount']==max_vehicle]))

    size = len(data_set[data_set['VehicleCount']==max_vehicle]) // 3
    data_set = data_set.groupby('VehicleCount').apply(lambda x: x if len(x.index) < size else x.sample(size))

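# Illustrative effect of the balancing step (numbers made up): if the "fail"
# class has 1000 rows at max_vehicle, size becomes 500 for the classifier and
# every (class, VehicleCount) group is downsampled to at most 500 rows.
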
# EXTRACT RELATED ATTRIBUTES
# Copy the label columns unchanged; z-normalize every feature column.
df = pd.DataFrame(columns=targetColumns)
for column in targetColumns:
    if column == 'Result' or column == 'ServiceTime':
        df[column] = data_set[column]
    else:
        df[column] = znorm(data_set[column])

# Write the result as an ARFF file that Weka can load directly.
with open(sim_result_folder + "/" + target + "_" + method + "_" + datatype + ".arff", 'w') as f:
    f.write('@relation ' + target + '\n\n')
    for column in targetColumns:
        if column == 'Result':
            f.write('@attribute class {fail,success}\n')
        else:
            f.write('@attribute ' + column + ' REAL\n')
    f.write('\n@data\n')
    df.to_csv(f, header=False, index=False)

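# Illustrative output for target=edge, method=classifier (data row made up):
#   @relation edge
#
#   @attribute NumOffloadedTask REAL
#   @attribute TaskLength REAL
#   @attribute WLANUploadDelay REAL
#   @attribute WLANDownloadDelay REAL
#   @attribute AvgEdgeUtilization REAL
#   @attribute class {fail,success}
#
#   @data
#   -0.42,1.07,0.11,-0.93,0.58,success
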
print ("##############################################################")
|
|
print ("Operation completed!")
|
|
print (".arff file is generated for weka.")
|
|
print ("##############################################################")
|
|
|