edgecloudsim/scripts/sample_app5/ai_trainer/data_convertor.py

import pandas as pd
import json
import sys
if len(sys.argv) != 5:
    print('Invalid arguments. Usage:')
    print('python data_convertor.py config.json [edge|cloud_rsu|cloud_gsm] [classifier|regression] [train|test]')
    sys.exit(1)
with open(sys.argv[1]) as json_data_file:
    data = json.load(json_data_file)
target = sys.argv[2]
method = sys.argv[3]
datatype = sys.argv[4]
print("conversion started with args " + target + ", " + method + ", " + datatype)
sim_result_folder = data["sim_result_folder"]
num_iterations = data["num_iterations"]
train_data_ratio = data["train_data_ratio"]
min_vehicle = data["min_vehicle"]
max_vehicle = data["max_vehicle"]
vehicle_step_size = data["vehicle_step_size"]
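
# Helper functions: map the offloading target to the value stored in the
# simulator's 'Decision' column and select the feature columns used for the
# requested target/method combination.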
def getDecisionColumnName(target):
    if target == "edge":
        COLUMN_NAME = "EDGE"
    elif target == "cloud_rsu":
        COLUMN_NAME = "CLOUD_VIA_RSU"
    elif target == "cloud_gsm":
        COLUMN_NAME = "CLOUD_VIA_GSM"
    return COLUMN_NAME
def getClassifierColumns(target):
    if target == "edge":
        result = ["NumOffloadedTask", "TaskLength", "WLANUploadDelay", "WLANDownloadDelay", "AvgEdgeUtilization", "Result"]
    elif target == "cloud_rsu":
        result = ["NumOffloadedTask", "WANUploadDelay", "WANDownloadDelay", "Result"]
    elif target == "cloud_gsm":
        result = ["NumOffloadedTask", "GSMUploadDelay", "GSMDownloadDelay", "Result"]
    return result
def getRegressionColumns(target):
    if target == "edge":
        result = ["TaskLength", "AvgEdgeUtilization", "ServiceTime"]
    elif target == "cloud_rsu":
        result = ["TaskLength", "WANUploadDelay", "WANDownloadDelay", "ServiceTime"]
    elif target == "cloud_gsm":
        result = ["TaskLength", "GSMUploadDelay", "GSMDownloadDelay", "ServiceTime"]
    return result
def znorm(column):
    column = (column - column.mean()) / column.std()
    return column
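
# Collect the per-iteration simulator outputs. Iterations below
# testDataStartIndex (i.e. the first train_data_ratio percent) go into the
# training set, the remaining iterations into the test set.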
data_set = []
testDataStartIndex = (train_data_ratio * num_iterations) / 100
for ite in range(num_iterations):
    for vehicle in range(min_vehicle, max_vehicle+1, vehicle_step_size):
        if (datatype == "train" and ite < testDataStartIndex) or (datatype == "test" and ite >= testDataStartIndex):
            file_name = sim_result_folder + "/ite" + str(ite + 1) + "/" + str(vehicle) + "_learnerOutputFile.cvs"
            df = [pd.read_csv(file_name, na_values="?", comment='\t', sep=",")]
            df[0]['VehicleCount'] = vehicle
            #print(file_name)
            data_set += df
data_set = pd.concat(data_set, ignore_index=True)
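
# Keep only the records whose offloading decision matches the requested target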
data_set = data_set[data_set['Decision'] == getDecisionColumnName(target)]
if method == "classifier":
targetColumns = getClassifierColumns(target)
else:
targetColumns= getRegressionColumns(target)
if datatype == "train":
print ("##############################################################")
print ("Stats for " + target + " - " + method)
print ("Please use relevant information from below table in java side:")
train_stats = data_set[targetColumns].describe()
train_stats = train_stats.transpose()
print(train_stats)
print ("##############################################################")
#print("balancing " + target + " for " + method)
#BALANCE DATA SET
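# Classifier data: cap every (Result, VehicleCount) group at the same size so
# that success samples do not overwhelm the rarer fail samples.
# Regression data: keep only successful tasks and cap each VehicleCount group.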
if method == "classifier":
df0 = data_set[data_set['Result']=="fail"]
df1 = data_set[data_set['Result']=="success"]
#size = min(len(df0[df0['VehicleCount']==max_vehicle]), len(df1[df1['VehicleCount']==min_vehicle]))
size = len(df0[df0['VehicleCount']==max_vehicle]) // 2
df1 = df1.groupby('VehicleCount').apply(lambda x: x if len(x) < size else x.sample(size))
df0 = df0.groupby('VehicleCount').apply(lambda x: x if len(x) < size else x.sample(size))
data_set = pd.concat([df0, df1], ignore_index=True)
else:
data_set = data_set[data_set['Result'] == 'success']
#size = min(len(data_set[data_set['VehicleCount']==min_vehicle]), len(data_set[data_set['VehicleCount']==max_vehicle]))
size = len(data_set[data_set['VehicleCount']==max_vehicle]) // 3
data_set = data_set.groupby('VehicleCount').apply(lambda x: x if len(x.index) < size else x.sample(size))
#EXTRACT RELATED ATTRIBUTES
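# Result and ServiceTime are written out unchanged; all other attributes are z-normalized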
df = pd.DataFrame(columns=targetColumns)
for column in targetColumns:
    if column == 'Result' or column == 'ServiceTime':
        df[column] = data_set[column]
    else:
        df[column] = znorm(data_set[column])
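
# Write the Weka .arff file: an attribute declaration header followed by the data rows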
f = open(sim_result_folder + "/" + target + "_" + method + "_" + datatype + ".arff", 'w')
f.write('@relation ' + target + '\n\n')
for column in targetColumns:
    if column == 'Result':
        f.write('@attribute class {fail,success}\n')
    else:
        f.write('@attribute ' + column + ' REAL\n')
f.write('\n@data\n')
df.to_csv(f, header=False, index=False)
f.close()
print ("##############################################################")
print ("Operation completed!")
print (".arff file is generated for weka.")
print ("##############################################################")