import numpy as np
import math
import os.path
from pathlib import Path
import glob
from tensorflow.keras.utils import Sequence
# Packet-shape and scaling constants. None of the next three names is used in
# the visible part of this file, so the notes on them are assumptions.
FIXED_PACKET_SIZE = 1500  # presumably the number of bytes kept per packet -- TODO confirm
NUM_OF_PACKETS_PER_FILE = 16  # presumably the number of packets stored per file -- TODO confirm
RESCALE_FACTOR = 1./255  # presumably scales byte values 0..255 into [0, 1] -- TODO confirm
# v2ray traffic tag
# NOTE(review): V2RAY_HOST_TAG is used as the default match string of
# binary_classification(), but no assignment to it is visible here -- confirm
# it is defined before that function is declared.
# Default share of each class's files that goes to the training list; the rest
# goes to validation. Despite the name this is a fraction (0.8), not a percent.
TRAINING_DATA_PERCENTAGE = 0.8
# Glob pattern passed to rglob() to pick out packet files.
PACKET_FILE_EXT = '*.bin'
def rglob(data_root, file_ext):
    """Recursively collect the files under ``data_root`` matching ``file_ext``.

    :param data_root: directory to search; all sub-directories are searched too.
    :param file_ext: glob pattern applied to file names, e.g. ``'*.bin'``.
    :return: list of matching paths as strings. The list is sorted so that the
        result does not depend on the order in which the file system happens
        to enumerate directory entries (which keeps a seeded shuffle of the
        result reproducible across machines).
    """
    return sorted(str(file_path) for file_path in Path(data_root).rglob(file_ext))
def binary_classification(packet_path, match_string=V2RAY_HOST_TAG):
    """Label a packet file as v2ray or non-v2ray traffic based on its path.

    The decision looks only at the path text, never at the file contents:
    a path containing ``match_string`` anywhere is labelled as v2ray.

    :param packet_path: path to the packet file (``str`` or ``os.PathLike``).
    :param match_string: substring whose presence in the path marks the file
        as v2ray traffic.
    :return: 1 if it is v2ray traffic, 0 otherwise.
    """
    # os.fspath leaves str untouched and unwraps pathlib.Path objects, so both
    # kinds of path can be tested with a plain substring check.
    return int(match_string in os.fspath(packet_path))
def _shuffled_split(paths, keep_count, training_pct):
    """Shuffle ``paths`` and split the first ``keep_count`` of them into (train, validation)."""
    order = np.arange(len(paths))
    np.random.shuffle(order)
    # Round up so that a non-empty class always contributes to training.
    cut = math.ceil(keep_count * training_pct)
    train = [paths[index] for index in order[:cut]]
    validation = [paths[index] for index in order[cut:keep_count]]
    return train, validation


def generate_train_validation_packet_path_list(data_root, training_pct=TRAINING_DATA_PERCENTAGE, eqaul_size=True):
    """Split the packet files under ``data_root`` into training and validation lists.

    Files are found with ``rglob(data_root, PACKET_FILE_EXT)``, labelled with
    ``binary_classification`` and shuffled per class with the global
    ``np.random`` state (seed it beforehand for a reproducible split).
    Summary statistics are printed to stdout.

    :param data_root: directory that is searched recursively for packet files.
    :param training_pct: fraction (0..1) of the kept files of each class that
        goes to the training list; the count is rounded up.
    :param eqaul_size: if true, keep the same number of files from both
        classes (the size of the smaller class) so the output is balanced;
        otherwise keep every file. The misspelled name is part of the public
        interface and is kept for existing callers.
    :return: ``(training_file_list, validation_file_list)``; each list holds
        the v2ray paths first, followed by the non-v2ray paths.
    :raises ValueError: if ``training_pct`` is outside ``[0, 1]``.
    """
    if not 0 <= training_pct <= 1:
        raise ValueError("training_pct must be within [0, 1], got %r" % (training_pct,))

    # Classify every file exactly once and route it to its class list.
    v2ray_file_list = []
    non_v2ray_file_list = []
    for file_path in rglob(data_root, PACKET_FILE_EXT):
        if binary_classification(file_path) == 1:
            v2ray_file_list.append(file_path)
        else:
            non_v2ray_file_list.append(file_path)

    v2ray_file_size = len(v2ray_file_list)
    non_v2ray_file_size = len(non_v2ray_file_list)
    if eqaul_size:
        # Cap both classes at the size of the smaller one.
        v2ray_file_size = non_v2ray_file_size = min(v2ray_file_size, non_v2ray_file_size)

    # v2ray is shuffled before non-v2ray, so the global RNG is consumed in a
    # fixed order.
    v2ray_train, v2ray_validation = _shuffled_split(
        v2ray_file_list, v2ray_file_size, training_pct)
    non_v2ray_train, non_v2ray_validation = _shuffled_split(
        non_v2ray_file_list, non_v2ray_file_size, training_pct)

    training_file_list = v2ray_train + non_v2ray_train
    validation_file_list = v2ray_validation + non_v2ray_validation

    print("Statistics: ")
    print("Total V2ray traffic %d, Total non-V2ray traffic %d" % (len(v2ray_file_list), len(non_v2ray_file_list)))
    print("Output train traffic %d, Total validation traffic %d" % (len(training_file_list), len(validation_file_list)))
    return training_file_list, validation_file_list
# Generate the training and validation file lists (balanced between the two
# classes, default training fraction). This appears to be a top-level
# statement, so it would scan DATA_ROOT and print the split statistics
# whenever the module is executed or imported.
# NOTE(review): DATA_ROOT is not assigned in the visible part of this file --
# confirm it is defined before this line.
train_file_list, val_file_list = generate_train_validation_packet_path_list(data_root=DATA_ROOT, eqaul_size=True)