import pandas as pd
import numpy as np
import time
import networkx as nx
import collections
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import holoviews as hv
from holoviews import opts, dim
hv.extension('bokeh')
# Load the raw proximity log (space-separated) and index it by timestamp.
# `sep` is passed by keyword: pandas 2.0 made every read_csv argument after
# the path keyword-only, so the positional form raises TypeError there.
df = pd.read_csv("proximity.txt", sep=" ")
print(df.head(5))
df.index = pd.DatetimeIndex(df.date_time)
df = df.drop(columns=['date_time', "json_name", "apiDateTime"])
df = df.rename(columns={"hashed_sta_eth_mac": "hashedmac"})

# Number of different hashed macs in total
(unique, counts) = np.unique(df.hashedmac, return_counts=True)
print("Total Hashed Mac Samples: %i with unique elements: %i - diff. %i"%(df.shape[0],len(unique),df.shape[0]-len(unique)))

# Constant column: summing it over any grouping yields the sample count.
df["one"] = np.ones(df.shape[0])
print(df.head(5))

# Samples by (business) day
df.resample("B")["one"].sum().plot(kind="bar")
plt.show()
# Some observations:
# Restrict the analysis to the first calendar day present in the log.
firstday = df.index.min()
fdy = firstday.strftime("%m-%d-%Y")
ldy = df.index.max()
# pd.date_range(fdy,ldy)

# Select the day's rows with .loc: row selection through df["<date string>"]
# was deprecated in pandas 1.2 and removed in 2.0.
dfmacs = df.loc[fdy]
macs = dfmacs.groupby("hashedmac")["one"].sum()
# Every mac of the day is in `macs`, so this filter keeps all rows; it is
# retained so `macs` can later be narrowed without touching this line.
dfmacs = dfmacs[dfmacs.hashedmac.isin(macs.index)]

# In lmacs, we have users' handovers: one list of AP names per hashed mac,
# in the row order of dfmacs.
lmacs = dfmacs.groupby("hashedmac")["ap_name"].apply(list)
print(dfmacs.head(3))
print("*"*40)
print(lmacs.head(3))
# Number of AP records per user, and how many users share each count.
userHands = np.array(list(map(len, lmacs.values)))
unique, counts = np.unique(userHands, return_counts=True)

for label, stat in (
    ("Mean number of handovers: ", np.mean),
    ("Std handovers: ", np.std),
    ("Median handovers: ", np.median),
):
    print(label, stat(userHands))

# Histogram of users' handover
fig, ax = plt.subplots()
ax.bar(unique, counts)
plt.show()

# zoom in: same histogram without the first (smallest) handover count
fig, ax = plt.subplots()
ax.bar(unique[1:], counts[1:])
plt.show()
#1
# Empirical CDF of the per-user handover counts.
x = np.sort(userHands)
y = np.arange(len(userHands)) / float(len(userHands))
fig, ax = plt.subplots()
ax.set_xlim(0, 50)
ax.plot(x, y)
plt.title('Cumulative distribution function')
ax.set_xlabel('number of handovers')
ax.set_ylabel('$p$')

# Share of users with fewer than 2 handovers. searchsorted returns the index of
# the first value >= 2, which is the first occurrence of 2 when one exists but,
# unlike np.where(x == 2)[0][0], does not raise when no user has exactly 2.
pstwo = int(np.searchsorted(x, 2, side="left"))
share_below_two = pstwo / float(len(x))
ax.axhline(y=share_below_two, linestyle='dashed', alpha=0.5, color='#004f6d')
ax.text(x=2, y=share_below_two - 0.05, s="%.1f%%" % (share_below_two * 100.0), alpha=0.7, color='#004f6d')
plt.xticks(sorted(list(plt.xticks()[0]) + [2]))

# First sample whose cumulative share exceeds 80%; clamped to the last sample
# so very small inputs (where no share exceeds 0.8) do not index out of range.
pseightty = min(int(np.searchsorted(y, 0.8, side="right")), len(x) - 1)
ax.axvline(x=x[pseightty], linestyle='dashed', alpha=0.5, color='#11008d')
ax.text(x=x[pseightty] - 4, y=y[pseightty], s='80%', alpha=0.7, color='#11008d')
plt.xticks(sorted(list(plt.xticks()[0]) + [x[pseightty]]))
plt.show()
# AP-level transition graph: one edge per pair of consecutive AP records of a
# user; a user seen on a single AP contributes one self-edge.
edges = []
for aps in lmacs.values:
    if len(aps) > 1:
        edges.extend(zip(aps, aps[1:]))
    elif aps:
        edges.append((aps[0], aps[0]))

# Distinct AP names. A set comprehension replaces the repeated
# `nodes = nodes + aps`, which copied the whole list for every user.
nodes = list({ap for aps in lmacs.values for ap in aps})

# We get the acronym of the building with the AP name (second "-" field).
attr_building = {ap: ap.split("-")[1] for ap in nodes}
names_building = list(set(attr_building.values()))

# edges by building, keeping only movements between different buildings
eall = [
    (attr_building[ap_from], attr_building[ap_to])
    for ap_from, ap_to in edges
    if attr_building[ap_from] != attr_building[ap_to]
]
attr_edges = dict(collections.Counter(eall))
#Data preparation for Holoview model.
# attr_edges maps (source building, target building) -> number of movements.
source = [pair[0] for pair in attr_edges]
target = [pair[1] for pair in attr_edges]
value = list(attr_edges.values())
edgeBuilding = pd.DataFrame({"source": source, "target": target, "value": value})
# Suffix the targets so that the Sankey treats origin and destination of the
# same building as two distinct nodes (left and right column).
edgeBuilding["target"] = edgeBuilding["target"] + "_"

# Order nodes by total flow, largest first. Only "value" is aggregated:
# summing the whole frame would also concatenate the string column.
sources_order = edgeBuilding.groupby("source")["value"].sum().sort_values()[::-1]
targets_order = edgeBuilding.groupby("target")["value"].sum().sort_values()[::-1]
allnodes = np.concatenate((sources_order.index.values, targets_order.index.values))
nodesSankey = hv.Dataset(pd.DataFrame({"buildings": allnodes}))
value_dim = hv.Dimension('value', unit='dev.')
betBuildings = hv.Sankey((edgeBuilding, nodesSankey), ["source", "target"], vdims=value_dim)
betBuildings.opts(
    opts.Sankey(labels='buildings', label_position='right', width=900, height=500, cmap='Set1',
                edge_color=dim('source').str(), node_color=dim('buildings').str()))
# AP-level transition graph: one edge per pair of consecutive AP records of a
# user; a user seen on a single AP contributes one self-edge.
edges = []
for aps in lmacs.values:
    if len(aps) > 1:
        edges.extend(zip(aps, aps[1:]))
    elif aps:
        edges.append((aps[0], aps[0]))

# Distinct AP names. A set comprehension replaces the repeated
# `nodes = nodes + aps`, which copied the whole list for every user.
nodes = list({ap for aps in lmacs.values for ap in aps})

# We get the list of buildings using the AP name (second "-" field).
attr_building = {ap: ap.split("-")[1] for ap in nodes}
names_building = list(set(attr_building.values()))

# edges by building (movements inside the same building are kept here)
eall = [(attr_building[ap_from], attr_building[ap_to]) for ap_from, ap_to in edges]

#Frequency of movements among buildings
attr_edges = dict(collections.Counter(eall))
print(list(attr_edges.items())[:5])
#Data preparation for Holoview model.
# Unzip the (source, target) -> count mapping into three parallel columns.
source = [pair[0] for pair in attr_edges]
target = [pair[1] for pair in attr_edges]
value = list(attr_edges.values())

edgeBuilding = pd.DataFrame({"source": source, "target": target, "value": value})
nodeBuilding = hv.Dataset(pd.DataFrame({"building": names_building}))

# Chord diagram of building-to-building movements.
chord = hv.Chord((edgeBuilding, nodeBuilding))
chord.opts(
    opts.Chord(
        cmap='Category20',
        edge_cmap='Category20',
        edge_color=dim('source'),
        labels='building',
        node_color=dim('building').str(),
        height=500,
        width=500,
    )
)
# Dense count matrix indexed [first building of the pair, second building].
names_building = sorted(names_building)
mapIdBuilding = {building: idx for idx, building in enumerate(names_building)}
freq = np.zeros((len(names_building), len(names_building)))
for (b1, b2), n_moves in attr_edges.items():
    freq[mapIdBuilding[b1], mapIdBuilding[b2]] = n_moves

# Heat map; ticks are shifted by 0.5 so labels sit at the centre of each cell.
tick_centres = np.arange(len(names_building)) + 0.5
fig, ax = plt.subplots()
c = ax.pcolor(freq, edgecolors='k', linewidths=4)
ax.set_title('User handover frequency in all buildings')
ax.set_xticks(tick_centres, minor=False)
ax.set_yticks(tick_centres, minor=False)
ax.set_xticklabels(names_building, rotation=90)
ax.set_yticklabels(names_building)
fig.tight_layout()
plt.show()
# Building acronym of every AP (second "-"-separated field of its name).
attr_building = {ap: ap.split("-")[1] for ap in nodes}
names_building = list(set(attr_building.values()))

#edges by building, dropping movements that stay inside one building
eall = []
for ap_from, ap_to in edges:
    pair = (ap_from.split("-")[1], ap_to.split("-")[1])
    if pair[0] != pair[1]:
        eall.append(pair)
attr_edges = dict(collections.Counter(eall))
# Unzip the (source, target) -> count mapping into three parallel columns.
source = [pair[0] for pair in attr_edges]
target = [pair[1] for pair in attr_edges]
value = list(attr_edges.values())

edgeBuilding = pd.DataFrame({"source": source, "target": target, "value": value})
nodeBuilding = hv.Dataset(pd.DataFrame({"building": names_building}))

# Chord diagram of the movements between different buildings.
chord = hv.Chord((edgeBuilding, nodeBuilding))
chord.opts(
    opts.Chord(
        cmap='Category20',
        edge_cmap='Category20',
        edge_color=dim('source'),
        labels='building',
        node_color=dim('building').str(),
        height=600,
        width=600,
    )
)
# Dense count matrix indexed [first building of the pair, second building].
names_building = sorted(names_building)
mapIdBuilding = {building: idx for idx, building in enumerate(names_building)}
freq = np.zeros((len(names_building), len(names_building)))
for (b1, b2), n_moves in attr_edges.items():
    freq[mapIdBuilding[b1], mapIdBuilding[b2]] = n_moves

# Heat map; ticks are shifted by 0.5 so labels sit at the centre of each cell.
tick_centres = np.arange(len(names_building)) + 0.5
fig, ax = plt.subplots()
c = ax.pcolor(freq, edgecolors='k', linewidths=4)
ax.set_title('User handover frequency in all buildings (excluding self.buildings)')
ax.set_xticks(tick_centres, minor=False)
ax.set_yticks(tick_centres, minor=False)
ax.set_xticklabels(names_building, rotation=90)
ax.set_yticklabels(names_building)
fig.tight_layout()
plt.show()
#in lmacs, we have the handovers
print(lmacs[:3])
from collections import defaultdict

# Building code found in an AP name -> code used for the analysis. Every code
# maps to itself except the two overrides below, which fold a code into
# another building's. Insertion order matches the original literal.
equivBuildings = {
    code: code
    for code in (
        "MO", "GC", "MA", "RL", "AT", "JO", "ITD", "RES",
        "SCT", "IE", "CEP", "SL", "CJ", "CTI", "PF", "CL",
    )
}
equivBuildings["CEP"] = "MA"
equivBuildings["PF"] = "AT"
buildingsCode = equivBuildings.keys()
def buildingCode(equivBuildings, apname):
    """Return the (building code, AP id) pair encoded in an AP name.

    The name is split on "-"; the second field is returned as the building
    code and the fourth as the AP id. ``equivBuildings`` is accepted for
    call-site compatibility but is not consulted here.

    Raises IndexError when the name has fewer than four "-"-separated fields.
    """
    fields = apname.split("-")
    return fields[1], fields[3]
# Per-building accumulators, all keyed by equivBuildings[<building code>]:
#   mBuildings  - list of (AP id, AP id) tuples recorded for the building
#   selfMBuild  - integer counter per building (incremented at the end of the loop body)
#   apsInBuild  - set of AP ids seen in the building
# NOTE(review): the indentation of this file has been lost. The nesting of the
# `else:` (it could pair with either of the two preceding `if`s) and whether
# the selfMBuild increment sits inside the `if pos == 1 ...` test cannot be
# recovered from the text alone - restore them from the original notebook
# before relying on these counts.
mBuildings = defaultdict(list)
selfMBuild = defaultdict(int)
apsInBuild = defaultdict(set)
# Walk every user's AP sequence position by position.
for aps in lmacs.values:
for pos in range(len(aps)):
# Building code and AP id of the current record.
build,apid = buildingCode(equivBuildings,aps[pos])
apsInBuild[equivBuildings[build]].add(apid)
# Compare with the next record when there is one.
if pos+1 < len(aps):
buildn,apidn = buildingCode(equivBuildings,aps[pos+1])
# Same (normalised) building but a different AP: record the AP pair.
if equivBuildings[build]==equivBuildings[buildn] and apid != apidn:
mBuildings[equivBuildings[build]].append((apid,apidn))
else:
# Records a self pair (apid, apid). Evaluating len(mBuildings[...]) creates
# the building's key in the defaultdict as a side effect.
# NOTE(review): `pos == 1` looks like it may have been meant as `pos == 0`
# (a user with a single record) - confirm.
if pos == 1 or len(mBuildings[equivBuildings[build]])==0:
mBuildings[equivBuildings[build]].append((apid,apid))
selfMBuild[equivBuildings[build]]+=1
# Per building: total recorded pairs, and those not counted in selfMBuild.
movs = np.array([len(mBuildings[k]) for k in mBuildings])
movsNotSelf = np.array([len(mBuildings[k]) - selfMBuild[k] for k in mBuildings])

# Stacked bars: the selfMBuild share at the bottom, the remainder on top.
positions = np.arange(len(movs))
fig, ax = plt.subplots()
p0 = ax.bar(positions, movs - movsNotSelf)
p1 = ax.bar(positions, movsNotSelf, bottom=movs - movsNotSelf)
ax.set_title('Handovers in each building')
ax.set_xticks(positions, minor=False)
ax.set_xticklabels(list(mBuildings.keys()), rotation=90)
# Use the first bar of each container as legend handle: every bar of a
# container has the same colour, and p1[1] does not exist when only one
# building is present.
if len(movs) > 0:
    plt.legend((p0[0], p1[0]), ("Only one AP", "More APs"))
fig.tight_layout()
plt.show()
# Number of distinct AP ids collected for each building.
apsInBuildings = [len(ap_ids) for ap_ids in apsInBuild.values()]
bar_positions = np.arange(len(apsInBuildings))

fig1, ax = plt.subplots()
ax.bar(bar_positions, apsInBuildings)
ax.set_xticks(bar_positions, minor=False)
ax.set_xticklabels(apsInBuild.keys(), rotation=90)
ax.set_ylabel('Total number of APs')
plt.show()
#Nice to see the rate between the number of APs in buildings vs the handovers. Building: size, design,...
#Handovers frequency
# For each building, count how often every distinct recorded AP pair occurs
# and plot those counts in descending order. Buildings with 10 or fewer
# distinct pairs are left out of the figure.
fig1, ax = plt.subplots()
for b, pairs in mBuildings.items():
    pair_counts = list(collections.Counter(pairs).values())
    pair_counts.sort(reverse=True)
    x = np.array(pair_counts)
    if len(x) <= 10:
        continue
    ax.plot(x, label=b)
ax.set_title('Repetition of same handovers (APi -> APj)')
ax.set_xlabel('Types of transictions bet. APi -> APj')
ax.set_ylabel('Frequency')
ax.set_ylim(0, 30)
ax.set_xlim(0, 400)
plt.legend(ncol=4)
plt.show()
#AP frequency
from collections import Counter

# freqAP[b]: for building b, how many times each AP id appears in the recorded
# pairs (counting both positions of a pair), sorted in descending order.
# bigBuild: the largest number of distinct AP ids found in one building.
freqAP = {}
bigBuild = 0
fig1, ax = plt.subplots()
for b in mBuildings.keys():
    lsdap = list(zip(*mBuildings[b]))
    first_ids, second_ids = lsdap[0], lsdap[1]
    x = (Counter(first_ids) + Counter(second_ids)).values()
    bigBuild = max(bigBuild, len(x))
    freqAP[b] = np.array(sorted(x, reverse=True))
    ax.plot(freqAP[b], label=b)
ax.set_title('Frequency of APs in handovers')
ax.set_xlabel('APs')
ax.set_ylabel('Frequency')
plt.legend(ncol=4)
plt.show()
# Normalized AP frequency in handovers.
# Min-max scaling of each building's AP counts: (value - min) / (max - min).
fig1, ax = plt.subplots()
for b in mBuildings.keys():
    lowest = np.min(freqAP[b])
    spread = np.ptp(freqAP[b])
    normFreqAP = (freqAP[b] - lowest) / spread
    ax.plot(sorted(normFreqAP, reverse=True), label=b)
ax.set_title('normalized frequency APs in handovers')
ax.set_xlabel('APs')
ax.set_ylabel('Frequency')
# Reference line at 0.4 with its label just above it.
ax.axhline(y=0.4, linestyle='dashed', alpha=0.5, color='#004f6d')
ax.text(x=40, y=0.4 + 0.05, s="40%", alpha=0.7, color='#004f6d')
plt.legend(ncol=4)
plt.show()
# One row per building, one column per rank position; shorter buildings are
# right-padded with NaN up to bigBuild so that all rows have the same length.
padded = {}
for b in mBuildings.keys():
    filler = [np.nan] * (bigBuild - len(freqAP[b]))
    padded[b] = np.concatenate((freqAP[b], filler))
dfAps = pd.DataFrame(padded).T

# Mean over buildings at each rank position (NaN padding is skipped by mean).
fig1, ax = plt.subplots()
dfAps.mean().plot()
ax.set_title('Average frequency of APs in handovers')
ax.set_xlabel('APs')
ax.set_ylabel('Average Frequency')
plt.show()
# Reload the raw log. `sep` is passed by keyword because pandas 2.0 made every
# read_csv argument after the path keyword-only.
df = pd.read_csv("proximity.txt", sep=" ")
df = df.rename(columns={"hashed_sta_eth_mac": "hashedmac"})
# Parse the timestamps BEFORE sorting so rows are ordered chronologically and
# not by the lexical order of the timestamp strings.
df['date_time'] = pd.to_datetime(df['date_time'])
df = df.sort_values(by=['hashedmac', 'date_time'], ascending=True)

#Grouping same hashed macs by time difference as a new users
# Minutes elapsed since the previous record of the same hashed mac
# (NaN for the first record of each mac). #Xisco code ;)
df['diff'] = df['date_time'].sub(df.groupby('hashedmac')['date_time'].shift()).dt.total_seconds()/60
threshold = 10 #NOTE: TODO how to measure the effect of this threshold in the results...
# A new id starts whenever the gap exceeds the threshold or the mac changes.
df["id"] = ((df['diff'] > threshold) | (df['hashedmac'] != df['hashedmac'].shift())).cumsum()

#Clean the dataset
df.reset_index(inplace=True)
df = df.drop(columns=[
    'date_time', "json_name", "apiDateTime",
    "hashedmac", "index", "diff",
    'radio_mac_addr', 'rssi_val', 'target_type', 'ts',
])
print(df.tail())
# One list of AP names per id, plus the length of each list.
lmacs = df.groupby("id")["ap_name"].apply(list)
lent = [len(ap_list) for ap_list in lmacs]
df = pd.DataFrame(list(zip(lmacs, lent)), columns=["aps", "length"])

# Share of ids whose list holds more than one AP record.
nmoves = np.sum(df.length > 1)
rtmoves = nmoves / df.shape[0]
print("Percentatge of connections with more than one APs: %.3f%%"%(rtmoves*100))

#Number of different hashed macs in total
unique, counts = np.unique(df.length, return_counts=True)
# Sanity check: every length but the first unique one accounts for the moves.
assert np.sum(counts[1:]) == nmoves
fig, ax = plt.subplots()
ax.bar(unique, counts)
ax.set_title('User handover frequency in all buildings (one week)')
plt.show()
# Building code found in an AP name -> code used for the analysis. Every code
# maps to itself except the two overrides below. Insertion order matches the
# original literal.
equivAP = {
    code: code
    for code in (
        "MO", "GC", "MA", "RL", "AT", "JO", "ITD", "RES",
        "SCT", "IE", "CEP", "SL", "CJ", "CTI", "PF", "CL",
    )
}
equivAP["CEP"] = "MA"
equivAP["PF"] = "AT"


def getNames(x):
    """Map each AP name in ``x`` to its normalised building code.

    The building code is the second "-"-separated field of the AP name,
    translated through ``equivAP``. Raises KeyError for a code that is not
    in ``equivAP``.
    """
    names = []
    for ap_name in x:
        code = ap_name.split("-")[1]
        names.append(equivAP[code])
    return names
# Normalised building code of every AP record, per id.
df["builds"] = [getNames(ap_list) for ap_list in df.aps]

#More than one handover with a list of ID buildings.
# Work on an explicit copy: adding a column to a filtered slice of `df`
# triggers SettingWithCopyWarning and may not modify the object that is kept.
df2 = df[df.length > 1].copy()
df2["diffBuilds"] = [len(set(codes)) for codes in df2.builds]
(unique, counts) = np.unique(df2.diffBuilds, return_counts=True)

# Keep only the entries that span more than one building. Selecting by value
# (instead of dropping the first bin with [1:]) stays correct even when no
# entry is confined to a single building.
several = unique > 1
fig, ax = plt.subplots()
ax.bar(unique[several], counts[several])
ax.set_title('User handover frequency among different buildings (one week)')
ax.set_xlabel('number of buildings')
ax.set_ylabel('users')
plt.show()
nmovediff = np.sum(counts[several])
rtmoves = nmovediff / df.shape[0]
print("Percentatge of connections with more than one APs among diff. buildings: %.3f%%"%(rtmoves*100))
# Building-to-building edges for the entries that touch several buildings.
edges = []
nodes = set()
for aps in df2[df2.diffBuilds > 1].aps:
    # Drop repeated APs while KEEPING the order of first appearance.
    # list(set(aps)) put the APs in an arbitrary, run-dependent order, so the
    # consecutive pairs below did not correspond to the recorded sequence.
    visited = list(dict.fromkeys(aps))
    for ap1, ap2 in zip(visited, visited[1:]):
        b1 = ap1.split("-")[1]
        b2 = ap2.split("-")[1]
        if b1 != b2:
            edges.append((b1, b2))
            nodes.add(b1)
            nodes.add(b2)
nodes = list(nodes)
attr_edges = dict(collections.Counter(edges))
# attr_edges maps (source building, target building) -> number of movements.
source = [pair[0] for pair in attr_edges]
target = [pair[1] for pair in attr_edges]
value = list(attr_edges.values())
edgeBuilding = pd.DataFrame({"source": source, "target": target, "value": value})
# Suffix the targets so that the Sankey treats origin and destination of the
# same building as two distinct nodes (left and right column).
edgeBuilding["target"] = edgeBuilding["target"] + "_"

# Order nodes by total flow, largest first. Only "value" is aggregated:
# summing the whole frame would also concatenate the string column.
sources_order = edgeBuilding.groupby("source")["value"].sum().sort_values()[::-1]
targets_order = edgeBuilding.groupby("target")["value"].sum().sort_values()[::-1]
allnodes = np.concatenate((sources_order.index.values, targets_order.index.values))
nodesSankey = hv.Dataset(pd.DataFrame({"buildings": allnodes}))
value_dim = hv.Dimension('value', unit='dev.')
betBuildings = hv.Sankey((edgeBuilding, nodesSankey), ["source", "target"], vdims=value_dim)
betBuildings.opts(
    opts.Sankey(labels='buildings', label_position='right', width=900, height=500, cmap='Set1',
                edge_color=dim('source').str(), node_color=dim('buildings').str()))
# Chord diagram of the same movements, with un-suffixed target names.
edgeBuilding = pd.DataFrame({"source": source, "target": target, "value": value})
# The node table must list the plain building codes used by the edges.
# `allnodes` belongs to the Sankey: its targets carry a "_" suffix and it omits
# buildings that only ever appear as a destination, so it does not match.
nodeBuilding = hv.Dataset(pd.DataFrame({"building": sorted(nodes)}))
chord = hv.Chord((edgeBuilding, nodeBuilding))
chord.opts(
    opts.Chord(cmap='Category20', edge_cmap='Category20', edge_color=dim('source'),
               labels='building', node_color=dim('building').str(), height=600, width=600))
# Dense count matrix indexed [first building of the pair, second building].
nodes = sorted(nodes)
mapIdBuilding = {building: idx for idx, building in enumerate(nodes)}
freq = np.zeros((len(nodes), len(nodes)))
for (b1, b2), n_moves in attr_edges.items():
    freq[mapIdBuilding[b1], mapIdBuilding[b2]] = n_moves

# Heat map; ticks are shifted by 0.5 so labels sit at the centre of each cell.
tick_centres = np.arange(len(nodes)) + 0.5
fig, ax = plt.subplots()
c = ax.pcolor(freq, edgecolors='k', linewidths=4)
ax.set_title('User handover frequency in all buildings (excluding self.buildings)')
ax.set_xticks(tick_centres, minor=False)
ax.set_yticks(tick_centres, minor=False)
ax.set_xticklabels(nodes, rotation=90)
ax.set_yticklabels(nodes)
fig.tight_layout()
plt.show()
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.seasonal import STL
# Reload the raw log for the time-series analysis. `sep` is passed by keyword
# because pandas 2.0 made every read_csv argument after the path keyword-only.
df = pd.read_csv("proximity.txt", sep=" ")
df = df.rename(columns={"hashed_sta_eth_mac": "hashedmac"})
# Convert to datetime BEFORE sorting so the order is chronological and not
# the lexical order of the timestamp strings.
df['date_time'] = pd.to_datetime(df['date_time'])
df = df.sort_values(by=['hashedmac', 'date_time'], ascending=True)
df.index = df.date_time
# Constant column: summing it over any grouping yields the record count.
df["one"] = np.ones(df.shape[0])
# Calendar-style grid: one panel per day of the month present in the data,
# one column per weekday (Monday = 0), a new grid row at every Monday.
# The days are sorted explicitly: the row bookkeeping below only works in
# chronological order, which iterating a set does not guarantee.
# NOTE: the month and year (October 2020) are hard-coded in the date labels.
days = sorted(set(df.index.day))
nplots = len(days)
cols = 7
# The row index is advanced on every Monday (including a leading one), so the
# grid needs at least "number of Mondays + 1" rows to avoid indexing past it.
mondays = sum(1 for d in days if pd.to_datetime("10-%i-2020" % d).weekday() == 0)
rows = max(nplots // cols + 1, mondays + 1)
# squeeze=False keeps axs two-dimensional even when rows == 1.
fig, axs = plt.subplots(rows, cols, figsize=(25, 15), squeeze=False)
row = 0
for d in days:
    label = "10-%i-2020" % d
    cold = pd.to_datetime(label).weekday()
    if cold == 0:
        row += 1
    # Records of that day, counted in 5-minute bins.
    dm = df.loc[label].one.resample('5min').sum()
    dm.plot(ax=axs[row, cold])
    axs[row, cold].title.set_text(label)
    axs[row, cold].set_ylim(0, 5000)
fig.suptitle("Freq. of handovers in AP in 5minutes period", fontsize=20)
# 5-minute record counts for 10-05-2020 .. 10-09-2020.
# The index is sorted first: `df` is ordered by hashedmac, so its DatetimeIndex
# is not monotonic, and label slicing a non-monotonic index with keys that are
# not exact index values raises KeyError in pandas >= 2.0.
ds = df["one"].sort_index().loc["10-05-2020":"10-09-2020"].resample('5min').sum()
# '5min' instead of the deprecated '5T' alias (and consistent with resample).
ds = ds.asfreq('5min')
# NOTE(review): period=5 means a seasonal cycle of five 5-minute samples
# (25 minutes); a daily cycle would be 288 samples - confirm the intent.
res = STL(ds, robust=True, period=5).fit()
res.plot()
plt.show()
# =============================================================================
# #The AP with more handovers
# =============================================================================
# g = df.groupby(["ap_name"]).agg({"one":np.sum})
# print(g[g.one==g.one.max()])
ap = "AP-IE-IE-W06"
dfa = df[df.ap_name == ap]

# Same calendar grid as for the whole campus, restricted to one AP.
# squeeze=False keeps axs two-dimensional even when rows == 1.
fig, axs = plt.subplots(rows, cols, figsize=(25, 15), squeeze=False)
row = 0
# sorted(): the row bookkeeping needs the days in chronological order, which
# is not guaranteed when `days` is a set.
for d in sorted(days):
    label = "10-%i-2020" % d
    cold = pd.to_datetime(label).weekday()
    if cold == 0:
        row += 1  # a Monday starts a new grid row
    dm = dfa.loc[label].one.resample('5min').sum()
    dm.plot(ax=axs[row, cold])
    axs[row, cold].title.set_text(label)
    axs[row, cold].set_ylim(0, 70)
fig.suptitle("Handovers in AP %s"%ap, fontsize=30)
# A lot of noise in the AP on weekends
# sunday: more freq. is AP-IE-IE-W06 2292.0 !!
# Records per AP on 10-04-2020. The aggregation is named by string: passing
# the np.sum callable to agg is deprecated since pandas 2.1.
g = df.loc["10-04-2020"].groupby(["ap_name"]).agg({"one": "sum"})
print(g[g.one == g.one.max()])
# =============================================================================
# Use of APs on weedays and weekends (only one sunday)
# =============================================================================
# Records per AP over the whole period. The aggregation is named by string:
# passing the np.sum callable to agg is deprecated since pandas 2.1.
g = df.groupby(["ap_name"]).agg({"one": "sum"})
g = g.sort_index()
# Draw on a fresh figure: Series.plot() without `ax` reuses the current axes,
# which in a script is the last panel of the previously created grid.
fig, ax = plt.subplots()
g.one.plot(ax=ax)
ax.set_title("AP freq. handover", fontsize=20)

# Same count restricted to 10-04-2020, aligned with the totals by AP name;
# APs without records on that day get NaN in the "sunday" column.
gw = df.loc["10-04-2020"].groupby(["ap_name"]).agg({"one": "sum"})
gw = gw.sort_index()
dg = pd.concat([g, gw], axis=1)
dg.columns = ["all", "sunday"]
dg.sunday = dg.sunday*10 #MAGIC NUMBER! (scales the single day for display)
dg.plot()
plt.title("AP freq. handover and APs freq on sunday *10", fontsize=20)
# =============================================================================
# AP without handovers on weekend (note: only using one sunday)
# =============================================================================
# Rows whose "sunday" value is missing, then the one(s) with the largest
# overall count among them.
dg2 = dg.loc[dg["sunday"].isnull()]
top_overall = dg2["all"].max()
print(dg2[dg2["all"] == top_overall])
ap = "AP-RL-PPAL-W30"
dfa = df[df.ap_name == ap]

# Calendar grid for one AP: one column per weekday, a new row at every Monday.
# squeeze=False keeps axs two-dimensional even when rows == 1.
fig, axs = plt.subplots(rows, cols, figsize=(25, 15), squeeze=False)
row = 0
# sorted(): the row bookkeeping needs the days in chronological order, which
# is not guaranteed when `days` is a set.
for d in sorted(days):
    label = "10-%i-2020" % d
    cold = pd.to_datetime(label).weekday()
    if cold == 0:
        row += 1  # a Monday starts a new grid row
    dm = dfa.loc[label].one.resample('5min').sum()
    # This AP has days without any record; only draw when there is data.
    if len(dm) > 0:
        dm.plot(ax=axs[row, cold])
    # NOTE(review): the file's indentation was lost, so it is unclear whether
    # the title/limits were also guarded by the test above. They are applied
    # to every panel here so that empty days remain labelled - confirm.
    axs[row, cold].title.set_text(label)
    axs[row, cold].set_ylim(0, 70)
fig.suptitle("Handovers in AP %s"%ap, fontsize=30)