initialize qrs for dsd with old paper sources

This commit is contained in:
Arthur Grisel-Davy 2023-06-28 16:00:55 -04:00
parent 68afe79049
commit 5447343472
12 changed files with 2514 additions and 0 deletions

14
DSD/qrs/acronyms.tex Normal file

@@ -0,0 +1,14 @@
\newabbreviation{tas}{TAS}{Temporal Action Segmentation}
\newabbreviation{dsd}{DSD}{Device State Detector}
\newabbreviation{cpd}{CPD}{Change Point Detection}
\newabbreviation{hids}{HIDS}{Host-Based Intrusion Detection System}
\newabbreviation{nids}{NIDS}{Network-Based Intrusion Detection System}
\newabbreviation{1nn}{1-NN}{1-Nearest Neighbor}
\newabbreviation{knn}{K-NN}{K-Nearest Neighbor}
\newabbreviation{rnn}{RNN}{Recurrent Neural Network}
\newabbreviation{cnn}{CNN}{Convolutional Neural Network}
\newabbreviation{svm}{SVM}{Support Vector Machine}
\newabbreviation{mlp}{MLP}{Multi-Layer Perceptron}
\newabbreviation{mad}{MAD}{Machine Activity Detector}
\newabbreviation{ids}{IDS}{Intrusion Detection System}
\newabbreviation{nilm}{NILM}{Nonintrusive Load Monitoring}

611
DSD/qrs/biblio.bib Normal file

@@ -0,0 +1,611 @@
@article{deldari2020espresso,
title={Entropy and ShaPe awaRe timE-Series SegmentatiOn for processing heterogeneous sensor data},
author={Deldari, Shohreh and Smith, Daniel V. and Sadri, Amin and Salim, Flora D.},
journal={Proceedings of the ACM on Interactive, Mobile, Wearable and Ubiquitous Technologies (IMWUT)},
volume={4},
number={3},
articleno={77},
year={2020},
url = {https://doi.org/10.1145/3411832},
doi = {10.1145/3411832}
}
@inproceedings{10.1145/3081333.3081340,
author = {Virmani, Aditya and Shahzad, Muhammad},
title = {Position and Orientation Agnostic Gesture Recognition Using WiFi},
year = {2017},
isbn = {9781450349284},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3081333.3081340},
doi = {10.1145/3081333.3081340},
abstract = {WiFi based gesture recognition systems have recently proliferated due to the ubiquitous availability of WiFi in almost every modern building. The key limitation of existing WiFi based gesture recognition systems is that they require the user to be in the same configuration (i.e., at the same position and in same orientation) when performing gestures at runtime as when providing training samples, which significantly restricts their practical usability. In this paper, we propose a WiFi based gesture recognition system, namely WiAG, which recognizes the gestures of the user irrespective of his/her configuration. The key idea behind WiAG is that it first requests the user to provide training samples for all gestures in only one configuration and then automatically generates virtual samples for all gestures in all possible configurations by applying our novel translation function on the training samples. Next, for each configuration, it generates a classification model using virtual samples corresponding to that configuration. To recognize gestures of a user at runtime, as soon as the user performs a gesture, WiAG first automatically estimates the configuration of the user and then evaluates the gesture against the classification model corresponding to that estimated configuration. Our evaluation results show that when user's configuration is not the same at runtime as at the time of providing training samples, WiAG significantly improves the gesture recognition accuracy from just 51.4\% to 91.4\%.},
booktitle = {Proceedings of the 15th Annual International Conference on Mobile Systems, Applications, and Services},
pages = {252--264},
numpages = {13},
keywords = {agnostic, position, orientation, WiFi, gesture recognition},
location = {Niagara Falls, New York, USA},
series = {MobiSys '17}
}
@article{aminikhanghahi2018real,
title={Real-time change point detection with application to smart home time series data},
author={Aminikhanghahi, Samaneh and Wang, Tinghui and Cook, Diane J},
journal={IEEE Transactions on Knowledge and Data Engineering},
volume={31},
number={5},
pages={1010--1023},
year={2018},
publisher={IEEE}
}
%Fancourt, C.L., Principe, J.C., 1996. A neighborhood map of competing one step predictors for piecewise segmentation and identification of time series. In: Proceedings of the International Conference on Neural Networks, vol. 4, pp. 1906--1911.
@article{xiao2022self,
title={Self-Supervised Few-Shot Time-series Segmentation for Activity Recognition},
author={Xiao, Chunjing and Chen, Shiming and Zhou, Fan and Wu, Jie},
journal={IEEE Transactions on Mobile Computing},
year={2022},
publisher={IEEE}
}
@misc{2207.09925,
doi = {10.48550/ARXIV.2207.09925},
url = {https://arxiv.org/abs/2207.09925},
author = {Xu, Leiyang and Wang, Qiang and Lin, Xiaotian and Yuan, Lin},
keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {An Efficient Framework for Few-shot Skeleton-based Temporal Action Segmentation},
publisher = {arXiv},
year = {2022},
copyright = {arXiv.org perpetual, non-exclusive license}
}
@article{sarker2018individualized,
title={Individualized time-series segmentation for mining mobile phone user behavior},
author={Sarker, Iqbal H and Colman, Alan and Kabir, Muhammad Ashad and Han, Jun},
journal={The Computer Journal},
volume={61},
number={3},
pages={349--368},
year={2018},
publisher={Oxford University Press}
}
@article{4445667, author={Liu, Xiaoyan and Lin, Zhenjiang and Wang, Huaiqing}, journal={IEEE Transactions on Knowledge and Data Engineering}, title={Novel Online Methods for Time Series Segmentation}, year={2008}, volume={20}, number={12}, pages={1616-1626}, doi={10.1109/TKDE.2008.29}}
@article{4160958, author={Yujian, Li and Bo, Liu}, journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, title={A Normalized Levenshtein Distance Metric}, year={2007}, volume={29}, number={6}, pages={1091-1095}, doi={10.1109/TPAMI.2007.1078}}
@article{aminikhanghahi2017survey,
title={A survey of methods for time series change point detection},
author={Aminikhanghahi, Samaneh and Cook, Diane J},
journal={Knowledge and information systems},
volume={51},
number={2},
pages={339--367},
year={2017},
publisher={Springer}
}
@misc{palitronica,
title = {Palitronica - Palisade},
howpublished = {\url{https://www.palitronica.com/products/palisade}},
note = {Accessed: 2010-03-26}
}
@inbook{278e1df91d22494f9be2adfca2559f92,
title = "A data management platform for personalised real-time energy feedback",
keywords = "smart homes, real-time energy, smart energy meter, energy consumption, Electrical engineering. Electronics Nuclear engineering, Electrical and Electronic Engineering",
author = "David Murray and Jing Liao and Lina Stankovic and Vladimir Stankovic and Richard Hauxwell-Baldwin and Charlie Wilson and Michael Coleman and Tom Kane and Steven Firth",
year = "2015",
booktitle = "Proceedings of the 8th International Conference on Energy Efficiency in Domestic Appliances and Lighting",
}
@Article{Hunter:2007,
Author = {Hunter, J. D.},
Title = {Matplotlib: A 2D graphics environment},
Journal = {Computing in Science \& Engineering},
Volume = {9},
Number = {3},
Pages = {90--95},
abstract = {Matplotlib is a 2D graphics package used for Python for
application development, interactive scripting, and publication-quality
image generation across user interfaces and operating systems.},
publisher = {IEEE COMPUTER SOC},
doi = {10.1109/MCSE.2007.55},
year = 2007
}
@inproceedings{kocher1996timing,
title={Timing attacks on implementations of Diffie-Hellman, RSA, DSS, and other systems},
author={Kocher, Paul C},
booktitle={Advances in Cryptology --- CRYPTO '96: 16th Annual International Cryptology Conference, Santa Barbara, California, USA, August 18--22, 1996, Proceedings},
pages={104--113},
year={1996},
organization={Springer}
}
@article{villalobos2021flexible,
title={A flexible alarm prediction system for smart manufacturing scenarios following a forecaster--analyzer approach},
author={Villalobos, Kevin and Suykens, Johan and Illarramendi, Arantza},
journal={Journal of Intelligent Manufacturing},
volume={32},
pages={1323--1344},
year={2021},
publisher={Springer}
}
@article{belikovetsky2018digital,
title={Digital audio signature for 3D printing integrity},
author={Belikovetsky, Sofia and Solewicz, Yosef A and Yampolskiy, Mark and Toh, Jinghui and Elovici, Yuval},
journal={IEEE Transactions on Information Forensics and Security},
volume={14},
number={5},
pages={1127--1141},
year={2018},
publisher={IEEE}
}
@article{al2016forensics,
title={Forensics of thermal side-channel in additive manufacturing systems},
author={Al Faruque, Mohammad Abdullah and Chhetri, Sujit Rokka and Canedo, A and Wan, J},
journal={University of California, Irvine},
volume={12},
number={13},
pages={176},
year={2016}
}
@article{10.1145/3571288,
author = {Thakur, Shailja and Moreno, Carlos and Fischmeister, Sebastian},
title = {CANOA: CAN Origin Authentication Through Power Side-Channel Monitoring},
year = {2022},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
issn = {2378-962X},
url = {https://doi.org/10.1145/3571288},
doi = {10.1145/3571288},
abstract = {The lack of any sender authentication mechanism in place makes Controller Area Network (CAN) vulnerable to security threats. For instance, an attacker can impersonate an Electronic Control Unit (ECU) on the bus and send spoofed messages unobtrusively with the identifier of the impersonated ECU. To address this problem, we propose a novel source authentication technique that uses power consumption measurements of the ECU to authenticate the source of a message. A transmission of an ECU affects the power consumption and a characteristic pattern will appear. Our technique exploits the power consumption of each ECU during the transmission of a message to determine whether the message actually originated from the purported sender. We evaluate our approach in both a lab setup and a real vehicle. We also evaluate our approach against factors that can impact the power consumption measurement of the ECU. The results of the evaluation show that the proposed technique is applicable in a broad range of operating conditions with reasonable computational power requirements and attaining good accuracy.},
note = {Just Accepted},
journal = {ACM Trans. Cyber-Phys. Syst.},
month = {nov},
keywords = {CAN, transmissions, authentication, automotive security}
}
@article{gatlin2019detecting,
title={Detecting sabotage attacks in additive manufacturing using actuator power signatures},
author={Gatlin, Jacob and Belikovetsky, Sofia and Moore, Samuel B and Solewicz, Yosef and Elovici, Yuval and Yampolskiy, Mark},
journal={IEEE Access},
volume={7},
pages={133421--133432},
year={2019},
publisher={IEEE}
}
@article{CHOU2014400,
title = {Real-time detection of anomalous power consumption},
journal = {Renewable and Sustainable Energy Reviews},
volume = {33},
pages = {400-411},
year = {2014},
issn = {1364-0321},
doi = {https://doi.org/10.1016/j.rser.2014.01.088},
url = {https://www.sciencedirect.com/science/article/pii/S1364032114001142},
author = {Jui-Sheng Chou and Abdi Suryadinata Telaga},
keywords = {Power consumption, Big data analytics, Anomaly detection, Pattern recognition, Real time detection, Time series prediction},
abstract = {Effective feedback can reduce building power consumption and carbon emissions. Therefore, providing information to building managers and tenants is the first step in identifying ways to reduce power consumption. Since reducing anomalous consumption can have a large impact, this study proposes a novel approach to using large sets of data for a building space to identify anomalous power consumption. This method identifies anomalies in two stages: consumption prediction and anomaly detection. Daily real-time consumption is predicted by using a hybrid neural net ARIMA (auto-regressive integrated moving average) model of daily consumption. Anomalies are then identified by differences between real and predicted consumption by applying the two-sigma rule. The experimental results for a 17-week study of electricity consumption in a building office space confirm that the method can detect anomalous values in real time. Another contribution of the study is the development of a formalized methodology for detecting anomalous patterns in large data sets for real-time of building office space energy consumption. Moreover, the prediction component can be used to plan electricity usage while the anomaly detection component can be used to understand the energy consumption behaviors of tenants.}
}
@INPROCEEDINGS{9934955,
author={Grisel-Davy, Arthur and Bhogayata, Amrita Milan and Pabbi, Srijan and Narayan, Apurva and Fischmeister, Sebastian},
booktitle={2022 International Conference on Embedded Software (EMSOFT)},
title={Work-in-Progress: Boot Sequence Integrity Verification with Power Analysis},
year={2022},
volume={},
number={},
pages={3-4},
doi={10.1109/EMSOFT55006.2022.00009}}
@INPROCEEDINGS{9061783,
author={Li, Yanjie and He, Ruiwen and Ji, Xiaoyu and Xu, Wenyuan},
booktitle={2019 IEEE 3rd Conference on Energy Internet and Energy System Integration (EI2)},
title={Using power side-channel to implement anomaly-based intrusion detection on smart grid terminals},
year={2019},
volume={},
number={},
pages={2669-2674},
doi={10.1109/EI247390.2019.9061783}}
@article{ilgun1995state,
title={State transition analysis: A rule-based intrusion detection approach},
author={Ilgun, Koral and Kemmerer, Richard A and Porras, Phillip A},
journal={IEEE transactions on software engineering},
volume={21},
number={3},
pages={181--199},
year={1995},
publisher={IEEE}
}
@INPROCEEDINGS{5563714,
author={Lei Li and De-Zhang Yang and Fang-Cheng Shen},
booktitle={2010 3rd International Conference on Computer Science and Information Technology},
title={A novel rule-based Intrusion Detection System using data mining},
year={2010},
volume={6},
number={},
pages={169-172},
doi={10.1109/ICCSIT.2010.5563714}}
@article{kumar2020integrated,
title={An integrated rule based intrusion detection system: analysis on UNSW-NB15 data set and the real time online dataset},
author={Kumar, Vikash and Sinha, Ditipriya and Das, Ayan Kumar and Pandey, Subhash Chandra and Goswami, Radha Tamal},
journal={Cluster Computing},
volume={23},
pages={1397--1418},
year={2020},
publisher={Springer}
}
@article{uddin2018activity,
title={Activity recognition for cognitive assistance using body sensors data and deep convolutional neural network},
author={Uddin, Md Zia and Hassan, Mohammad Mehedi},
journal={IEEE Sensors Journal},
volume={19},
number={19},
pages={8413--8419},
year={2018},
publisher={IEEE}
}
@article{wannenburg2016physical,
title={Physical activity recognition from smartphone accelerometer data for user context awareness sensing},
author={Wannenburg, Johan and Malekian, Reza},
journal={IEEE Transactions on Systems, Man, and Cybernetics: Systems},
volume={47},
number={12},
pages={3142--3149},
year={2016},
publisher={IEEE}
}
@inproceedings{bodor2003vision,
title={Vision-based human tracking and activity recognition},
author={Bodor, Robert and Jackson, Bennett and Papanikolopoulos, Nikolaos},
booktitle={Proc. of the 11th Mediterranean Conf. on Control and Automation},
volume={1},
pages={1--6},
year={2003},
organization={Citeseer}
}
@article{zhang2019numerical,
title={Numerical delineation of 3D unsteady flow fields in side channel pumps for engineering processes},
author={Zhang, Fan and Chen, Ke and Appiah, Desmond and Hu, Bo and Yuan, Shouqi and Asomani, Stephen Ntiri},
journal={Energies},
volume={12},
number={7},
pages={1287},
year={2019},
publisher={MDPI}
}
@INPROCEEDINGS{4393062,
author={Zhou, Wei and Habetler, Thomas G. and Harley, Ronald G.},
booktitle={2007 IEEE International Symposium on Diagnostics for Electric Machines, Power Electronics and Drives},
title={Bearing Condition Monitoring Methods for Electric Machines: A General Review},
year={2007},
volume={},
number={},
pages={3-6},
doi={10.1109/DEMPED.2007.4393062}}
@article{yang2016power,
title={Power consumption based android malware detection},
author={Yang, Hongyu and Tang, Ruiwen},
journal={Journal of Electrical and Computer Engineering},
volume={2016},
year={2016},
publisher={Hindawi}
}
@article{chawla2021machine,
title={Machine learning in wavelet domain for electromagnetic emission based malware analysis},
author={Chawla, Nikhil and Kumar, Harshit and Mukhopadhyay, Saibal},
journal={IEEE Transactions on Information Forensics and Security},
volume={16},
pages={3426--3441},
year={2021},
publisher={IEEE}
}
@article{wang2015measurement,
title={Measurement system of gear parameters based on machine vision},
author={Wang, Wencheng and Guan, Fengnian and Ma, Shiyong and Li, Jian},
journal={Measurement and Control},
volume={48},
number={8},
pages={242--248},
year={2015},
publisher={SAGE Publications Sage UK: London, England}
}
@ARTICLE{1702202,
author={Denning, D.E.},
journal={IEEE Transactions on Software Engineering},
title={An Intrusion-Detection Model},
year={1987},
volume={SE-13},
number={2},
pages={222-232},
doi={10.1109/TSE.1987.232894}}
@INPROCEEDINGS{9491765,
author={Alsmadi, Tibra and Alqudah, Nour},
booktitle={2021 International Conference on Information Technology (ICIT)},
title={A Survey on malware detection techniques},
year={2021},
volume={},
number={},
pages={371-376},
doi={10.1109/ICIT52682.2021.9491765}}
@inproceedings{10.1145/2940343.2940348,
author = {Malik, Jyoti and Kaushal, Rishabh},
title = {CREDROID: Android Malware Detection by Network Traffic Analysis},
year = {2016},
isbn = {9781450343466},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/2940343.2940348},
doi = {10.1145/2940343.2940348},
abstract = {Android, one of the most popular open source mobile operating system, is facing a lot of security issues. Being used by users with varying degrees of awareness complicates the problem further. Most of the security problems are due to maliciousness of android applications. The malwares get installed in mobile phones through various popular applications particularly gaming applications or some utility applications from various third party app-stores which are untrustworthy. A common feature of the malware is to access the sensitive information from the mobile device and transfer it to remote servers. For our work, we have confined ourselves to defining maliciousness as leakage of privacy information by Android application. In this paper we have proposed a method named as CREDROID which identifies malicious applications on the basis of their Domain Name Server(DNS) queries as well as the data it transmits to remote server by performing the in-depth analysis of network traffic logs in offline mode. Instead of performing signature based detection which is unable to detect polymorphic malwares, we propose a pattern based detection. Pattern in our work refers to the leakage of sensitive information being sent to the remote server. CREDROID is a semi-automated approach which works on various factors like the remote server where the application is connecting, data being sent and the protocol being used for communication for identifying the trustworthiness (credibility) of the application. In our work, we have observed that 63% of the applications from a standard dataset of malwares are generating network traffic which has been the focus of our work.},
booktitle = {Proceedings of the 1st ACM Workshop on Privacy-Aware Mobile Computing},
pages = {28--36},
numpages = {9},
keywords = {Android, malware detection, network traffic analysis},
location = {Paderborn, Germany},
series = {PAMCO '16}
}
@article{jelali2013statistical,
title={Statistical process control},
author={Jelali, Mohieddine},
journal={Control Performance Management in Industrial Automation: Assessment, Diagnosis and Improvement of Control Loop Performance},
pages={209--217},
year={2013},
publisher={Springer}
}
@inproceedings{tongaonkar2007inferring,
title={Inferring Higher Level Policies from Firewall Rules.},
author={Tongaonkar, Alok and Inamdar, Niranjan and Sekar, R},
booktitle={LISA},
volume={7},
pages={1--10},
year={2007}
}
@article{aly2005survey,
title={Survey on multiclass classification methods},
author={Aly, Mohamed},
journal={Neural Netw},
volume={19},
number={1-9},
pages={2},
year={2005},
publisher={Citeseer}
}
@misc{grandini2020metrics,
title={Metrics for Multi-Class Classification: an Overview},
author={Margherita Grandini and Enrico Bagli and Giorgio Visani},
year={2020},
eprint={2008.05756},
archivePrefix={arXiv},
primaryClass={stat.ML}
}
@misc{zenodo,
title={Evaluation Dataset for the Machine State Detector},
howpublished={\url{https://zenodo.org/record/7782702#.ZCR33byZNhE}},
year={2023},
}
@article{gupta2021novel,
title={A novel failure mode effect and criticality analysis (FMECA) using fuzzy rule-based method: A case study of industrial centrifugal pump},
author={Gupta, Gajanand and Ghasemian, Hamed and Janvekar, Ayub Ahmed},
journal={Engineering Failure Analysis},
volume={123},
pages={105305},
year={2021},
publisher={Elsevier}
}
@inproceedings{10.1145/2976749.2978353,
author = {Genkin, Daniel and Pachmanov, Lev and Pipman, Itamar and Tromer, Eran and Yarom, Yuval},
title = {ECDSA Key Extraction from Mobile Devices via Nonintrusive Physical Side Channels},
year = {2016},
isbn = {9781450341394},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/2976749.2978353},
doi = {10.1145/2976749.2978353},
abstract = {We show that elliptic-curve cryptography implementations on mobile devices are vulnerable to electromagnetic and power side-channel attacks. We demonstrate full extraction of ECDSA secret signing keys from OpenSSL and CoreBitcoin running on iOS devices, and partial key leakage from OpenSSL running on Android and from iOS's CommonCrypto. These non-intrusive attacks use a simple magnetic probe placed in proximity to the device, or a power probe on the phone's USB cable. They use a bandwidth of merely a few hundred kHz, and can be performed cheaply using an audio card and an improvised magnetic probe.},
booktitle = {Proceedings of the 2016 ACM SIGSAC Conference on Computer and Communications Security},
pages = {1626--1638},
numpages = {13},
keywords = {elliptic curve, side channel attack, electromagnetic analysis, power analysis},
location = {Vienna, Austria},
series = {CCS '16}
}
@article{randolph2020power,
title={Power side-channel attack analysis: A review of 20 years of study for the layman},
author={Randolph, Mark and Diehl, William},
journal={Cryptography},
volume={4},
number={2},
pages={15},
year={2020},
publisher={MDPI}
}
@article{micucci2017unimib,
title={Unimib shar: A dataset for human activity recognition using acceleration data from smartphones},
author={Micucci, Daniela and Mobilio, Marco and Napoletano, Paolo},
journal={Applied Sciences},
volume={7},
number={10},
pages={1101},
year={2017},
publisher={Multidisciplinary Digital Publishing Institute}
}
@article{truong2020selective,
title={Selective review of offline change point detection methods},
author={Truong, Charles and Oudre, Laurent and Vayatis, Nicolas},
journal={Signal Processing},
volume={167},
pages={107299},
year={2020},
publisher={Elsevier}
}
@inproceedings{10.1145/3371158.3371162,
author = {Narwariya, Jyoti and Malhotra, Pankaj and Vig, Lovekesh and Shroff, Gautam and Vishnu, T. V.},
title = {Meta-Learning for Few-Shot Time Series Classification},
year = {2020},
isbn = {9781450377386},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3371158.3371162},
doi = {10.1145/3371158.3371162},
abstract = {Deep neural networks (DNNs) have achieved state-of-the-art results on time series classification (TSC) tasks. In this work, we focus on leveraging DNNs in the often-encountered practical scenario where access to labeled training data is difficult, and where DNNs would be prone to overfitting. We leverage recent advancements in gradient-based meta-learning, and propose an approach to train a residual neural network with convolutional layers as a meta-learning agent for few-shot TSC. The network is trained on a diverse set of few-shot tasks sampled from various domains (e.g. healthcare, activity recognition, etc.) such that it can solve a target task from another domain using only a small number of training samples from the target task. Most existing meta-learning approaches are limited in practice as they assume a fixed number of target classes across tasks. We overcome this limitation in order to train a common agent across domains with each domain having different number of target classes, we utilize a triplet-loss based learning procedure that does not require any constraints to be enforced on the number of classes for the few-shot TSC tasks. To the best of our knowledge, we are the first to use meta-learning based pre-training for TSC. Our approach sets a new benchmark for few-shot TSC, outperforming several strong baselines on few-shot tasks sampled from 41 datasets in UCR TSC Archive. We observe that pre-training under the meta-learning paradigm allows the network to quickly adapt to new unseen tasks with small number of labeled instances.},
booktitle = {Proceedings of the 7th ACM IKDD CoDS and 25th COMAD},
pages = {28--36},
numpages = {9},
keywords = {Time Series Classification, Meta-Learning, Few-Shot Learning, Convolutional Neural Networks},
location = {Hyderabad, India},
series = {CoDS COMAD 2020}
}
@article{tang2019few,
title={Few-shot time-series classification with dual interpretability},
author={Tang, Wensi and Liu, Lu and Long, Guodong},
journal={Space},
volume={2},
number={T1},
pages={T1},
year={2019}
}
@INPROCEEDINGS{9647357,
author={Gupta, Priyanka and Bhaskarpandit, Sathvik and Gupta, Manik},
booktitle={2021 Digital Image Computing: Techniques and Applications (DICTA)},
title={Similarity Learning based Few Shot Learning for ECG Time Series Classification},
year={2021},
volume={},
number={},
pages={1-8},
doi={10.1109/DICTA52665.2021.9647357}}
@article{duin1997experiments,
title={Experiments with a featureless approach to pattern recognition},
author={Duin, Robert PW and de Ridder, Dick and Tax, David MJ},
journal={Pattern Recognition Letters},
volume={18},
number={11-13},
pages={1159--1166},
year={1997},
publisher={Elsevier}
}
@INPROCEEDINGS{8598355,
author={Dash, Prajna and Naik, Kshirasagar},
booktitle={2018 IEEE Electrical Power and Energy Conference (EPEC)},
title={A Very Deep One Dimensional Convolutional Neural Network (VDOCNN) for Appliance Power Signature Classification},
year={2018},
volume={},
number={},
pages={1-6},
doi={10.1109/EPEC.2018.8598355}}
@article{angelis2022nilm,
title={NILM applications: Literature review of learning approaches, recent developments and challenges},
author={Angelis, Georgios-Fotios and Timplalexis, Christos and Krinidis, Stelios and Ioannidis, Dimosthenis and Tzovaras, Dimitrios},
journal={Energy and Buildings},
pages={111951},
year={2022},
publisher={Elsevier}
}

7 binary image files added (not shown); sizes between 127 KiB and 306 KiB.

1218
DSD/qrs/llncs.cls Normal file

File diff suppressed because it is too large.

652
DSD/qrs/main.tex Normal file

@@ -0,0 +1,652 @@
% This is samplepaper.tex, a sample chapter demonstrating the
% LLNCS macro package for Springer Computer Science proceedings;
% Version 2.20 of 2017/10/04
%
% Updates from conference version
% - Proof of monotonicity of the number of unknown samples as a function of alpha (Sec: Influence of alpha)
% - Added a figure to illustrate the areas of capture as a function of alpha (Fig fig:areas)
\documentclass[sigconf, review, anonymous]{acmart}
\usepackage{graphicx}
\usepackage{xcolor}
\usepackage[toc,acronym,abbreviations,nonumberlist,nogroupskip]{glossaries-extra}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{tabularx}
\usepackage{algpseudocodex}
\usepackage{algorithm}
\hyphenation{dif-fe-rent}
\hyphenation{mo-di-fi-ca-tion}
\hyphenation{ope-ra-tions}
\hyphenation{acqui-ring}
\hyphenation{in-vo-lun-tary}
\hyphenation{re-le-vant}
\hyphenation{re-pre-sents}
\hyphenation{na-tu-ral-ly}
\hyphenation{col-lec-ting}
\hyphenation{sta-bi-li-ty}
\hyphenation{li-ne-ar}
\hyphenation{Figure}
\newtheorem{problem-statement}{Problem Statement}
% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following line
% to display URLs in blue roman font according to Springer's eBook style:
% \renewcommand\UrlFont{\color{blue}\rmfamily}
\newcommand\agd[1]{{\color{red}$\bigstar$}\footnote{agd: #1}}
\newcommand\SF[1]{{\color{blue}$\bigstar$}\footnote{sf: #1}}
\newcommand{\cn}{{\color{purple}[citation needed]}}
\newcommand{\pv}{{\color{orange}[passive voice]}}
\newcommand{\wv}{{\color{orange}[weak verb]}}
\citestyle{acmauthoryear}
%\renewcommand{\baselinestretch}{1.05}
\begin{document}
\input{acronyms}
%
\title{MAD: One-Shot Machine Activity Detector\\ for Physics-Based Cyber Security}
\author{Arthur Grisel-Davy}
\orcid{0000-0001-9293-035X}
\affiliation{%
\institution{University of Waterloo}
\city{Waterloo}
\state{Ontario}
\country{Canada}
}
\email{agriseld@uwaterloo.ca}
\author{Sebastian Fischmeister}
\affiliation{%
\institution{University of Waterloo}
\city{Waterloo}
\state{Ontario}
\country{Canada}
}
\email{sfishme@uwaterloo.ca}
\begin{abstract}
Side-channel analysis offers several advantages over traditional machine monitoring methods.
Low intrusiveness, independence from the host, data reliability, and difficulty to bypass are compelling arguments for using involuntary emissions as input for security policies.
However, side-channel information often comes in the form of unlabeled time series representing a proxy variable of the activity.
Enabling the definition and enforcement of high-level security policies requires extracting the state or activity of the system.
We present in this paper a novel time series, one-shot classifier called \gls{mad} specifically designed and evaluated for side-channel analysis.
\gls{mad} outperforms other traditional state detection solutions in terms of accuracy and, as importantly, Levenshtein distance of the state sequence.
\keywords{side-channel analysis, security rules, state detection}
\end{abstract}
\maketitle
\section{Introduction}
\gls{ids}s leverage different types of data to detect intrusions.
On one side, most solutions use labeled and actionable data, often provided by the system to protect.
In the software world, this data can be the resource usage \cite{1702202}, program source code \cite{9491765} or network traffic \cite{10.1145/2940343.2940348} leveraged by an \gls{hids} or \gls{nids}.
In the machine monitoring world, the input data can be the shape of a gear \cite{wang2015measurement} or the throughput of a pump \cite{gupta2021novel}.
On the other side, some methods consider only information that the system did not intentionally provide.
The system emits these activity by-products through physical media called side channels.
Common side-channel information for an embedded system includes power consumption \cite{yang2016power} or electromagnetic fields \cite{chawla2021machine}.
For a production machine, common side-channel information includes vibrations \cite{zhang2019numerical} or the chemical composition of fluids \cite{4393062}.
Side-channel information offers compelling advantages over agent-collected information.
First, the information is difficult to forge.
Because the monitored system is not involved in the data retrieval process, an attacker who has compromised the system cannot easily send forged information.
For example, if an attacker performs any computation on the system --- as most attacks do --- it will unavoidably affect a variety of different side channels.
Second, the side-channel information retrieval process is often non-intrusive and non-disruptive for the monitored system.
Measuring the power consumption of a computer or the vibrations of a machine does not involve the cooperation or modification of the system \cite{10.1145/2976749.2978353}.
This host-independence property is crucial for safety-critical or high-availability applications as the failure of one of the two --- monitored or monitoring --- systems does not affect the other.
These two properties --- reliable data and host-independence --- set physics-based monitoring solutions apart with distinct advantages and use cases.
However, using side-channel data introduces new challenges.
One obstacle to overcome when designing a physics-based solution is the interpretation of the data.
Because the data collection consists of measuring a physical phenomenon, the input data is often a discrete time series.
The values in these time series are not directly actionable.
In some cases, a threshold value is enough to assess the integrity of the system.
In such a case, comparing each value of the time series to the threshold is possible \cite{jelali2013statistical}.
However, whenever a simple threshold is not a reliable factor for the decision, a more advanced analysis of the time series is required to make it actionable.
The state of a machine is often represented by a specific pattern.
This pattern could be, for example, a succession of specific amplitudes or a frequency/average pair for periodic processes.
These patterns are impossible to reliably detect with a simple threshold method.
Identifying the occurrence and position of these patterns makes the data actionable and enables higher-level --- i.e., that work at a higher level of abstraction \cite{tongaonkar2007inferring} --- security and monitoring policies.
For example, a computer starting in the middle of the night or rebooting multiple times in a row should raise an alert for a possible intrusion or malfunction.
Rule-based \gls{ids}s using side channel information require an accurate and practical pattern detection solution.
Many data-mining algorithms assume that training data is cheap, meaning that acquiring large --- labeled --- datasets is achievable without major expense.
Unfortunately, collecting labeled data requires following a procedure and induce downtime for the machine which can be expensive.
Collecting many training samples during normal operations of the machine is more time-consuming as the machine's activity cannot be controlled.
A single sample of each pattern to be detected in the time series is a more convenient data requirement.
Collecting a sample is immediately possible after the installation of the measurement equipment during normal operations of the machine.
In this paper, we present \gls{mad}, a distance-based, one-shot pattern detection method for time series.
\gls{mad} focuses on providing pre-defined state detection from only one training sample per class.
This approach enables the analysis of side-channel information in contexts where the collection of large datasets is impractical.
A context selection algorithm lies at the core of \gls{mad} and yields stable classification of individual samples, which is important for the robustness of high-level security rules.
In experiments, \gls{mad} outperforms other approaches in accuracy and Levenshtein distance on various simulated, lab-captured, and public time-series datasets.
We will present the current related work on physics-based security and time series pattern detection in Section~\ref{sec:related}.
Then we will introduce the formal and practical definitions of our solution in Section~\ref{sec:statement} and~\ref{sec:solution}.
Finally, we will present the datasets considered in Section~\ref{sec:dataset} and the results in Section~\ref{sec:results} to finish with a discussion of the solution in Section~\ref{sec:discussion}.
\section{Related Work}\label{sec:related}
Side-channel analysis focuses on extracting information from involuntary emissions of a system.
This topic traces back to the seminal work of Paul C. Kocher.
He introduced power side-channel analysis to extract secrets from several cryptographic protocols \cite{kocher1996timing}.
This led to the new field of side-channel analysis \cite{randolph2020power}.
However, the potential of leveraging side-channel information for defense and security purposes remains mostly untapped.
Information leaked through involuntary emissions on different channels provides insight into the activities of a machine.
Acoustic emissions \cite{belikovetsky2018digital}, heat pattern signature \cite{al2016forensics} or power consumption \cite{10.1145/3571288, gatlin2019detecting, CHOU2014400}, can --- among other side-channels --- reveal information about a machine's activity.
Side-channel information collection generally results in time series objects to analyze.
There exists a variety of methods for analyzing time series.
For signature-based solutions, a specific extract of the data is compared to known-good references to assess the integrity of the host \cite{9934955, 9061783}.
This signature comparison enables the verification of expected and specific sections and requires that the sections of interest can be extracted and synchronized.
Another solution for detecting intrusions is the definition of security policies.
Security policies are sets of rules that describe wanted or unwanted behavior.
These rules are built on input data accessible to the \gls{ids} such as user activity \cite{ilgun1995state} or network traffic \cite{5563714, kumar2020integrated}.
However, the input data must meet each rule's requirements before the rule can apply.
This illustrates the gap between the side-channel analysis methods and the rule-based intrusion detection methods.
To apply security policies to side-channel information, it is necessary to first label the data.
The problem of identifying pre-defined patterns in unlabeled time series is referenced under various names in the literature.
The terms \textit{activity segmentation} or \textit{activity detection} are the most relevant for the problem we are interested in.
The state-of-the-art methods in this domain focus on human activities and leverage various sensors such as smartphones \cite{wannenburg2016physical}, cameras \cite{bodor2003vision} or wearable sensors \cite{uddin2018activity}.
These methods rely on large labeled datasets to train classification models and detect activities \cite{micucci2017unimib}.
For real-life applications, access to large labeled datasets may not be possible.
Another approach, more general than activity detection, uses \gls{cpd}.
\gls{cpd} is a sub-topic of time series analysis that focuses on detecting abrupt changes in a time series \cite{truong2020selective}.
It is assumed in many cases that these change points are representative of state transitions from the observed system.
However, \gls{cpd} is only the first step in state detection as classification of the detected segments remains necessary \cite{aminikhanghahi2017survey}.
Moreover, not all state transitions trigger abrupt changes in the time series statistics, and some states themselves include abrupt changes.
Overall, \gls{cpd} only fits a specific type of problem with stable states and abrupt transitions.
Neural networks rose in popularity for time series analysis with \gls{rnn}s.
Large \gls{cnn}s can perform pattern extraction in long time series, for example in the context of \gls{nilm} \cite{8598355}.
\gls{nilm} focuses on the problem of signal disaggregation.
In this problem, the signal comprises an aggregate of multiple signals, each with their own patterns \cite{angelis2022nilm}.
This problem shares many terms and core techniques with this paper, but the nature of the input data makes \gls{nilm} a distinct area of research.
The specific problem of classification with only one example of each class is called one-shot --- or few-shot --- classification.
This topic focuses on pre-extracted time series classification with few training samples, often using multi-level neural networks \cite{10.1145/3371158.3371162, 9647357}.
However, in the context of side-channel analysis, a time series contains many patterns that are not extracted.
Moreover, neural-based approaches lack interpretability, which can cause issues in the case of unforeseen time series patterns.
Simpler approaches with novelty detection capabilities are required when the output serves as input for rule-based processing.
Finally, Duin et al. investigate the problem of distance-based few-shot classification \cite{duin1997experiments}.
They present an approach based on the similarity between new objects and a dissimilarity matrix between items of the training set.
The similarities are evaluated with Nearest-Neighbor rules or \gls{svm}.
Their approach bears some interesting similarities with the one presented in this paper.
However, they evaluate their work on the recognition of handwritten numerals, which is far from the use case we are interested in.
\section{Problem Statement}\label{sec:statement}
%\gls{mad} focuses on detecting the state of a time series at any point in time.
We consider the problem as a multi-class, mono-label classification problem \cite{aly2005survey} for every sample in a time series.
The problem is multi-class because multiple states can occur in one time series, and therefore any sample is assigned one of multiple states.
The problem is mono-label because only one state is assigned to each sample.
The classification is a mapping from the sample space to the state space.
\begin{problem-statement}[\gls{mad}]
Given a discretized time series $t$ and a set of patterns $P=\{P_1,\dots, P_n\}$, identify a mapping $m:\mathbb{N}\longrightarrow P\cup \{\lambda\}$ such that every sample $t[i]$
maps to a pattern in $P\cup \{\lambda\}$ with the condition that the sample matches an occurrence of the pattern in $t$.
\end{problem-statement}
The time series $t: \mathbb{N} \longrightarrow \mathbb{R}$ is a finite, discretized, mono-variate, real-valued time series.
The patterns (also called training samples) $P_j \in P$ are of the same type as $t$.
Each pattern $P_j$ can take any length denoted $N_j$.
A sample $t[i]$ \textit{matches} a pattern $P_j \in P$ if there exists a substring of $t$ of length $N_j$ that includes the sample, such that a similarity measure between this substring and $P_j$ is below a pre-defined threshold.
The pattern $\lambda$ is the \textit{unknown} pattern assigned to the samples in $t$ that do not match any of the patterns in $P$.
\begin{figure}
\centering
\includegraphics[width=0.45\textwidth]{images/overview.pdf}
\caption{Illustration of the sample distance from one sample to each training example in a 2D space.}
\label{fig:overview}
\end{figure}
\section{Proposed Solution: MAD}\label{sec:solution}
\gls{mad}'s core idea separates it from other traditional sliding window algorithms.
In \gls{mad}, the sample window around the sample to classify dynamically adapts for optimal context selection.
This principle influences the design of the detector and requires the definition of new distance metrics.
Because the pattern lengths may differ, our approach requires distance metrics that are robust to length variations.
%For the following explanation, the pattern set $P$ refers to the provided patterns only $\{P\setminus \lambda\}$ --- unless specified otherwise.
We first define the fundamental distance metric as the normalized Euclidean distance between two time series $a$ and $b$ of the same length $N_a=N_b$
\begin{equation}
nd(a,b) = \dfrac{EuclideanDist(a,b)}{N_a}
\end{equation}
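For illustration, a minimal Python sketch of this normalized distance (the function name mirrors the notation $nd$; the NumPy-based implementation is an assumption, not part of the method definition):
\begin{verbatim}
import numpy as np

def nd(a, b):
    # Euclidean distance between two equal-length series,
    # normalized by their common length.
    a, b = np.asarray(a, float), np.asarray(b, float)
    assert len(a) == len(b)
    return np.linalg.norm(a - b) / len(a)
\end{verbatim}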
Using this normalized distance $nd$, we define the distance from a sample $t[i]$ to a pattern $P_j \in P$.
This is the sample distance $sd$ defined as
\begin{equation}\label{eq:sd}
sd(i,P_j) = \min_{k\in [i-N_j+1,\,i]} nd(t[k:k+N_j],P_j)
\end{equation}
%with $P_j$ the training sample corresponding to the state $j$, and $t$ the complete time series.
Computing the distance $sd(i,P_j)$ requires three steps: (1) select every substring of $t$ of length $N_j$ that contains the sample $t[i]$, (2) evaluate their normalized distance to the pattern $P_j$, and (3) take $sd(i,P_j)$ as the smallest of these distances.
For simplicity, Equation~\ref{eq:sd} omits the border conditions for the range of $k$.
When the sample position $i$ is less than $N_j$ or greater than $N_t-N_j$, the range adapts to only consider valid substrings.
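These three steps translate into a short sketch (reusing the \texttt{nd} helper above; the border handling follows the adapted range just described):
\begin{verbatim}
def sd(t, i, pattern):
    # Minimum normalized distance over every substring of
    # length N_j that contains the sample t[i].
    n = len(pattern)
    start = max(0, i - n + 1)   # left border condition
    stop = min(i, len(t) - n)   # right border condition
    return min(nd(t[k:k + n], pattern)
               for k in range(start, stop + 1))
\end{verbatim}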
Our approach uses a threshold-based method to decide what label to assign to a sample.
For each sample in $t$, the algorithm compares the distance $sd(i,P_j)$ to the threshold $T_j$.
The sample receives the label $j$ associated with the pattern $P_j$ that results in the smallest distance $sd(i,P_j)$ with $sd(i,P_j)<T_j$.
The minimum distance from the pattern $P_j$ to all other patterns $P_l$ with $l\neq j$ --- denoted $ID_j$ --- forms the basis of the threshold $T_j$.
Intuitively, the patterns in $P$ represent most of the patterns expected in the trace.
Thus, to decide that a substring matches a pattern $P_j$, it must match $P_j$ better than any other pattern $P_l$ with $l\neq j$ does.
Otherwise, by the distance metric, a pattern with another label matches $P_j$ better than the substring does, and assigning the label of $P_j$ to the substring would be counter-intuitive.
The inter-distance between $P_j$ to $P_l$, defined as
\begin{equation}
ID(P_j,P_l) = \min_{i\in[0,N_l-N_j]} nd(P_j,P_l[i:i+N_j])
\end{equation}
represents the smallest distance between $P_j$ and any substring of length $N_j$ from $P_l$ --- with $N_l>N_j$.
If $N_l<N_j$, then $ID(P_j,P_l) = ID(P_l,P_j)$.
In other words, when computing the inter-distance between two patterns, we slide the short pattern along the length of the long one and compute the normalized distance at every position to finally consider only the smallest of these distances as the inter-distance.
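A sketch of the inter-distance, sliding the shorter pattern along the longer one (again reusing \texttt{nd}):
\begin{verbatim}
def inter_distance(pj, pl):
    # Keep the smallest normalized distance over all positions
    # of the shorter pattern inside the longer one; symmetric.
    short, long_ = (pj, pl) if len(pj) <= len(pl) else (pl, pj)
    n = len(short)
    return min(nd(short, long_[i:i + n])
               for i in range(len(long_) - n + 1))
\end{verbatim}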
To fully define the threshold $T_j$, we introduce the shrinkage coefficient $\alpha$.
This coefficient, multiplied with the smallest inter-distance $ID_j$, forms the threshold $T_j$.
\begin{equation}
T_j = \alpha\times ID_j = \alpha \min_{l\in[1,n],\, l\neq j} \{ID(P_j,P_l)\}
\end{equation}
The shrinkage coefficient $\alpha$ provides some control over the confidence of the detector.
A small value shrinks the capture range of each label and leaves more samples classified as \textit{unknown}.
A large value leaves less area for the \textit{unknown} state and forces the detector to choose a label, even for samples unlike any pattern.
The \textit{unknown} label enables the detector to carry over the information of novelty to the output.
In cases where a substring does not resemble any pattern --- for example, in cases of anomalies, or unforeseen activities ---, the ability to inform of novel patterns enables a more granular definition of security policies.
Finally, we assign to each sample the label of the closest pattern with a distance lower than its threshold.
\begin{equation}
s_i = \underset{j\in[1,n]}{\arg\min}(sd(i,P_j) \textrm{ with } sd(i,P_j)<T_j)
\end{equation}
In the case where no distance is below the threshold, the sample defaults to the \textit{unknown} state.
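Putting the thresholds and the final assignment together, a minimal labeling sketch (reusing \texttt{sd} and \texttt{inter\_distance}; \texttt{None} stands for the \textit{unknown} pattern $\lambda$):
\begin{verbatim}
def thresholds(patterns, alpha):
    # T_j = alpha times the smallest inter-distance of P_j
    # to any other pattern.
    return [alpha * min(inter_distance(pj, pl)
                        for l, pl in enumerate(patterns) if l != j)
            for j, pj in enumerate(patterns)]

def label(t, i, patterns, T):
    # Closest pattern whose sample distance is below its
    # threshold; defaults to None (the unknown state).
    best, best_d = None, float("inf")
    for j, pj in enumerate(patterns):
        d = sd(t, i, pj)
        if d < T[j] and d < best_d:
            best, best_d = j, d
    return best
\end{verbatim}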
\subsection{Algorithm}
The algorithm for \gls{mad} follows three steps:
\begin{enumerate}
\item Compute the inter-distances and threshold values for the pattern set. The algorithm can reuse the result from this step for all subsequent detections with the same pattern set.
\item For each sample $t[i]$, compute the sample distance to each pattern $\{sd(i,p) \forall p\in P\}$.
\item Select the label by comparing the sample distances to the threshold.
\end{enumerate}
However, directly implementing this sequence of operations is not optimal, as it requires computing the distance from any substring to any pattern multiple times --- exactly once per sample in the substring.
A more efficient solution considers each substring only once.
In other words, iterating over the patterns rather than the samples is more efficient as it replaces distance computations with comparison operations.
The efficient implementation follows the operations:
\begin{enumerate}
\item Compute the inter-distances and threshold values for the pattern set --- no optimization at this step.
\item For every pattern $P_j$ of length $N_j$ in $P$, consider every substring of length $N_j$ in $t$ and compute the normalized distance $nd(t[i:i+N_j],P_j)$.
\item For every sample in the substring, store the minimum of the previously stored and newly computed normalized distance as the sample distance.
\item Select the label by comparing the sample distances to the thresholds.
\end{enumerate}
This results in the same final value for the sample distance $sd(i,P_j)$ with fewer computations of the normalized distance --- at the expense of additional, cheaper comparison operations.
Algorithm~\ref{alg:code} presents the implementation's pseudo-code.
\begin{algorithm}
\caption{Pseudo code for state detection.}
\label{alg:code}
\begin{algorithmic}[1]
\Require $t$ the time series of length $N_t$, $P$ the set of $n$ patterns, $\alpha$ the shrinkage coefficient.
\BeginBox
\LComment{First part: computation of the thresholds.}
\State $interDistances \gets nilMatrix(n,n)$
\State $thresholds \gets nilList(n)$
\For{$i \in [0,n-1]$}
\For{$j \in [0,n-1]$}
\If{$i\neq j$ and $interDistances[i,j] = Nil$}
\State $dist \gets ID(P[i],P[j])$
\State $interDistances[i,j] \gets dist$
\State $interDistances[j,i] \gets dist$
\EndIf
\EndFor
\State $thresholds[i] \gets \alpha \times min(interDistances[i,:])$
\EndFor
\EndBox
\BeginBox
\LComment{Second part: computation of the distances.}
\State $distances \gets nilMatrix(n,N_t)$
\State $labels \gets nilList(N_t)$
\For{$i \in [0,n-1]$}
\For{$k \in [0,N_t-N_{P_i}]$}
\State $dist \gets nd(t[k:k+N_{P_i}], P[i])$
\For{$l\in [k,k+N_{P_i}-1]$}
\State $distances[i,l] \gets min(distances[i,l], dist)$
\EndFor
\EndFor
\EndFor
\EndBox
\BeginBox
\LComment{Third part: selection of the label based on the distances.}
\For{$k \in [0,N_t-1]$}
\State $rowMin \gets Nil$
\State $distanceMin \gets \infty$
\For{$i \in [0,n-1]$}
\If{$distances[i,k] \leq thresholds[i]$ and $distances[i,k] < distanceMin$}
\State $rowMin \gets i$
\State $distanceMin \gets distances[i,k]$
\EndIf
\EndFor
\State $labels[k] \gets rowMin$
\EndFor
\EndBox
\State \Return $labels$
\end{algorithmic}
\end{algorithm}
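As a reading aid for Algorithm~\ref{alg:code}, a compact NumPy sketch of the efficient implementation (it reuses \texttt{nd} and \texttt{inter\_distance} and is not the reference implementation):
\begin{verbatim}
import numpy as np

def mad(t, patterns, alpha):
    n, N_t = len(patterns), len(t)
    # Part 1: thresholds from pairwise inter-distances.
    T = [alpha * min(inter_distance(patterns[j], patterns[l])
                     for l in range(n) if l != j)
         for j in range(n)]
    # Part 2: one nd computation per substring, spread over
    # every sample the substring covers.
    distances = np.full((n, N_t), np.inf)
    for j, pj in enumerate(patterns):
        m = len(pj)
        for k in range(N_t - m + 1):
            d = nd(t[k:k + m], pj)
            distances[j, k:k + m] = np.minimum(distances[j, k:k + m], d)
    # Part 3: closest pattern under its threshold; None = unknown.
    labels = []
    for k in range(N_t):
        valid = [j for j in range(n) if distances[j, k] < T[j]]
        labels.append(min(valid, key=lambda j: distances[j, k])
                      if valid else None)
    return labels
\end{verbatim}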
\subsection{Analysis}
\textbf{Time-Efficiency:}
\agd{Better time efficiency analysis and comparison with the efficiency of \gls{1nn}}
The time efficiency of the algorithm is expressed as a function of the number of normalized distance computations and the number of comparison operations.
Each part of the algorithm has its own time-efficiency expression with Algorithm~\ref{alg:code} showing each of the three parts.
The first part, dedicated to the threshold computation, is polynomial in the number of patterns and linear in the length of each pattern.
The second part, in charge of the distances computation, is linear in the number of patterns, the length of the time series, and the length of each pattern.
Finally, the third part, focusing on the final label selection, is linear in both the length of the time series and the number of patterns.
Overall, the actual detection computation --- second and third parts --- is linear in all input sizes.
Adding an additional value to the time series triggers the computation of one more distance value per pattern, hence the linear relationship.
Similarly, lengthening a pattern by one triggers one more comparison operation for each substring of the time series, hence the linear relationship.
Concluding from the analysis, the additional operations introduced by \gls{mad} over the traditional \gls{1nn} do not significantly impact the time efficiency of the detection that remains linear.
\textbf{Termination:}
Every part of the algorithm terminates.
The first part iterates on the patterns with two nested loops over the samples of two patterns.
No instruction modifies the patterns, which are all of finite length.
Thus the loops always terminate.
The second part iterates over the patterns and the time series with two nested loops.
Similarly to the first part, the time series is finite and never altered.
Thus the second part also terminates.
Finally, the third part uses the same loops as the second and also terminates.
Overall, \gls{mad} always terminates for any finite time series and finite set of finite patterns.
\textbf{Monotonicity of the Number of Unknown Samples:}\agd{find better title}
The number of unknown samples is monotonic as a function of $\alpha$: the sample distances $sd(i,P_j)$ do not depend on $\alpha$, while every threshold $T_j=\alpha\times ID_j$ scales linearly with it, so decreasing $\alpha$ only shrinks the capture range of each label.
Consequently, a sample classified as \textit{unknown} for a given $\alpha$ remains \textit{unknown} for every smaller $\alpha$.
\section{Evaluation}
The evaluation of \gls{mad} consists of detecting the states in time series from various machines.
We evaluate the performance of the proposed solution against other traditional methods to illustrate the capabilities and advantages of \gls{mad}.
\subsection{Performance Metrics}
We considered two metrics to illustrate the performance of \gls{mad}.
Performance evaluations of labeling systems traditionally use accuracy \cite{grandini2020metrics}.
Accuracy is defined as the number of correctly classified samples divided by the total number of samples.
However, accuracy only illustrates part of the performance.
In the context of state detection, we are interested in taking actions depending on the state of a system.
Detecting the start and stop times of each state is not as important as detecting the correct list of occurrences of states.
We are interested in making sure that the state is detected, even at the cost of some time inaccuracy.
The Levenshtein distance~\cite{4160958} illustrates the classifier's performance at detecting the correct list of states from a time series.
The Levenshtein distance is defined as the number of single-character edits --- insertions, deletions or substitutions --- between two strings.
The Levenshtein distance could use the raw detected labels list as input.
However, the raw label list embeds state detection timing information, to which the Levenshtein distance is very sensitive.
We first reduce the ground truth and the detected labels by removing immediate duplicates of labels.
This reduction removes timing information yet conserves the global order of state occurrences.
The Levenshtein distance between the ground truth and the detected labels is low if every state occurrence is correctly detected.
Similarly, the metric is high if state occurrences are missed, added, or mis-detected.
To remove length bias and make the metric comparable across datasets, we normalize the raw Levenshtein distance and define it as
\begin{equation}
levacc = \dfrac{Levenshtein(rgtruth,rlabels)}{max(rN_t,rN_l)}
\end{equation}
with $rgtruth$ and $rlabels$ respectively the reduced ground truth and reduced labels and $rN_t$ and $rN_l$ their length.
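A sketch of this metric as computed here, assuming a standard dynamic-programming Levenshtein distance:
\begin{verbatim}
def reduce_labels(labels):
    # Collapse immediate repeats: [a, a, b, b, a] -> [a, b, a].
    return [x for i, x in enumerate(labels)
            if i == 0 or labels[i - 1] != x]

def levenshtein(a, b):
    # Classic dynamic-programming edit distance.
    prev = list(range(len(b) + 1))
    for i, x in enumerate(a, 1):
        cur = [i]
        for j, y in enumerate(b, 1):
            cur.append(min(prev[j] + 1,              # deletion
                           cur[j - 1] + 1,           # insertion
                           prev[j - 1] + (x != y)))  # substitution
        prev = cur
    return prev[-1]

def levacc(gtruth, labels):
    rg, rl = reduce_labels(gtruth), reduce_labels(labels)
    return levenshtein(rg, rl) / max(len(rg), len(rl))
\end{verbatim}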
The Levenshtein distance provides complementary insights into the performance of the detection in this specific use case.
Figure~\ref{fig:metrics} illustrates the impact of an error on both metrics.
It is important to notice that zero represents the best Levenshtein distance and one the worst --- contrary to the accuracy.
\begin{figure}
\centering
\includegraphics[width=0.49\textwidth]{images/metric.pdf}
\caption{Accuracy or the Levenshtein distance alone cannot illustrate all types of errors. We consider both to provide a better evaluation of the performance.}
\label{fig:metrics}
\end{figure}
\subsection{Dataset}\label{sec:dataset}
\agd{include more datasets from REFIT. One per house would be perfect, but simply more is already good. Add in appendix why others are rejected.}
We evaluate the performance of \gls{mad} against seven time series.
One is a simulated signal composed of sine waves of varying frequency and average.
Four were captured in a lab environment on consumer-available machines (two NUC PCs and two wireless routers).
Finally, two were extracted from the REFIT dataset \cite{278e1df91d22494f9be2adfca2559f92} and correspond to home appliances during real-life use.
Table~\ref{tab:dataset} presents the times series and their characteristics.
\begin{table}
\centering
\begin{tabular}{lcc}
Name & Length (samples) & Number of states\\
\toprule
NUCPC-0 & 22700 & 11\\
NUCPC-1 & 7307 & 8\\
GENERATED & 15540 & 18\\
WAP-ASUS & 26880 & 18\\
WAP-LINKSYS & 22604 & 18\\
REFIT-H4A4 & 5366 & 17\\
REFIT-H4A1 & 100000 & 142\\
\bottomrule
\end{tabular}
\caption{Characteristics of the time series in the evaluation dataset.}
\label{tab:dataset}
\end{table}
The dataset aims to provide a diverse set of machines and state patterns to evaluate the performance.
For each time series, we generated the ground truth by manually labeling all sections of the time series using a custom-made range selection tool based on a Matplotlib \cite{Hunter:2007} application.
The dataset is publicly available \cite{zenodo}.
\textbf{Lab Captures:}
NUCPC-0, NUCPC-1, WAP-ASUS and WAP-LINKSYS correspond to lab-captured machine activity power consumption.
A commercial solution \cite{palitronica}, placed in series with the main power cable, measures the global power consumption of the machine.
We considered two types of machines.
The NUCPC-* are small form factor general-purpose computers.
The WAP-* are wireless access points from two different brands.
The states to detect on these computing machines are \textit{powered off}, \textit{boot sequence}, and \textit{on}.
With these states, it is possible to set up many security rules such as: \textit{"machine on after office hours"}, \textit{"X reboots in a row"} or \textit{"Coincident shutdown of Y machines within Z minutes"}.
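As an illustration, such rules reduce to simple checks over the detected state sequence. The encodings below are hypothetical sketches, not taken from our implementation; in particular, treating a full power-off as the end of a reboot streak is our assumption:
\begin{verbatim}
from itertools import groupby

def reduce_states(states):
    """Collapse immediate duplicate states."""
    return [s for s, _ in groupby(states)]

def on_after_hours(detections, start_hour=8, end_hour=20):
    """detections: (datetime, state) pairs; flag 'on' outside hours."""
    return [(t, s) for t, s in detections
            if s == "on" and not (start_hour <= t.hour < end_hour)]

def reboots_in_a_row(states, x=3):
    """True if x boot sequences occur without a power-off between."""
    streak = best = 0
    for s in reduce_states(states):
        if s == "boot sequence":
            streak += 1
            best = max(best, streak)
        elif s == "powered off":
            streak = 0
    return best >= x
\end{verbatim}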
\textbf{GENERATED:}
The GENERATED time series is produced algorithmically in three steps.
First, the algorithm randomly selects multiple frequency/average pairs.
Second, it generates 18 segments, each from a randomly selected pair and a random length.
Finally, it concatenates the segments to form the complete time series.
The patterns correspond to a minimal-length example of each pair.
This time series illustrates the capabilities of the proposed solution in a case where a simple threshold would fail.
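A sketch of the three-step generation procedure follows; the parameter ranges and segment lengths are our assumptions, as the exact values are not essential to the argument:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)

# Step 1: draw a pool of frequency/average pairs.
pairs = [(rng.uniform(0.01, 0.1), rng.uniform(0.0, 5.0))
         for _ in range(6)]

# Step 2: generate 18 segments from a random pair and length each.
segments = []
for _ in range(18):
    freq, avg = pairs[rng.integers(len(pairs))]
    t = np.arange(rng.integers(500, 1500))
    segments.append(avg + np.sin(2 * np.pi * freq * t))

# Step 3: concatenate the segments into the full time series.
series = np.concatenate(segments)
\end{verbatim}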
\textbf{REFIT:}
In 2015, D. Murray et al.~\cite{278e1df91d22494f9be2adfca2559f92} created the REFIT dataset for \gls{nilm} research.
This dataset is now widely used in this research area.
REFIT is composed of the global consumption of 20 houses along with the specific consumption of nine appliances per house.
The global house consumption does not fit the problem statement of this paper as multiple patterns overlap.
However, the individual consumption of some appliances fits the problem statement, and two were selected.
The REFIT-H4A1 is the first appliance of the fourth house and corresponds to a fridge.
The REFIT-H4A4 is the fourth appliance of the fourth house and corresponds to a washing machine.
The activity in this second time series was very sparse with long periods without consumption.
The no-consumption sections are not challenging (all detectors perform well on this type of pattern), make the manual labeling more difficult, and level all results up.
For this reason, we removed large sections of inactivity between active segments to make the time series more challenging without altering the relative ranking of the detectors.
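This pruning can be sketched as follows; the inactivity threshold and minimum gap length are our assumptions:
\begin{verbatim}
import numpy as np

def prune_inactivity(series, eps=1.0, min_gap=3600):
    """Drop samples inside zero-consumption runs longer than min_gap.
    series: 1-D NumPy array of power readings."""
    quiet = np.abs(series) < eps
    keep = np.ones(len(series), dtype=bool)
    start = None
    for i in range(len(series) + 1):
        q = quiet[i] if i < len(series) else False
        if q and start is None:
            start = i                  # a quiet run begins
        elif not q and start is not None:
            if i - start >= min_gap:
                keep[start:i] = False  # long quiet run: remove it
            start = None
    return series[keep]
\end{verbatim}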
\input{refit_table}
\subsection{Alternative Methods}
\agd{Explain better why the alternative methods are chosen.}
\agd{explain how the svm and mlp are trained.}
We implemented three alternative methods to compare with the performance of \gls{mad}.
These methods are commonly deployed to detect patterns in a time series.
The methods are: a \gls{1nn} detector, an \gls{svm} classifier, and an \gls{mlp} classifier.
More complex solutions like \gls{rnn} or \gls{cnn} models show good performance on time series analysis but require too much data to be applicable to one-shot classification.
All alternative methods rely on a sliding window to extract the substrings to classify.
The window is centered around the sample.
This choice (or any other placement of the window) implies that a number of samples proportional to the length of the longest pattern remains unclassified toward the ends of the time series.
The stride of the window is a single sample, so that every possible window is considered.
Each extracted window is sent to the classifier, and the resulting label is applied to the sample at the center of the window.
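A sketch of this sliding-window labeling, assuming a scikit-learn-style \texttt{predict} interface and an odd window size $w$ (both assumptions are ours):
\begin{verbatim}
def label_series(series, classifier, w):
    """Stride-one window centered on each sample (w assumed odd);
    samples within w // 2 of either end remain unclassified.
    series: 1-D NumPy array."""
    labels = ["unclassified"] * len(series)
    half = w // 2
    for c in range(half, len(series) - half):
        window = series[c - half:c + half + 1]
        labels[c] = classifier.predict(window.reshape(1, -1))[0]
    return labels
\end{verbatim}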
The alternative detectors are not designed to handle variable-length inputs.
For the \gls{svm} and \gls{mlp} detectors, the window size is chosen shorter than the shortest pattern.
The training sample extraction algorithm slides the window along every pattern to extract all possible substrings.
These substrings constitute the training dataset, with multiple samples per pattern.
The \gls{1nn} considers one window per pattern length around each sample.
Each window is compared to the pattern of the corresponding length, and the normalized Euclidean distance drives the decision.
Overall, it is possible to adapt these methods to variable-length patterns, but \gls{mad} is the only pattern-length-agnostic method by design.
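For reference, the training-set extraction for the \gls{svm} and \gls{mlp} baselines can be sketched as below; the function name and the dictionary input format are ours:
\begin{verbatim}
import numpy as np

def extract_training_set(patterns, w):
    """patterns: dict mapping a state label to its example pattern.
    Every length-w substring of a pattern becomes one training
    sample for that pattern's class."""
    X, y = [], []
    for label, pattern in patterns.items():
        for start in range(len(pattern) - w + 1):
            X.append(pattern[start:start + w])
            y.append(label)
    return np.asarray(X), np.asarray(y)
\end{verbatim}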
\subsection{Results}\label{sec:results}
The benchmark consists of detecting the label of every sample of each time series with each method and computing the performance metrics.
The detectors that require training (\gls{svm} and \gls{mlp}) were re-trained for every evaluation.
Figure~\ref{fig:res} presents the results.
\gls{mad} is consistently at least as accurate as the alternative methods.
The Levenshtein distance illustrates how \gls{mad} provides a smoother and less noisy labeling.
This stability introduces fewer state detection errors that could falsely trigger security rules.
With both performance metrics combined, \gls{mad} outperforms the other methods.
\begin{figure*}
\centering
\includegraphics[width=\textwidth]{images/dsd_acc.pdf}
\caption{Performance of the different methods on all the datasets.}
\label{fig:res}
\end{figure*}
\subsection{Influence of $\alpha$}
The shrink coefficient $\alpha$ is the only hyperparameter of the detector.
$\alpha$ controls the similarity threshold that a substring must cross to qualify as a match to a pattern, and takes its values in $\mathbb{R}_*^+$.
Its default value is one, which follows the intuitive reasoning presented in Section~\ref{sec:solution}.
To better understand the influence of the shrink coefficient, the algorithm can be perceived as a 2D area segmentation problem.
Let us consider the 2D plane where each pattern has a position based on its shape.
A substring to classify also has a position in the plane and a distance to each pattern (see bottom part of Figure~\ref{fig:overview}).
During classification, the substring takes the label of the closest pattern.
For any pattern $P_j$, the set of positions in the plane that are assigned to $P_j$ --- i.e., the set of positions for which $P_j$ is the closest pattern --- is called the area of attraction of $P_j$.
In a classic \gls{1nn} context, every point in the plane is in the area of attraction of one pattern.
This infinite area of attraction is not a desirable feature in this context.
Let us consider now a time series exhibiting anomalous or unforeseen behavior.
Some substrings in this time series do not resemble any of the provided patterns.
In an infinite area of attraction context, the anomalous points are assigned to a pattern, even if they poorly match it.
As a result, the behavior of the security rule can become unpredictable as anomalous points can receive a seemingly random label.
A more desirable behavior for the state detection system is to report the presence of unforeseen behavior.
This behavior naturally emerges when the areas of attraction of the patterns are limited to a finite size.
The shrink coefficient $\alpha$ --- through the modification of the threshold $T_j$ --- provides control over the shrink of the areas of attraction.
The lower the value of $\alpha$, the smaller the area of attraction around each pattern.
Applying a coefficient to the thresholds reduces the radius of each area of attraction; it is not a homothety of the initial areas.
In other words, the shrink does not preserve the shape of the areas.
For a value $\alpha < 0.5$, all areas become disks (in the 2D representation) and all shape information is lost.
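Concretely, the decision rule induced by the thresholds can be sketched as follows, where \texttt{intrinsic} holds the reference distances $ID_j$ used to define $T_j = \alpha \times ID_j$ (the function and variable names are ours):
\begin{verbatim}
def classify(distances, intrinsic, alpha=1.0):
    """distances, intrinsic: dicts keyed by pattern label.
    A sample is matched to its nearest pattern only if the distance
    falls below T_j = alpha * ID_j; otherwise it is 'unknown'."""
    best = min(distances, key=distances.get)
    if distances[best] <= alpha * intrinsic[best]:
        return best
    return "unknown"
\end{verbatim}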
The impact of the $\alpha$ coefficient on the classification is monotonic and predictable.
Because $\alpha$ influences the thresholds, changing $\alpha$ moves the transitions in the detected labels.
In other words, a lower value of $\alpha$ expands the unknown segments while a higher value shrinks them until they disappear.
Figure~\ref{fig:alpha_impact} illustrates the impact of $\alpha$ on the width of the unknown segments.
The impact of $\alpha$ on the number of unknown samples is also monotonic.
\begin{proof}
We show that the number of unknown samples is a monotonically non-increasing function of $\alpha$.
Consider first $\alpha=0$.
In this case, the threshold for every pattern $P_j\in P$ is $T_j = \alpha\times ID_j = 0$.
With every $T_j=0$, no sample can have a distance below a threshold, and every sample is labeled as \textit{unknown}.
Now let $\alpha$ increase from a value $\alpha_0$ to $\alpha_1 = \alpha_0 + \delta$ with $\delta \in \mathbb{R}_*^+$.
Increasing $\alpha$ increases every threshold $T$ from a value $T_0$ to $T_1$:
\begin{equation}
\alpha_0 < \alpha_1 \Rightarrow T_0 < T_1 .
\end{equation}
For every value of a threshold $T$, let $S_T$ denote the set of all samples whose distance falls below the threshold.
When a threshold increases from $T_0$ to $T_1$, every sample in $S_{T_0}$ also belongs to $S_{T_1}$, by transitivity of the order on $\mathbb{R}$.
Samples may additionally belong to $S_{T_1}$ but not to $S_{T_0}$ if their distance falls between $T_0$ and $T_1$.
Hence, $S_{T_0}$ is a subset of $S_{T_1}$, and the cardinality of $S_T$ as a function of $T$ is monotonically non-decreasing.
We conclude that the number of unknown samples, i.e., samples above every threshold, is a monotonically non-increasing function of $\alpha$.
\end{proof}
Figure~\ref{fig:alpha} presents the number of unknown samples in the classification of the NUCPC-1 time series based on the value of $\alpha$.
\begin{figure}
\centering
\includegraphics[width=0.49\textwidth]{images/alpha.pdf}
\caption{Evolution of the number of unknown samples based on the value of the shrink coefficient $\alpha$.}
\label{fig:alpha}
\end{figure}
\begin{figure}
\centering
\includegraphics[width=0.49\textwidth]{images/alpha_impact.pdf}
\caption{Behavior of the classifier with different values of $\alpha$. A lower value of $\alpha$ expands the unknown sections (in orange).}
\label{fig:alpha_impact}
\end{figure}
\begin{figure*}
\centering
\includegraphics[width=\textwidth]{images/areas.pdf}
\caption{2D visualization of the areas of attraction around each pattern as $\alpha$ changes. When $\alpha \gg 2$, the areas of attraction tend toward those of a classic \gls{1nn}.}
\label{fig:areas}
\end{figure*}
\section{Discussion}\label{sec:discussion}
In this section we highlight specific aspects of the proposed solution.
Side-channel based state detection enables a more robust security policy enforcement.
Let us consider the classic case of some security policies in a company.
The office hours are set between 8 am and 8 pm.
Outside of office hours, a security policy specifies that no computer should be on --- or should not be awake.
The traditional way of enforcing such policies would be to have a server evaluate the state of each computer remotely (via a PING command, for example) or to have an agent on each computer send its state to a server.
Both cases are highly susceptible to bypass.
A local attacker could boot a system on a secondary OS and immediately disable all agents on the machine.
A remote attacker could infect the machine and forge the reported data.
Any attacker that can disable the network connection would make the activities invisible to the policy enforcement system.
None of these evasion methods affects a side-channel intrusion detection system.
Whatever the motivations of the attacker, there is no malicious operation that does not require the machine to consume power.
The capability to detect the state of a system independently of the cooperation of the system itself is a major step forward in enabling robust security policy enforcement on computing devices.
The proposed method has some limitations that are important to acknowledge.
The current version of \gls{mad} is tailored for a specific use case.
The goal is to enable high-level security policies with a secure and reliable state detection of a machine from a time series.
The purpose of the state detection is not anomaly or novelty detection at the time series level.
For this reason, the patterns to be detected by \gls{mad} bear some limitations.
First, the patterns must be distinct.
If two patterns share a significant portion of their time series, \gls{mad} will struggle to separate them, leading to unstable results.
Second, the states must be hand selected.
The data requirement is extremely low --- only one sample per pattern --- so the selected samples must be reliable.
For now, a human expert selects the best patterns.
While there is nothing particularly difficult in the selection, it is still a highly manual process that we hope to automate in future iterations.
Finally, the states must be consistent.
If a state has an unpredictable signature, i.e., each occurrence displays a significantly different pattern, \gls{mad} will not be able to detect the occurrences reliably.
\gls{mad} is not limited to one-shot cases and can leverage more labeled data.
\gls{mad} is based on a \gls{1nn}, so the evolution to \gls{knn} is natural.
If more than one pattern is available for a state, \gls{mad} applies the same detection method, only with multiple patterns leading to the same label.
The number of training samples per class can be unbalanced, and the training samples within a class can have different lengths.
\gls{mad} preserves the versatility of a \gls{knn} solution in this regard.
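A sketch of this extension (the pattern identifiers below are hypothetical): several pattern entries simply map to the same state label, and the decision rule is otherwise unchanged:
\begin{verbatim}
# Several patterns may share one state label.
pattern_labels = {"boot-a": "boot sequence",
                  "boot-b": "boot sequence",
                  "on-a": "on"}

def classify_multi(distances, intrinsic, alpha=1.0):
    """distances, intrinsic: dicts keyed by pattern id."""
    best = min(distances, key=distances.get)
    if distances[best] <= alpha * intrinsic[best]:
        return pattern_labels[best]  # map pattern to its state
    return "unknown"
\end{verbatim}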
\gls{mad} remains time-efficient compared to a classic \gls{1nn}.
Although there are more operations to perform to evaluate all possible windows around a sample, the impact on detection time is small.
Over all the datasets considered, the time for \gls{mad} was, on average, 14\% higher than the time for the \gls{1nn}.
\gls{mad} is also slower than \gls{svm} and faster than \gls{mlp}, but comparison to other methods is less relevant as computation time is highly sensitive to implementation, and no optimization was attempted.
Finally, because \gls{mad} is distance-based and window-based, parallelization is naturally applicable and can significantly reduce the processing time.
\section{Conclusion}
We present \gls{mad}, a novel solution to enable high-level security policy enforcement from side channel information.
Leveraging side channel information requires labeling samples to discover the state of the monitored system.
Additionally, in the use cases where side-channels are leveraged, collecting large labeled datasets can be challenging.
\gls{mad} is designed around three core features: low data requirement, flexibility of the detection capabilities, and stability of the results.
Built as a variation of a traditional \gls{1nn}, \gls{mad} uses a dynamic window placement that always provides the most relevant context for sample classification.
A single hyperparameter, $\alpha$, controls the confidence of the detector and the trade-off between unclassified and misclassified samples.
The comparison to traditional state detection methods highlights the potential of \gls{mad} for the pre-processing of raw data for security applications.
\bibliographystyle{splncs04}
\bibliography{biblio}
\end{document}

19
DSD/qrs/refit_table.tex Normal file
View file

@ -0,0 +1,19 @@
\begin{table}[h]
\begin{tabular}{@{}cll@{}}
\toprule
\multicolumn{1}{l}{House} & Appliance & States \\ \midrule
\multirow{9}{*}{1} & 1. Fridge & ON (high), OFF (low).\\
 & 2. Chest Freezer & ON (high), OFF (low).\\
 & 3. Upright Freezer & ON (high), OFF (low).\\
 & 4. Tumble Dryer & No clearly defined state.\\
 & 5. Washing Machine & ON, Spike, End.\\
 & 6. Dishwasher & ON (pattern of 3 high), OFF (low).\\
 & 7. Computer Site & ON, High Load, OFF.\\
 & 8. Television Site & ON, OFF, SLEEP.\\
 & 9. Electric Heater & Heating, Stable Temperature, OFF.\\ \midrule
\multirow{4}{*}{2} & & \\
 & & \\
 & & \\
 & & \\ \bottomrule
\end{tabular}
\end{table}