2019

  1. T. De Matteis, G. Mencagli, D. De Sensi, M. Torquati, and M. Danelutto, “Gasser: an auto-tunable system for general sliding-window streaming operators on gpus,” Ieee access, vol. 7, pp. 48753-48769, 2019. doi:10.1109/ACCESS.2019.2910312
    [Abstract] [BibTeX]
    @article{8688411,
      author     = {De Matteis, Tiziano and Mencagli, Gabriele and De Sensi, Daniele and Torquati, Massimo and Danelutto, Marco},
      title      = {{GASSER}: An Auto-Tunable System for General Sliding-Window Streaming Operators on {GPUs}},
      journal    = {{IEEE} Access},
      year       = {2019},
      volume     = {7},
      pages      = {48753--48769},
      doi        = {10.1109/ACCESS.2019.2910312},
      issn       = {2169-3536},
      openaccess = {https://ieeexplore.ieee.org/document/8688411},
      keywords   = {graphics processing units;optimisation;parallel processing;query processing;popular streaming system;scalable streaming system;completely general queries;incremental queries;GASSER;auto-tunable system;general sliding-window streaming operators;stream processing systems;high-volume data streams;commodity machines;distributed architectures;coprocessors;performance efficiency;data stream;parallelism;configuration parameters;optimal value;graphical processing units;data-parallel tasks;GPU processing;different processing models;streaming paradigm advocates;tuple-at-a-time processing model;sliding-window operators;completely general functions;parallel processing;nonincremental queries;existing GPU-based;auto-tuning approach;Microsoft Windows;Graphics processing units;Parallel processing;Throughput;Windows;Task analysis;Prototypes;Data stream processing;sliding-window queries;GPU processing;autotuning;self-configuring systems},
      abstract   = {Today's stream processing systems handle high-volume data streams in an efficient manner. To achieve this goal, they are designed to scale out on large clusters of commodity machines. However, despite the efficient use of distributed architectures, they lack support to co-processors like graphical processing units (GPUs) ready to accelerate data-parallel tasks. The main reason for this lack of integration is that GPU processing and the streaming paradigm have different processing models, with GPUs needing a bulk of data present at once while the streaming paradigm advocates a tuple-at-a-time processing model. This paper contributes to fill this gap by proposing Gasser, a system for offloading the execution of sliding-window operators on GPUs. The system focuses on completely general functions by targeting the parallel processing of non-incremental queries that are not supported by the few existing GPU-based streaming prototypes. Furthermore, Gasser provides an auto-tuning approach able to automatically find the optimal value of the configuration parameters (i.e., batch length and the degree of parallelism) needed to optimize throughput and latency with the given query and data stream. The experimental part assesses the performance efficiency of Gasser by comparing its peak throughput and latency against Apache Flink, a popular and scalable streaming system. Furthermore, we evaluate the penalty induced by supporting completely general queries against the performance achieved by the state-of-the-art solution specifically optimized for incremental queries. Finally, we show the speed and accuracy of the auto-tuning approach adopted by Gasser, which is able to self-configure the system by finding the right configuration parameters without manual tuning by the users.},
    }

2018

  1. A. Conte, T. De Matteis, D. De Sensi, R. Grossi, A. Marino, and L. Versari, “D2k: scalable community detection in massive networks via small-diameter k-plexes,” in Proceedings of the 24th acm sigkdd international conference on knowledge discovery & data mining, New York, NY, USA, 2018, pp. 1272-1281. doi:10.1145/3219819.3220093
    [Abstract] [BibTeX] [URL] [Download PDF]
    @inproceedings{kdd:18,
      author     = {Conte, Alessio and De Matteis, Tiziano and De Sensi, Daniele and Grossi, Roberto and Marino, Andrea and Versari, Luca},
      title      = {{D2K}: Scalable Community Detection in Massive Networks via Small-Diameter k-Plexes},
      booktitle  = {Proceedings of the 24th {ACM} {SIGKDD} International Conference on Knowledge Discovery \& Data Mining},
      series     = {KDD '18},
      year       = {2018},
      isbn       = {978-1-4503-5552-0},
      location   = {London, United Kingdom},
      pages      = {1272--1281},
      numpages   = {10},
      doi        = {10.1145/3219819.3220093},
      url        = {http://doi.acm.org/10.1145/3219819.3220093},
      acmid      = {3220093},
      publisher  = {ACM},
      address    = {New York, NY, USA},
      keywords   = {community discovery, graph enumeration, k-plexes, parallel programming},
      openaccess = {https://dl.acm.org/authorize?N666390},
      videopitch = {https://www.youtube.com/watch?v=zF2Hz1wq9eM},
      pdf        = {http://pages.di.unipi.it/desensi/assets/pdf/2018_KDD.pdf},
      abstract   = {This paper studies kplexes, a well known pseudo-clique model for network communities. In a kplex, each node can miss at most $k-1$ links. Our goal is to detect large communities in today's real-world graphs which can have hundreds of millions of edges. While many have tried, this task has been elusive so far due to its computationally challenging nature: kplexes and other pseudo-cliques are harder to find and more numerous than cliques, a well known hard problem. We present D2K, which is the first algorithm able to find large kplexes of very large graphs in just a few minutes. The good performance of our algorithm follows from a combination of graph-theoretical concepts, careful algorithm engineering and a high-performance implementation. In particular, we exploit the low degeneracy of real-world graphs, and the fact that large enough kplexes have diameter~2. We validate a sequential and a parallel/distributed implementation of D2K on real graphs with up to half a billion edges.},
    }

  2. D. De Sensi, T. De Matteis, and M. Danelutto, “Simplifying self-adaptive and power-aware computing with nornir,” Future generation computer systems, 2018. doi:10.1016/j.future.2018.05.012
    [Abstract] [BibTeX] [URL] [Download PDF]
    @article{nornir:fgcs18,
      author   = {De Sensi, Daniele and De Matteis, Tiziano and Danelutto, Marco},
      title    = {Simplifying self-adaptive and power-aware computing with {Nornir}},
      journal  = {Future Generation Computer Systems},
      year     = {2018},
      issn     = {0167-739X},
      doi      = {10.1016/j.future.2018.05.012},
      url      = {https://www.sciencedirect.com/science/article/pii/S0167739X17326699},
      pdf      = {http://pages.di.unipi.it/desensi/assets/pdf/2018_FGCS.pdf},
      keywords = {Self-adaptive, Power-aware, Quality of service, Data stream processing, Fog computing, Parallel computing},
      abstract = {Self-adaptation is an emerging requirement in parallel computing. It enables the dynamic selection of resources to allocate to the application in order to meet performance and power consumption requirements. This is particularly relevant in Fog Applications, where data is generated by a number of devices at a varying rate, according to users' activity. By dynamically selecting the appropriate number of resources it is possible, for example, to use at each time step the minimum amount of resources needed to process the incoming data. Implementing such kind of algorithms may be a complex task, due to low-level interactions with the underlying hardware and to non-intrusive and low-overhead monitoring of the applications. For these reasons, in this paper we propose Nornir, a C++-based framework, which can be used to enforce performance and power consumption constraints on parallel applications running on shared memory multicores. The framework can be easily customized by algorithm designers to implement new self-adaptive policies. By instrumenting the applications in the PARSEC benchmark, we provide to strategy designers a wide set of applications already interfaced to Nornir. In addition to this, to prove its flexibility, we implemented and compared several state-of-the-art existing policies, showing that Nornir can also be used to easily analyze different algorithms and to provide useful insights on them.},
    }

  3. D. De Sensi, T. De Matteis, and M. Danelutto, “Nornir: a customizable framework for autonomic and power-aware applications,” in Euro-par 2017: parallel processing workshops, 2018, pp. 42-54. doi:10.1007/978-3-319-75178-8_4
    [Abstract] [BibTeX] [URL] [Download PDF] [Slides]
    @inproceedings{nornir:autodasp17,
      author    = {De Sensi, Daniele and De Matteis, Tiziano and Danelutto, Marco},
      editor    = {Heras, Dora B. and Bouge, Luc},
      title     = {{Nornir}: A Customizable Framework for Autonomic and Power-Aware Applications},
      booktitle = {Euro-Par 2017: Parallel Processing Workshops},
      year      = {2018},
      publisher = {Springer International Publishing},
      pages     = {42--54},
      isbn      = {978-3-319-75178-8},
      doi       = {10.1007/978-3-319-75178-8_4},
      url       = {https://link.springer.com/chapter/10.1007/978-3-319-75178-8_4},
      slides    = {https://docs.google.com/presentation/d/1PJ9gn_jIdApjrK1-wB3gnAB2PPYOsocxrqMTB96HI2E/edit?usp=sharing},
      pdf       = {http://pages.di.unipi.it/desensi/assets/pdf/2017_AutoDasp.pdf},
      abstract  = {A desirable characteristic of modern parallel applications is the ability to dynamically select the amount of resources to be used to meet requirements on performance or power consumption. In many cases, providing explicit guarantees on performance is of paramount importance. In streaming applications, this is related with the concept of elasticity, i.e. being able to allocate the proper amount of resources to match the current demand as closely as possible. Similarly, in other scenarios, it may be useful to limit the maximum power consumption of an application to do not exceed the power budget. In this paper we propose Nornir, a customizable C++ framework for autonomic and power-aware parallel applications on shared memory multicore machines. Nornir can be used by autonomic strategy designers to implement new algorithms and by application users to enforce requirements on applications.},
    }

  4. M. Torquati, T. Menga, T. De Matteis, D. De Sensi, and G. Mencagli, “Reducing message latency and cpu utilization in the caf actor framework,” in 2018 26th euromicro international conference on parallel, distributed and network-based processing (pdp), 2018, pp. 145-153. doi:10.1109/PDP2018.2018.00028
    [Abstract] [BibTeX] [URL] [Download PDF]
    @inproceedings{cafpdp18,
      author    = {Torquati, Massimo and Menga, Tullio and De Matteis, Tiziano and De Sensi, Daniele and Mencagli, Gabriele},
      title     = {Reducing Message Latency and {CPU} Utilization in the {CAF} Actor Framework},
      booktitle = {2018 26th Euromicro International Conference on Parallel, Distributed and Network-based Processing ({PDP})},
      year      = {2018},
      month     = mar,
      pages     = {145--153},
      doi       = {10.1109/PDP2018.2018.00028},
      url       = {https://ieeexplore.ieee.org/abstract/document/8374451/},
      pdf       = {http://pages.di.unipi.it/desensi/assets/pdf/2018_PDP.pdf},
      keywords  = {C++ languages;Computational modeling;Message systems;Power demand;Programming;Runtime;Throughput;Actor model;CAF;message latency;multi-cores;polling strategies;work-stealing},
      abstract  = {In this work, we consider the C++ Actor Framework (CAF), a recent proposal that revamped the interest in building concurrent and distributed applications using the actor programming model in C++. CAF has been optimized for high-throughput computing, whereas message latency between actors is greatly influenced by the message data rate: at low and moderate rates the latency is higher than at high data rates. To this end, we propose a modification of the polling strategies in the work-stealing CAF scheduler, which can reduce message latency at low and moderate data rates up to two orders of magnitude without compromising the overall throughput and message latency at maximum pressure. The technique proposed uses a lightweight event notification protocol that is general enough to be used to optimize the runtime of other frameworks experiencing similar issues.},
    }

2017

  1. M. Danelutto, T. De Matteis, D. De Sensi, G. Mencagli, M. Torquati, M. Aldinucci, and P. Kilpatrick, “The rephrase extended pattern set for data intensive parallel computing,” International journal of parallel programming, pp. 74-93, 2017. doi:10.1007/s10766-017-0540-z
    [Abstract] [BibTeX] [URL]
    @article{rephrase:ijpp17,
      author     = {Danelutto, Marco and De Matteis, Tiziano and De Sensi, Daniele and Mencagli, Gabriele and Torquati, Massimo and Aldinucci, Marco and Kilpatrick, Peter},
      title      = {The {RePhrase} Extended Pattern Set for Data Intensive Parallel Computing},
      journal    = {International Journal of Parallel Programming},
      year       = {2017},
      month      = nov,
      day        = {28},
      pages      = {74--93},
      issn       = {1573-7640},
      doi        = {10.1007/s10766-017-0540-z},
      url        = {https://doi.org/10.1007/s10766-017-0540-z},
      openaccess = {http://rdcu.be/zN6c},
      abstract   = {We discuss the extended parallel pattern set identified within the EU-funded project RePhrase as a candidate pattern set to support data intensive applications targeting heterogeneous architectures. The set has been designed to include three classes of pattern, namely (1) core patterns, modelling common, not necessarily data intensive parallelism exploitation patterns, usually to be used in composition; (2) high level patterns, modelling common, complex and complete parallelism exploitation patterns; and (3) building block patterns, modelling the single components of data intensive applications, suitable for use---in composition---to implement patterns not covered by the core and high level patterns. We discuss the expressive power of the RePhrase extended pattern set and results illustrating the performances that may be achieved with the FastFlow implementation of the high level patterns.},
    }

  2. M. Danelutto, T. De Matteis, D. De Sensi, and M. Torquati, “Evaluating concurrency throttling and thread packing on smt multicores,” in Proceedings of the 25th euromicro international conference on parallel, distributed, and network-based processing, PDP 2017, 2017, pp. 219-223. doi:10.1109/PDP.2017.39
    [Abstract] [BibTeX] [URL] [Download PDF] [Slides]
    @inproceedings{cttp:pdp17,
      author    = {Danelutto, Marco and De Matteis, Tiziano and De Sensi, Daniele and Torquati, Massimo},
      title     = {Evaluating Concurrency Throttling and Thread Packing on {SMT} Multicores},
      booktitle = {Proceedings of the 25th Euromicro International Conference on Parallel, Distributed, and Network-Based Processing, {PDP} 2017},
      location  = {St. Petersburg, Russia},
      year      = {2017},
      pages     = {219--223},
      doi       = {10.1109/PDP.2017.39},
      url       = {http://ieeexplore.ieee.org/document/7912648/},
      slides    = {https://docs.google.com/presentation/d/1qdiJIPpQ19rzgifHwoHYlwfQ0RurBz8xdvSz5zG0xco/edit?usp=sharing},
      pdf       = {http://pages.di.unipi.it/desensi/assets/pdf/2017_PDP.pdf},
      abstract  = {Power-aware computing is gaining an increasing attention both in academic and industrial settings. The problem of guaranteeing a given QoS requirement (either in terms of performance or power consumption) can be faced by selecting and dynamically adapting the amount of physical and logical resources used by the application. In this study, we considered standard multicore platforms by taking as a reference approaches for power-aware computing two well-known dynamic reconfiguration techniques: Concurrency Throttling and Thread Packing. Furthermore, we also studied the impact of using simultaneous multithreading (e.g., Intel's HyperThreading) in both techniques. In this work, leveraging on the applications of the PARSEC benchmark suite, we evaluate these techniques by considering performance-power trade-offs, resource efficiency, predictability and required programming effort. The results show that, according to the comparison criteria, these techniques complement each other.},
    }

  3. M. Danelutto, T. De Matteis, D. De Sensi, G. Mencagli, and M. Torquati, “P$^{3}$arsec: towards parallel patterns benchmarking,” in Proceedings of the 32nd annual acm symposium on applied computing, New York, NY, USA, 2017, pp. 1582-1589. doi:10.1145/3019612.3019745
    [Abstract] [BibTeX] [URL] [Download PDF] [Slides]
    @inproceedings{p3arsec:sac17,
      author     = {Danelutto, Marco and De Matteis, Tiziano and De Sensi, Daniele and Mencagli, Gabriele and Torquati, Massimo},
      title      = {{P$^{3}$ARSEC}: Towards Parallel Patterns Benchmarking},
      booktitle  = {Proceedings of the 32nd Annual {ACM} Symposium on Applied Computing},
      series     = {SAC '17},
      year       = {2017},
      isbn       = {978-1-4503-4486-9},
      location   = {Marrakesh, Morocco},
      pages      = {1582--1589},
      numpages   = {8},
      doi        = {10.1145/3019612.3019745},
      url        = {http://doi.acm.org/10.1145/3019612.3019745},
      acmid      = {3019745},
      publisher  = {ACM},
      address    = {New York, NY, USA},
      keywords   = {Parallel Patterns, PARSEC Benchmarks, Intel KNL},
      openaccess = {http://dl.acm.org/authorize?N34889},
      slides     = {https://docs.google.com/presentation/d/1tbGK13EGookcV1HvVbup2Rx1HlH65t4tsbhIuaoS3tA/edit#slide=id.g1b7a7fa945_0_14},
      pdf        = {http://pages.di.unipi.it/desensi/assets/pdf/2017_SAC.pdf},
      abstract   = {High-level parallel programming is a de-facto standard approach to develop parallel software with reduced time to development. High-level abstractions are provided by existing frameworks as pragma-based annotations in the source code, or through pre-built parallel patterns that recur frequently in parallel algorithms, and that can be easily instantiated by the programmer to add a structure to the development of parallel software. In this paper we focus on this second approach and we propose P3ARSEC, a benchmark suite for parallel pattern-based frameworks consisting of a representative subset of PARSEC applications. We analyse the programmability advantages and the potential performance penalty of using such high-level methodology with respect to hand-made parallelisations using low-level mechanisms. The results are obtained on the new Intel Knights Landing multicore, and show a significantly reduced code complexity with comparable performance.},
    }

  4. T. De Matteis and G. Mencagli, “Proactive elasticity and energy awareness in data stream processing,” Journal of systems and software, vol. 127, pp. 302-319, 2017. doi:10.1016/j.jss.2016.08.037
    [Abstract] [BibTeX] [URL]
    @article{dasp:jss17,
      author   = {De Matteis, Tiziano and Mencagli, Gabriele},
      title    = {Proactive elasticity and energy awareness in data stream processing},
      journal  = {Journal of Systems and Software},
      volume   = {127},
      pages    = {302--319},
      year     = {2017},
      issn     = {0164-1212},
      doi      = {10.1016/j.jss.2016.08.037},
      url      = {http://www.sciencedirect.com/science/article/pii/S0164121216301467},
      keywords = {Data stream processing, Elasticity, Model predictive control, Frequency scaling},
      abstract = {Data stream processing applications have a long running nature (24 hr/7 d) with workload conditions that may exhibit wide variations at run-time. Elasticity is the term coined to describe the capability of applications to change dynamically their resource usage in response to workload fluctuations. This paper focuses on strategies for elastic data stream processing targeting multicore systems. The key idea is to exploit Model Predictive Control, a control-theoretic method that takes into account the system behavior over a future time horizon in order to decide the best reconfiguration to execute. We design a set of energy-aware proactive strategies, optimized for throughput and latency QoS requirements, which regulate the number of used cores and the CPU frequency through the Dynamic Voltage and Frequency Scaling (DVFS) support offered by modern multicore CPUs. We evaluate our strategies in a high-frequency trading application fed by synthetic and real-world workload traces. We introduce specific properties to effectively compare different elastic approaches, and the results show that our strategies are able to achieve the best outcome.},
    }

  5. T. De Matteis and G. Mencagli, “Elastic scaling for distributed latency-sensitive data stream operators,” in Proceedings of the 25th euromicro international conference on parallel, distributed, and network-based processing, PDP 2017, 2017.
    [Abstract] [BibTeX] [Slides]
    @inproceedings{dasp:pdp17,
      author    = {De Matteis, Tiziano and Mencagli, Gabriele},
      title     = {Elastic Scaling for Distributed Latency-sensitive Data Stream Operators},
      booktitle = {Proceedings of the 25th Euromicro International Conference on Parallel, Distributed, and Network-Based Processing, {PDP} 2017},
      location  = {St. Petersburg, Russia},
      year      = {2017},
      slides    = {https://docs.google.com/presentation/d/1QwB0-7STgB6BF9q_GPJBf1lYjuiQ9FCPmvg-xWIJAGI/edit?usp=sharing},
      abstract  = {High-volume data streams are straining the limits of stream processing frameworks which need advanced parallel processing capabilities to withstand the actual incoming bandwidth. Parallel processing must be synergically integrated with elastic features in order to dynamically scale the amount of utilized resources by accomplishing the Quality of Service goals in a cost-effective manner. This paper proposes a control-theoretic strategy to drive the elastic behavior of latency-sensitive streaming operators in distributed environments. The strategy takes scaling decisions in advance by relying on a predictive model-based approach. Our ideas have been experimentally evaluated on a cluster using a real-world streaming application fed by synthetic and real datasets. The results show that our approach takes the strictly necessary reconfigurations while providing reduced resource consumption. Furthermore, it allows the operator to meet desired average latency requirements with a significant reduction in the experienced latency jitter.},
    }

  6. T. De Matteis and G. Mencagli, “Parallel patterns for window-based stateful operators on data streams: an algorithmic skeleton approach,” International journal of parallel programming, vol. 45, iss. 2, pp. 382-401, 2017. doi:10.1007/s10766-016-0413-x
    [Abstract] [BibTeX] [URL] [Slides]
    @article{DeMatteis2017,
      author   = {De Matteis, Tiziano and Mencagli, Gabriele},
      title    = {Parallel Patterns for Window-Based Stateful Operators on Data Streams: An Algorithmic Skeleton Approach},
      journal  = {International Journal of Parallel Programming},
      year     = {2017},
      month    = apr,
      day      = {01},
      volume   = {45},
      number   = {2},
      pages    = {382--401},
      issn     = {1573-7640},
      doi      = {10.1007/s10766-016-0413-x},
      url      = {https://doi.org/10.1007/s10766-016-0413-x},
      slides   = {https://docs.google.com/presentation/d/1yhsSff97f434wR-VA1szlqKxx52YMYKkdw1GVkBDyF8/edit?usp=sharing},
      abstract = {The topic of Data Stream Processing is a recent and highly active research area dealing with the in-memory, tuple-by-tuple analysis of streaming data. Continuous queries typically consume huge volumes of data received at a great velocity. Solutions that persistently store all the input tuples and then perform off-line computation are impractical. Rather, queries must be executed continuously as data cross the streams. The goal of this paper is to present parallel patterns for window-based stateful operators, which are the most representative class of stateful data stream operators. Parallel patterns are presented ``{\`a} la'' Algorithmic Skeleton, by explaining the rationale of each pattern, the preconditions to safely apply it, and the outcome in terms of throughput, latency and memory consumption. The patterns have been implemented in the FastFlow framework targeting off-the-shelf multicores. To the best of our knowledge this is the first time that a similar effort to merge the Data Stream Processing domain and the field of Structured Parallelism has been made.},
    }

  7. D. De Sensi, T. De Matteis, M. Torquati, G. Mencagli, and M. Danelutto, “Bringing parallel patterns out of the corner: the p$^{3}$arsec benchmark suite,” Acm trans. archit. code optim., vol. 14, iss. 4, p. 33:1–33:26, 2017. doi:10.1145/3132710
    [Abstract] [BibTeX] [URL] [Download PDF]
    @article{p3arsec:taco17,
      author     = {De Sensi, Daniele and De Matteis, Tiziano and Torquati, Massimo and Mencagli, Gabriele and Danelutto, Marco},
      title      = {Bringing Parallel Patterns Out of the Corner: The {P$^{3}$ARSEC} Benchmark Suite},
      journal    = {{ACM} Trans. Archit. Code Optim.},
      issue_date = {October 2017},
      volume     = {14},
      number     = {4},
      month      = oct,
      year       = {2017},
      issn       = {1544-3566},
      pages      = {33:1--33:26},
      articleno  = {33},
      numpages   = {26},
      doi        = {10.1145/3132710},
      url        = {http://doi.acm.org/10.1145/3132710},
      acmid      = {3132710},
      publisher  = {ACM},
      address    = {New York, NY, USA},
      keywords   = {Parallel patterns, algorithmic skeletons, benchmarking, multicore programming, parsec},
      openaccess = {http://dl.acm.org/authorize?N49996},
      pdf        = {http://pages.di.unipi.it/desensi/assets/pdf/2017_TACO.pdf},
      poster     = {http://pages.di.unipi.it/desensi/assets/img/2017_TACO.png},
      abstract   = {High-level parallel programming is an active research topic aimed at promoting parallel programming methodologies that provide the programmer with high-level abstractions to develop complex parallel software with reduced time to solution. Pattern-based parallel programming is based on a set of composable and customizable parallel patterns used as basic building blocks in parallel applications. In recent years, a considerable effort has been made in empowering this programming model with features able to overcome shortcomings of early approaches concerning flexibility and performance. In this article, we demonstrate that the approach is flexible and efficient enough by applying it on 12 out of 13 PARSEC applications. Our analysis, conducted on three different multicore architectures, demonstrates that pattern-based parallel programming has reached a good level of maturity, providing comparable results in terms of performance with respect to both other parallel programming methodologies based on pragma-based annotations (i.e., OpenMP and OmpSs) and native implementations (i.e., Pthreads). Regarding the programming effort, we also demonstrate a considerable reduction in lines of code and code churn compared to Pthreads and comparable results with respect to other existing implementations.},
    }

  8. G. Mencagli, M. Torquati, M. Danelutto, and T. De Matteis, “Parallel continuous preference queries over out-of-order and bursty data streams,” Ieee transactions on parallel and distributed systems, vol. 28, iss. 9, pp. 2608-2624, 2017. doi:10.1109/TPDS.2017.2679197
    [Abstract] [BibTeX]
    @article{tpds17,
      author   = {Mencagli, Gabriele and Torquati, Massimo and Danelutto, Marco and De Matteis, Tiziano},
      title    = {Parallel Continuous Preference Queries over Out-of-Order and Bursty Data Streams},
      journal  = {{IEEE} Transactions on Parallel and Distributed Systems},
      year     = {2017},
      month    = sep,
      volume   = {28},
      number   = {9},
      pages    = {2608--2624},
      doi      = {10.1109/TPDS.2017.2679197},
      issn     = {1045-9219},
      keywords = {data analysis;parallel architectures;query processing;scheduling;bursty data streams;data analytics;decision making;out-of-order arrivals;out-of-order data streams;parallel architectures;parallel continuous preference queries;parallel query model;scheduling strategies;traffic bursts;Computational modeling;Data models;Multicore processing;Out of order;Parallel processing;Real-time systems;Parallelism;burstiness and traffic surges;continuous preference queries;data streams;multicores;out-of-order arrivals;sliding windows},
      abstract = {Techniques to handle traffic bursts and out-of-order arrivals are of paramount importance to provide real-time sensor data analytics in domains like traffic surveillance, transportation management, healthcare and security applications. In these systems the amount of raw data coming from sensors must be analyzed by continuous queries that extract value-added information used to make informed decisions in real-time. To perform this task with timing constraints, parallelism must be exploited in the query execution in order to enable the real-time processing on parallel architectures. In this paper we focus on continuous preference queries, a representative class of continuous queries for decision making, and we propose a parallel query model targeting the efficient processing over out-of-order and bursty data streams. We study how to integrate punctuation mechanisms in order to enable out-of-order processing. Then, we present advanced scheduling strategies targeting scenarios with different burstiness levels, parameterized using the index of dispersion quantity. Extensive experiments have been performed using synthetic datasets and real-world data streams obtained from an existing real-time locating system. The experimental evaluation demonstrates the efficiency of our parallel solution and its effectiveness in handling the out-of-orderness degrees and burstiness levels of real-world applications.},
    }

  9. M. Torquati, G. Mencagli, M. Drocco, M. Aldinucci, T. De Matteis, and M. Danelutto, “On dynamic memory allocation in sliding-window parallel patterns for streaming analytics,” The journal of supercomputing, 2017. doi:10.1007/s11227-017-2152-1
    [Abstract] [BibTeX] [URL]
    @article{jsc17,
      author   = {Torquati, Massimo and Mencagli, Gabriele and Drocco, M. and Aldinucci, Marco and De Matteis, Tiziano and Danelutto, Marco},
      title    = {On dynamic memory allocation in sliding-window parallel patterns for streaming analytics},
      journal  = {The Journal of Supercomputing},
      year     = {2017},
      month    = sep,
      day      = {27},
      issn     = {1573-0484},
      doi      = {10.1007/s11227-017-2152-1},
      url      = {https://doi.org/10.1007/s11227-017-2152-1},
      abstract = {This work studies the issues related to dynamic memory management in Data Stream Processing, an emerging paradigm enabling the real-time processing of live data streams. In this paper, we consider two streaming parallel patterns and we discuss different implementation variants related to how dynamic memory is managed. The results show that the standard mechanisms provided by modern C++ are not entirely adequate for maximizing the performance. Instead, the combined use of an efficient general purpose memory allocator, a custom allocator optimized for the pattern considered and a custom variant of the C++ shared pointer mechanism, provides a performance improvement up to 16{\%} on the best case.},
    }

2016

  1. M. Danelutto, T. De Matteis, G. Mencagli, and M. Torquati, “Data stream processing via code annotations,” The journal of supercomputing, pp. 1-15, 2016. doi:10.1007/s11227-016-1793-9
    [Abstract] [BibTeX] [URL]
    @article{js2016,
      author   = {Danelutto, Marco and De Matteis, Tiziano and Mencagli, Gabriele and Torquati, Massimo},
      title    = {Data stream processing via code annotations},
      journal  = {The Journal of Supercomputing},
      year     = {2016},
      pages    = {1--15},
      issn     = {1573-0484},
      doi      = {10.1007/s11227-016-1793-9},
      url      = {http://dx.doi.org/10.1007/s11227-016-1793-9},
      abstract = {Time-to-solution is an important metric when parallelizing existing code. The REPARA approach provides a systematic way to instantiate stream and data parallel patterns by annotating the sequential source code with C++11 attributes. Annotations are automatically transformed in a target
    parallel code that uses existing libraries for parallel programming (e.g., FastFlow). In this paper, we apply this approach for the parallelization of a data stream processing application.
    The description shows the effectiveness of the approach in easily and quickly prototyping several parallel variants of the sequential code by obtaining good overall performance in terms of both throughput and latency.},
    }

  2. M. Danelutto, T. De Matteis, G. Mencagli, and M. Torquati, “A divide-and-conquer parallel pattern implementation for multicores,” in Proceedings of the 3rd international workshop on software engineering for parallel systems, New York, NY, USA, 2016, pp. 10-19. doi:10.1145/3002125.3002128
    [BibTeX] [URL]
    @inproceedings{DAC:2016:SEPS,
      author    = {Danelutto, Marco and De Matteis, Tiziano and Mencagli, Gabriele and Torquati, Massimo},
      title     = {A Divide-and-conquer Parallel Pattern Implementation for Multicores},
      booktitle = {Proceedings of the 3rd International Workshop on Software Engineering for Parallel Systems},
      series    = {SEPS 2016},
      location  = {Amsterdam, Netherlands},
      year      = {2016},
      pages     = {10--19},
      numpages  = {10},
      publisher = {ACM},
      address   = {New York, NY, USA},
      isbn      = {978-1-4503-4641-2},
      doi       = {10.1145/3002125.3002128},
      url       = {http://doi.acm.org/10.1145/3002125.3002128},
      acmid     = {3002128},
      keywords  = {Divide and Conquer, High-level parallel patterns}
    }

  3. T. De Matteis, “Parallel patterns for adaptive data stream processing,” PhD Thesis, 2016.
    [BibTeX] [URL]
    @phdthesis{dematteis_phd16,
      author = {De Matteis, Tiziano},
      title  = {Parallel Patterns for Adaptive Data Stream Processing},
      school = {University of Pisa},
      year   = {2016},
      url    = {https://etd.adm.unipi.it/theses/available/etd-09152016-145603/unrestricted/PhdThesis_DeMatteis.pdf}
    }

  4. T. De Matteis, S. Di Girolamo, and G. Mencagli, “Continuous skyline queries on multicore architectures,” Concurrency and computation: practice and experience, vol. 28, iss. 12, pp. 3503-3522, 2016. doi:10.1002/cpe.3866
    [Abstract] [BibTeX] [URL]
    @article{ccpe2016,
      author   = {De Matteis, Tiziano and Di Girolamo, Salvatore and Mencagli, Gabriele},
      title    = {Continuous Skyline Queries on Multicore Architectures},
      journal  = {Concurrency and Computation: Practice and Experience},
      year     = {2016},
      volume   = {28},
      number   = {12},
      pages    = {3503--3522},
      issn     = {1532-0634},
      doi      = {10.1002/cpe.3866},
      url      = {http://dx.doi.org/10.1002/cpe.3866},
      abstract = {The emergence of real-time decision-making applications in domains like high-frequency trading,
    emergency management and service level analysis in communication networks, has led to the definition
    of new classes of queries. Skyline queries are a notable example. Their results consist of all the tuples whose
    attribute vector is not dominated (in the Pareto sense) by one of any other tuple. Because of their popularity,
    skyline queries have been studied in terms of both sequential algorithms and parallel implementations for
    multiprocessors and clusters. Within the Data Stream Processing paradigm, traditional database queries
    on static relations have been revised in order to operate on continuous data streams. Most of the past
    papers propose sequential algorithms for continuous skyline queries, whereas there exist very few works
    targeting implementations on parallel machines. This paper contributes to fill this gap by proposing a parallel
    implementation for multicore architectures. We propose: i) a parallelization of the eager algorithm based on
    the notion of Skyline Influence Time, ii) optimizations of the reduce phase and load-balancing strategies to
    achieve near-optimal speedup, iii) a set of experiments with both synthetic benchmarks and a real dataset in
    order to show our implementation effectiveness.}
    }

  5. T. De Matteis and G. Mencagli, “Keep calm and react with foresight: strategies for low-latency and energy-efficient elastic data stream processing,” in Proceedings of the 21st acm sigplan symposium on principles and practice of parallel programming (ppopp), 2016, p. 13:1–13:12. doi:10.1145/2851141.2851148
    [Abstract] [BibTeX] [URL] [Download PDF] [Slides]
    @inproceedings{ppopp2016,
      author    = {De Matteis, Tiziano and Mencagli, Gabriele},
      title     = {Keep Calm and React with Foresight: Strategies for Low-Latency and Energy-Efficient Elastic Data Stream Processing},
      booktitle = {Proceedings of the 21st ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (PPoPP)},
      year      = {2016},
      pages     = {13:1--13:12},
      abstract  = {This paper addresses the problem of designing control strategies for elastic stream processing applications. Elasticity allows applications to rapidly change their configuration (e.g. the number of used resources) on-the-fly, in response to fluctuations of their workload. In this work we face this problem by adopting the Model Predictive Control technique, a control-theoretic method aimed at finding the optimal application configuration along a limited prediction horizon by solving an online optimization problem. Our control strategies are designed to address latency constraints, by using Queueing Theory models, and energy consumption by changing the number of used cores and the CPU frequency through the Dynamic Voltage and Frequency Scaling (DVFS) function of modern multi-core CPUs. The proactive capabilities, in addition to the latency- and energy-awareness, represent the novel features of our approach. Experiments performed using a high-frequency trading application show the effectiveness compared with state-of-the-art techniques.},
      articleno = {13},
      awards    = {The paper has passed the Artifact Evaluation},
      doi       = {10.1145/2851141.2851148},
      isbn      = {978-1-4503-4092-2},
      location  = {Barcelona, Spain},
      numpages  = {12},
      slides    = {https://docs.google.com/presentation/d/1VZ3y3RQDLFi_xA7Rl0Vj1iqBdoerxCMG4y53uMz9Ziw/edit?usp=sharing},
      url       = {http://doi.acm.org/10.1145/2851141.2851148}
    }

2015

  1. M. Danelutto, T. De Matteis, G. Mencagli, and M. Torquati, “Parallelizing high-frequency trading applications by using C++11 attributes,” in Proc. of intl. workshop on reengineering for parallelism in heterogeneous parallel platforms (repara), Helsinki, Finland, 2015, pp. 140-147. doi:10.1109/Trustcom.2015.623
    [Abstract] [BibTeX]
    @inproceedings{repara_ispa_15,
      author    = {Danelutto, Marco and De Matteis, Tiziano and Mencagli, Gabriele and Torquati, Massimo},
      title     = {Parallelizing High-Frequency Trading Applications by using {C++11} Attributes},
      booktitle = {Proc. of Intl. Workshop on Reengineering for Parallelism in Heterogeneous Parallel Platforms (RePara)},
      year      = {2015},
      month     = aug,
      pages     = {140--147},
      publisher = {IEEE},
      address   = {Helsinki, Finland},
      doi       = {10.1109/Trustcom.2015.623},
      keywords  = {fastflow, repara},
      abstract  = {With the wide diffusion of parallel architectures parallelism has become an indispensable factor in the application design. However, the cost of the parallelization process of existing applications is still too high in terms of time-to-development, and often requires a large effort and expertise by the programmer. The REPARA methodology consists in a systematic way to express parallel patterns by annotating the source code using C++11 attributes transformed automatically in a target parallel code based on parallel programming libraries (e.g. FastFlow, Intel TBB). In this paper we apply this approach in the parallelization of a real high-frequency trading application. The description shows the effectiveness of the approach in easily prototyping several parallel variants of the same code. We also propose an extension of a REPARA attribute to express a user-defined scheduling strategy, which makes it possible to design a high-throughput and low-latency parallelization of our code outperforming the other parallel variants in most of the considered test-cases.}
    }

  2. T. De Matteis, S. Di Girolamo, and G. Mencagli, “A multicore parallelization of continuous skyline queries on data streams,” in Proceedings of the 2015 international conference on parallel processing (euro-par), Vienna, Austria, 2015, pp. 402-413. doi:10.1007/978-3-662-48096-0_31
    [Abstract] [BibTeX] [Slides]
    @inproceedings{europar2015,
      author    = {De Matteis, Tiziano and Di Girolamo, Salvatore and Mencagli, Gabriele},
      title     = {A Multicore Parallelization of Continuous Skyline Queries on Data Streams},
      booktitle = {Proceedings of the 2015 International Conference on Parallel Processing (Euro-Par)},
      year      = {2015},
      pages     = {402--413},
      address   = {Vienna, Austria},
      abstract  = {Skyline queries are a relevant example of preference queries frequently used in multi-criteria decision making to retrieve interesting points from large datasets. They return the points whose attribute vector is not dominated by any other point. Due to their importance in real-time scenarios, skyline queries have been studied both in terms of sequential algorithms and parallel implementations for multiprocessors and clusters. Recently, with the advent of the Data Stream Processing paradigm, skyline queries have been computed over continuous data streams according to the sliding window model. Although sequential algorithms have been proposed for continuous skyline queries, few works targeting modern parallel architectures exist. This paper contributes to the current
    literature by proposing a parallel implementation on multicores. We provide a description of our parallelization by focusing on the cooperation pattern between parallel functionalities, optimizations related to the reduce phase, and load-balancing strategies. Finally, we show experiments using different point distributions, arrival rates and window lengths.},
      doi       = {10.1007/978-3-662-48096-0_31},
      isbn      = {978-3-662-48095-3},
      slides    = {https://docs.google.com/presentation/d/1JQVn9QnLC15e_MhmNOttP3mohray_sulAO532PJqOy4/edit?usp=sharing}
    }

2014

  1. D. Buono, T. De Matteis, and G. Mencagli, “A high-throughput and low-latency parallelization of window-based stream joins on multicores,” in 12th ieee international symposium on parallel and distributed processing with applications, Milano, Italy, 2014, pp. 117-126. doi:10.1109/ISPA.2014.24
    [BibTeX] [URL]
    @inproceedings{ispa2014,
      author    = {Buono, Daniele and De Matteis, Tiziano and Mencagli, Gabriele},
      title     = {A High-Throughput and Low-Latency Parallelization of Window-based Stream Joins on Multicores},
      booktitle = {12th IEEE International Symposium on Parallel and Distributed Processing with Applications},
      year      = {2014},
      pages     = {117--126},
      numpages  = {10},
      publisher = {IEEE Computer Society},
      address   = {Milano, Italy},
      isbn      = {978-1-4799-4293-0},
      doi       = {10.1109/ISPA.2014.24},
      url       = {http://dx.doi.org/10.1109/ISPA.2014.24},
      acmid     = {2681942}
    }

  2. D. Buono, M. Danelutto, T. De Matteis, G. Mencagli, and M. Torquati, “A lightweight run-time support for fast dense linear algebra on multi-core,” in Proc. of the 12th international conference on parallel and distributed computing and networks (pdcn 2014), 2014.
    [BibTeX]
    @inproceedings{ff:ffmdf:pdcn:14,
      author        = {Buono, Daniele and Danelutto, Marco and De Matteis, Tiziano and Mencagli, Gabriele and Torquati, Massimo},
      title         = {A Lightweight Run-Time Support For Fast Dense Linear Algebra on Multi-Core},
      booktitle     = {Proc. of the 12th International Conference on Parallel and Distributed Computing and Networks (PDCN 2014)},
      year          = {2014},
      month         = feb,
      publisher     = {IASTED, ACTA press},
      keywords      = {fastflow},
      date-modified = {2015-02-01 16:49:46 +0000}
    }

  3. D. Buono, T. De Matteis, G. Mencagli, and M. Vanneschi, “Optimizing message-passing on multicore architectures using hardware multi-threading,” in Parallel, distributed and network-based processing (pdp), 2014 22nd euromicro international conference on, Torino, Italy, 2014, pp. 262-270. doi:10.1109/PDP.2014.63
    [Abstract] [BibTeX]
    @inproceedings{pdp2014,
      author    = {Buono, Daniele and De Matteis, Tiziano and Mencagli, Gabriele and Vanneschi, Marco},
      title     = {Optimizing Message-Passing on Multicore Architectures Using Hardware Multi-threading},
      booktitle = {Parallel, Distributed and Network-Based Processing (PDP), 2014 22nd Euromicro International Conference on},
      year      = {2014},
      pages     = {262--270},
      address   = {Torino, Italy},
      abstract  = {Shared-memory and message-passing are two opposite models to develop parallel computations. The shared-memory model, adopted by existing frameworks such as OpenMP, represents a de-facto standard on multi-/many-core architectures. However, message-passing deserves to be studied for its inherent properties in terms of portability and flexibility as well as for its better ease of debugging. Achieving good performance from the use of messages in shared-memory architectures requires an efficient implementation of the run-time support. This paper investigates the definition of a delegation mechanism on multi-threaded architectures able to: (i) overlap communications with calculation phases; (ii) parallelize distribution and collective operations. Our ideas have been exemplified using two parallel benchmarks on the Intel Phi, showing that in these applications our message-passing support outperforms MPI and reaches similar performance compared to standard OpenMP implementations.},
      doi       = {10.1109/PDP.2014.63},
      issn      = {1066-6192}
    }

  4. T. De Matteis, “Autonomic parallel data stream processing,” in High performance computing simulation (hpcs), 2014 international conference on, Bologna, Italy, 2014, pp. 995-998. doi:10.1109/HPCSim.2014.6903797
    [BibTeX]
    @inproceedings{hpcs2014,
      author    = {De Matteis, Tiziano},
      title     = {Autonomic Parallel Data Stream Processing},
      booktitle = {High Performance Computing Simulation (HPCS), 2014 International Conference on},
      year      = {2014},
      month     = jul,
      pages     = {995--998},
      address   = {Bologna, Italy},
      doi       = {10.1109/HPCSim.2014.6903797}
    }

2013

  1. T. De Matteis, F. Luporini, G. Mencagli, and M. Vanneschi, “Evaluation of architectural supports for fine-grained synchronization mechanisms,” in Proceedings of the 11th iasted international conference on parallel and distributed computing and networks, Innsbruck, Austria, 2013.
    [Abstract] [BibTeX]
    @inproceedings{pdcn2013,
      author    = {De Matteis, Tiziano and Luporini, Fabio and Mencagli, Gabriele and Vanneschi, Marco},
      title     = {Evaluation of Architectural Supports for Fine-Grained Synchronization Mechanisms},
      booktitle = {Proceedings of the 11th IASTED International Conference on Parallel and Distributed Computing and Networks},
      year      = {2013},
      address   = {Innsbruck, Austria},
      publisher = {IASTED},
      abstract  = {The advent of multi-/many-core architectures demands efficient run-time supports to sustain parallel applications scalability. Synchronization mechanisms should be optimized in order to account for different scenarios, such as the interaction between threads executed on different cores as well as intra-core synchronization, i.e. involving threads executed on hardware contexts of the same core. In this perspective, we describe the design issues of two notable mechanisms for shared-memory parallel computations. We point out how specific architectural supports, like hardware cache coherence and core-to-core interconnection networks, make it possible to design optimized implementations of such mechanisms. In this paper we discuss experimental results on three representative architectures: a flagship Intel multi-core and two interesting network processors. The final result helps to untangle the complex implementation space of synchronization mechanisms.},
      isbn      = {978-0-88986-943-1}
    }