Filgueras, A; Vidal, M; Mateu, M; Jiménez-González, D; Álvarez, C; Martorell, X; Ayguadé, E; Theodoropoulos, D; Pnevmatikatos, D; Gai, P; Garzarella, S; Oro, D; Hernando, J; Bettin, N; Pomella, A; Procaccini, M; Giorgi, R
The AXIOM Project: IoT on Heterogeneous Embedded Platforms Journal Article
In: IEEE Design and Test, vol. pre-print, pp. 1-6, 2019, ISSN: 2168-2356.
% "pre-print" used to sit in the volume field; it is a status qualifier, not a
% volume number, so it is recorded in note instead.
@article{Filgueras19-ieee_dnt,
  author = {Filgueras, A and Vidal, M and Mateu, M and Jim{\'e}nez-Gonz{\'a}lez, D and {\'A}lvarez, C and Martorell, X and Ayguad{\'e}, E and Theodoropoulos, D and Pnevmatikatos, D and Gai, P and Garzarella, S and Oro, D and Hernando, J and Bettin, N and Pomella, A and Procaccini, M and Giorgi, R},
  title = {The {AXIOM} Project: {IoT} on Heterogeneous Embedded Platforms},
  journal = {IEEE Design and Test},
  pages = {1--6},
  year = {2019},
  date = {2019-11-01},
  doi = {10.1109/MDAT.2019.2952335},
  issn = {2168-2356},
  note = {pre-print},
  abstract = {The AXIOM project aims at providing an environment for Cyber-Physical Systems. Smart Video Surveillance targets public environments, involving real-time face detection in crowds. Smart Home Living targets home environments and access control. These applications are used as experimental usecases for the AXIOM platform, currently based on the Xilinx Zynq-7000 SoCs. We have integrated the Xilinx Vivado HLS tool for the FPGA support within the OmpSs programming model, to enable OpenMP-like programming in the FPGA. This paper presents the programming environment, and the evaluation of the most computationally expensive parts of the target applications.},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}
Giorgi, Roberto; Khalili, Farnam; Procaccini, Marco
Translating Timing into an Architecture: the Synergy of COTSon and HLS (Domain Expertise: Designing a Computer Architecture via HLS) Journal Article
In: International Journal of Reconfigurable Computing, pp. 1–18, 2019, ISSN: 1687-7209.
% The journal name used to be stored in booktitle, which article entries ignore;
% it now lives in the required journal field.
@article{Giorgi19-ijrc,
  author = {Giorgi, Roberto and Khalili, Farnam and Procaccini, Marco},
  title = {Translating Timing into an Architecture: the Synergy of {COTSon} and {HLS} (Domain Expertise: Designing a Computer Architecture via {HLS})},
  journal = {International Journal of Reconfigurable Computing},
  pages = {1--18},
  address = {London, UK},
  year = {2019},
  date = {2019-09-01},
  doi = {10.1155/2019/2624938},
  issn = {1687-7209},
  abstract = {Translating a system requirement into a low-level representation (e.g., register transfer level or RTL) is the typical goal of the design of FPGA based systems. However, the Design Space Exploration (DSE) needed to identify the final architecture may be time consuming, even when using High Level Synthesis (HLS) tools.
In this paper, we illustrate our hybrid methodology, which uses a frontend to HLS so that the DSE is performed more rapidly by using a higher-level abstraction, but without loosing accuracy, thanks to the HP-Labs COTSon simulation infrastructure in combination with our DSE tools (MYDSE tools). In particular, this proposed methodology proved useful to achieve appropriate design of a whole system in a shorter time than trying to design everything directly in HLS.
Our motivating problem was to deploy a novel execution model called Data-Flow Threads (DF-Threads) running on yet to be designed hardware. For that goal, directly using the HLS was too premature in the design cycle. Therefore, a key point of our methodology consists in defining the first prototype in our simulation framework and gradually migrating the design into the Xilinx HLS after validating the key performance metrics of our novel system in the simulator.
To explain this workflow, we first use a simple driving example consisting in the modelling of a two-way associative cache. Then, we explain how we generalized this methodology and describe the types of results that we were able to analyze in the AXIOM project, that helped us reduce the development time from months/weeks to days/hours.},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}
In this paper, we illustrate our hybrid methodology, which uses a frontend to HLS so that the DSE is performed more rapidly by using a higher-level abstraction, but without loosing accuracy, thanks to the HP-Labs COTSon simulation infrastructure in combination with our DSE tools (MYDSE tools). In particular, this proposed methodology proved useful to achieve appropriate design of a whole system in a shorter time than trying to design everything directly in HLS.
Our motivating problem was to deploy a novel execution model called Data-Flow Threads (DF-Threads) running on yet to be designed hardware. For that goal, directly using the HLS was too premature in the design cycle. Therefore, a key point of our methodology consists in defining the first prototype in our simulation framework and gradually migrating the design into the Xilinx HLS after validating the key performance metrics of our novel system in the simulator.
To explain this workflow, we first use a simple driving example consisting in the modelling of a two-way associative cache. Then, we explain how we generalized this methodology and describe the types of results that we were able to analyze in the AXIOM project, that helped us reduce the development time from months/weeks to days/hours.
Giorgi, Roberto; Procaccini, Marco
Bridging a Data-Flow Execution Model to a Simple Programming Model Proceedings Article
In: IEEE Proc. of the International Conference on High Performance Computing and Simulation (HPCS), pp. 165-168, Dublin, Ireland, 2019, ISBN: 978-1-7281-4484-9.
% NOTE(review): this record describes the same work as Giorgi19-hpcs (same DOI,
% identical fields). Both keys are kept so existing citations keep resolving;
% consolidate under one key when the citing documents can be updated.
@inproceedings{Giorgi19-hpcsb,
  author = {Giorgi, Roberto and Procaccini, Marco},
  title = {Bridging a Data-Flow Execution Model to a Simple Programming Model},
  booktitle = {IEEE Proc. of the International Conference on High Performance Computing and Simulation (HPCS)},
  pages = {165--168},
  address = {Dublin, Ireland},
  year = {2019},
  date = {2019-07-01},
  doi = {10.1109/HPCS48598.2019.9188183},
  isbn = {978-1-7281-4484-9},
  abstract = {Starting from a Data-Flow execution model called ``DF-Threads'',
we defined a minimalistic API to enable an efficient implementation in the hardware
of the distribution of the threads across the cores of a single multi-core system
and across the remote cores of a cluster.
We aim at proposing this API as a simple programming model in C language
that can potentially permit an easy interface between DF-Threads and generic programming models.
Clusters are typically programmed with MPI, therefore we evaluated
our approach against OpenMPI. If we consider the delivered GFLOPS per core,
DF-Threads are also competitive in respect to CUDA. In the basic examples, that we used
in this initial investigation, DF-Threads achieve better performance-per-core
compared to OpenMPI and CUDA. In particular, OpenMPI has a large portion of OS-kernel
activity, which is slowing down its performance.},
  keywords = {},
  pubstate = {published},
  tppubtype = {inproceedings}
}
we defined a minimalistic API to enable an efficient implementation in the hardware
of the distribution of the threads across the cores of a single multi-core system
and across the remote cores of a cluster.
We aim at proposing this API as a simple programming model in C language
that can potentially permit an easy interface between DF-Threads and generic programming models.
Clusters are typically programmed with MPI, therefore we evaluated
our approach against OpenMPI. If we consider the delivered GFLOPS per core,
DF-Threads are also competitive in respect to CUDA. In the basic examples, that we used
in this initial investigation, DF-Threads achieve better performance-per-core
compared to OpenMPI and CUDA. In particular, OpenMPI has a large portion of OS-kernel
activity, which is slowing down its performance.
Giorgi, Roberto; Procaccini, Marco
Bridging a Data-Flow Execution Model to a Simple Programming Model Proceedings Article
In: IEEE Proc. of the International Conference on High Performance Computing and Simulation (HPCS), pp. 165-168, Dublin, Ireland, 2019, ISBN: 978-1-7281-4484-9.
% NOTE(review): this record describes the same work as Giorgi19-hpcsb (same DOI,
% identical fields). Both keys are kept so existing citations keep resolving;
% consolidate under one key when the citing documents can be updated.
@inproceedings{Giorgi19-hpcs,
  author = {Giorgi, Roberto and Procaccini, Marco},
  title = {Bridging a Data-Flow Execution Model to a Simple Programming Model},
  booktitle = {IEEE Proc. of the International Conference on High Performance Computing and Simulation (HPCS)},
  pages = {165--168},
  address = {Dublin, Ireland},
  year = {2019},
  date = {2019-07-01},
  doi = {10.1109/HPCS48598.2019.9188183},
  isbn = {978-1-7281-4484-9},
  abstract = {Starting from a Data-Flow execution model called ``DF-Threads'',
we defined a minimalistic API to enable an efficient implementation in the hardware
of the distribution of the threads across the cores of a single multi-core system
and across the remote cores of a cluster.
We aim at proposing this API as a simple programming model in C language
that can potentially permit an easy interface between DF-Threads and generic programming models.
Clusters are typically programmed with MPI, therefore we evaluated
our approach against OpenMPI. If we consider the delivered GFLOPS per core,
DF-Threads are also competitive in respect to CUDA. In the basic examples, that we used
in this initial investigation, DF-Threads achieve better performance-per-core
compared to OpenMPI and CUDA. In particular, OpenMPI has a large portion of OS-kernel
activity, which is slowing down its performance.},
  keywords = {},
  pubstate = {published},
  tppubtype = {inproceedings}
}
we defined a minimalistic API to enable an efficient implementation in the hardware
of the distribution of the threads across the cores of a single multi-core system
and across the remote cores of a cluster.
We aim at proposing this API as a simple programming model in C language
that can potentially permit an easy interface between DF-Threads and generic programming models.
Clusters are typically programmed with MPI, therefore we evaluated
our approach against OpenMPI. If we consider the delivered GFLOPS per core,
DF-Threads are also competitive in respect to CUDA. In the basic examples, that we used
in this initial investigation, DF-Threads achieve better performance-per-core
compared to OpenMPI and CUDA. In particular, OpenMPI has a large portion of OS-kernel
activity, which is slowing down its performance.
Giorgi, Roberto; Bettin, Nicola; Ermini, Sara; Montefoschi, Francesco; Rizzo, Antonio
An Iris+Voice Recognition System for a Smart Doorbell Proceedings Article
In: IEEE 8th Mediterranean Conference on Embedded Computing (MECO), pp. 419-422, 2019, ISSN: 2377-5475.
% NOTE(review): the first author used to read "Bettin Nicola Giorgi Roberto", i.e.
% two people fused into one name. It is split here into Roberto Giorgi and Nicola
% Bettin, with Giorgi first to match the citation key; confirm the order against the paper.
@inproceedings{Giorgi19-mecoiris,
  author = {Giorgi, Roberto and Bettin, Nicola and Ermini, Sara and Montefoschi, Francesco and Rizzo, Antonio},
  title = {An Iris+Voice Recognition System for a Smart Doorbell},
  booktitle = {IEEE 8th Mediterranean Conference on Embedded Computing (MECO)},
  pages = {419--422},
  year = {2019},
  date = {2019-06-01},
  doi = {10.1109/MECO.2019.8760187},
  issn = {2377-5475},
  abstract = {In this paper, we describe our methodology for designing a smart doorbell system for the homes.
While the recent trend of big companies is to offer a home voice assistant, which can integrate
all possible services, including the recognition of the owner (or authorized people) at the house door,
privacy concerns and independence from a single service provider are requiring more freedom in the
choice of the ``smart objects'' that surround us.
The doorbell system is using both iris and voice recognition to verify the identity of the user
who rings at the door. Since there is the involvement of biometric data, this information has to be properly handled.
In particular, we designed our system in such a way that it can avoid to send or store any biometric data
to the cloud. Machine-learning algorithms are used to perform local computations, thus implementing
Edge-Computing analytics to determine the identity of the user, by combining both voice and iris biometrics.
The system is implemented on reconfigurable hardware in order to accelerate some of the most intensive
tasks and achieve enough performance at a reasonable power consumption.
Our tests confirm that, by using our architecture, the performance is about 5x the sequential case
and, at the same time, we reach about 7x less energy consumption.},
  keywords = {},
  pubstate = {published},
  tppubtype = {inproceedings}
}
While the recent trend of big companies is to offer a home voice assistant, which can integrate
all possible services, including the recognition of the owner (or authorized people) at the house door,
privacy concerns and independence from a single service provider are requiring more freedom in the
choice of the ``smart objects'' that surround us.
The doorbell system is using both iris and voice recognition to verify the identity of the user
who rings at the door. Since there is the involvement of biometric data, this information has to be properly handled.
In particular, we designed our system in such a way that it can avoid to send or store any biometric data
to the cloud. Machine-learning algorithms are used to perform local computations, thus implementing
Edge-Computing analytics to determine the identity of the user, by combining both voice and iris biometrics.
The system is implemented on reconfigurable hardware in order to accelerate some of the most intensive
tasks and achieve enough performance at a reasonable power consumption.
Our tests confirm that, by using our architecture, the performance is about 5x the sequential case
and, at the same time, we reach about 7x less energy consumption.
Giorgi, Roberto; Oro, David; Ermini, Sara; Montefoschi, Francesco; Rizzo, Antonio
Embedded Face Analysis for Smart Videosurveillance Proceedings Article
In: IEEE 8th Mediterranean Conference on Embedded Computing (MECO), pp. 403-407, 2019, ISSN: 2377-5475.
% NOTE(review): the first author used to read "Oro David Giorgi Roberto", i.e. two
% people fused into one name. It is split here into Roberto Giorgi and David Oro,
% with Giorgi first to match the citation key; confirm the order against the paper.
% NOTE(review): the abstract below describes an FFT optimisation and does not match
% this title (face analysis for video surveillance); it looks pasted from another
% record. It is left untouched here; replace it with the paper's real abstract.
@inproceedings{Giorgi19-mecoherta,
  author = {Giorgi, Roberto and Oro, David and Ermini, Sara and Montefoschi, Francesco and Rizzo, Antonio},
  title = {Embedded Face Analysis for Smart Videosurveillance},
  booktitle = {IEEE 8th Mediterranean Conference on Embedded Computing (MECO)},
  pages = {403--407},
  year = {2019},
  date = {2019-06-01},
  doi = {10.1109/MECO.2019.8760200},
  issn = {2377-5475},
  abstract = {The native implementation of the N-point digital Fourier Transform involves calculating the scalar product of the sample buffer (treated as an N-dimensional vector) with N separate basis vectors. Since each scalar product involves N multiplications and N additions, the total time is proportional to (N^2), in other words, it’s an (O(N^2)) algorithm. However, it turns out that by cleverly re-arranging these operations, one can optimize the algorithm down to (O(N log_2 (N))), which for large N makes a huge difference. The optimized version of the algorithm is called the Fast Fourier Transform, or the FFT.
In this paper, we discuss about an efficient way to obtain Fast Fourier Transform algorithm (FFT).
According to our study, we can eliminate some operations in calculating the FFT algorithm thanks to property of complex numbers and we can achieve the FFT in a better execution time due to a significant reduction of $N/8$ of the needed twiddle factors and to additional factorizations.},
  keywords = {},
  pubstate = {published},
  tppubtype = {inproceedings}
}
In this paper, we discuss about an efficient way to obtain Fast Fourier Transform algorithm (FFT).
According to our study, we can eliminate some operations in calculating the FFT algorithm thanks to property of complex numbers and we can achieve the FFT in a better execution time due to a significant reduction of $N/8$ of the needed twiddle factors and to additional factorizations.
Giorgi, Roberto; Khalili, Farnam; Procaccini, Marco
AXIOM: A Scalable, Efficient and Reconfigurable Embedded Platform Proceedings Article
In: IEEE Proceedings of Design, Automation and Test in Europe (DATE), pp. 1–6, Florence, Italy, 2019, ISBN: 978-3-9819263-3-0.
@inproceedings{Giorgi19-dateb,
  author = {Giorgi, Roberto and Khalili, Farnam and Procaccini, Marco},
  title = {{AXIOM}: A Scalable, Efficient and Reconfigurable Embedded Platform},
  booktitle = {IEEE Proceedings of Design, Automation and Test in Europe (DATE)},
  pages = {1--6},
  address = {Florence, Italy},
  year = {2019},
  date = {2019-03-01},
  doi = {10.23919/DATE.2019.8715168},
  isbn = {978-3-9819263-3-0},
  abstract = {Cyber-Physical Systems (CPSs) are becoming widely used in every application that requires interaction between humans and the physical environment. People expect this interaction to happen in real-time and this creates pressure onto system designs due to the ever-higher demand for data processing in the shortest possible and predictable time. Additionally, easy programmability, energy efficiency, and modular scalability are also important to ensure these systems to become widespread. All these requirements push new scientific and technological challenges towards the engineering community. The AXIOM project (Agile, eXtensible, fast I/O Module), presented in this paper, introduces a new hardware-software platform for CPS, which can provide an easy parallel programming model and fast connectivity, in order to scale-up performance by adding multiple boards. The AXIOM platform consists of a custom board based on a Xilinx Zynq Ultrascale+ ZU9EG SoC including
four 64-bit ARM cores, the Arduino socket and four high-speed (up to 18 Gbps) connectors on USB-C receptacles. By relying on this hardware, DF-Threads, a novel execution model based on
dataflow modality, has been developed and tested. In this paper, we highlight some major conclusions of the AXIOM project, such as the gain in performance compared to other parallel programming models such as OpenMPI and Cilk.},
  keywords = {},
  pubstate = {published},
  tppubtype = {inproceedings}
}
four 64-bit ARM cores, the Arduino socket and four high-speed (up to 18 Gbps) connectors on USB-C receptacles. By relying on this hardware, DF-Threads, a novel execution model based on
dataflow modality, has been developed and tested. In this paper, we highlight some major conclusions of the AXIOM project, such as the gain in performance compared to other parallel programming models such as OpenMPI and Cilk.
Giorgi, Roberto; Khalili, Farnam; Procaccini, Marco
Analyzing the Impact of Operating System Activity of different Linux Distributions in a Distributed Environment Proceedings Article
In: IEEE Euromicro International Conference on Parallel, Distributed, and Network-Based Processing, pp. 422-429, Pavia, Italy, 2019, ISBN: 978-1-7281-1644-0.
@inproceedings{Giorgi19-pdp,
  author = {Giorgi, Roberto and Khalili, Farnam and Procaccini, Marco},
  title = {Analyzing the Impact of Operating System Activity of Different {Linux} Distributions in a Distributed Environment},
  booktitle = {IEEE Euromicro International Conference on Parallel, Distributed, and Network-Based Processing},
  pages = {422--429},
  address = {Pavia, Italy},
  year = {2019},
  date = {2019-02-01},
  doi = {10.1109/EMPDP.2019.8671562},
  isbn = {978-1-7281-1644-0},
  abstract = {A rise in the number of threads in large-scale applications running on multi-node architectures makes operating system activity increasingly more relevant. Therefore, evaluation methodologies need to account for these activities. We decided to build our evaluation environment through the COTSon simulator. Moreover, our environment permits flexible Design Space Exploration (DSE) by making easy the management of many experiments and the characterizations of Operating System (OS) activity. In this paper, we show the result analysis tool flow and the OS impact of different Linux distributions running on a distributed environment consisting of several nodes with a full OS. In order to quantify our results, we use matrix multiplication benchmark executed through a DataFlow model, named DataFlow Threads(DF-Threads). We analyze key metrics like L2 cache miss rate, execution cycles, data access latency, and kernel cycles showing up to 60\% performance variations among the different OS distributions.},
  keywords = {},
  pubstate = {published},
  tppubtype = {inproceedings}
}
Giorgi, Roberto; Khalili, Farnam; Procaccini, Marco
A Design Space Exploration Tool Set for Future 1K-core High-Performance Computers Proceedings Article
In: ACM Workshop on Rapid Simulation and Performance Evaluation: Methods and Tools (RAPIDO), pp. 1–6, Valencia, Spain, 2019, ISBN: 978-1-4503-6260-3.
@inproceedings{Giorgi19-rapido,
  author = {Giorgi, Roberto and Khalili, Farnam and Procaccini, Marco},
  title = {A Design Space Exploration Tool Set for Future {1K-core} High-Performance Computers},
  booktitle = {ACM Workshop on Rapid Simulation and Performance Evaluation: Methods and Tools (RAPIDO)},
  pages = {1--6},
  address = {Valencia, Spain},
  year = {2019},
  date = {2019-01-01},
  doi = {10.1145/3300189.3300195},
  isbn = {978-1-4503-6260-3},
  abstract = {Given the constantly growing complexity of multi-core architectures, Design Space Exploration (DSE) tools play an important role to evaluate different design options. In this paper, we present a DSE toolset targeting massively parallelized HW/SW architectures with a high degree of flexibility in order to successfully simulate multi-core-multi-node platforms. Our DSE tools provide a rapid and simple-to-use work-flow to easily retrieve and analyze the key metrics and eventually evaluate the design. We examine the DSE toolset and methodology while performing several simulations of a
general purpose 1K-core architecture and evaluate not only standard metrics like the L2 cache miss rates, but also operating system activity and its impact. We leverage the knowledge gained in our methodology to develop and evaluate a novel dataflow execution model named ``DataFlow-Threads'' (DF-Threads). We validated the outcomes of the simulator against an equivalent FPGA-based design.},
  keywords = {},
  pubstate = {published},
  tppubtype = {inproceedings}
}
general purpose 1K-core architecture and evaluate not only standard metrics like the L2 cache miss rates, but also operating system activity and its impact. We leverage the knowledge gained in our methodology to develop and evaluate a novel dataflow execution model named “DataFlow-Threads” (DF-Threads). We validated the outcomes of the simulator against an equivalent FPGA-based design.
Giorgi, Roberto; Khalili, Farnam; Procaccini, Marco
Energy Efficiency Exploration on the ZYNQ Ultrascale+ Proceedings Article
In: IEEE Proceedings of the 30th International Conference on Microelectronics (ICM), pp. 52-55, Sousse, Tunisia, 2018, ISBN: 978-1-5386-8166-4.
@inproceedings{Giorgi18-icm,
  author = {Giorgi, Roberto and Khalili, Farnam and Procaccini, Marco},
  title = {Energy Efficiency Exploration on the {ZYNQ} {Ultrascale+}},
  booktitle = {IEEE Proceedings of the 30th International Conference on Microelectronics (ICM)},
  pages = {52--55},
  address = {Sousse, Tunisia},
  year = {2018},
  date = {2018-12-01},
  isbn = {978-1-5386-8166-4},
  abstract = {In the context of Cyber-Physical Systems (CPSs), Single Board Computers (SBCs) could provide adaptivity for various present and future applications, and permit scalability through clusters of SBCs while possibly save energy consumption. In this paper, we explore energy efficiency of a Zynq Ultrascale+ based board developed in the context of the AXIOM project. While an entire framework based on the Zynq Ultrascale+ is still in progress, the board is already available and capable of running a full Linux OS and it is possible to measure energy consumption. We demonstrate a possible architecture based on DataFlow-Threads (DF-Threads), a novel execution model, on the Zynq Ultrascale+ platform, in order to assess the energy efficiency of DF-Threads. We measured the power consumption, while the RAW and RDMA message types were transceived through board-to-board interconnects.},
  keywords = {},
  pubstate = {published},
  tppubtype = {inproceedings}
}
Rizzo, Antonio; Caporali, Maurizio; Montefoschi, Francesco; Ermini, Sara; Oro, David; Hupont, Isabelle; Bettin, Nicola
Prototyping Edge Computing Services for IoT Unpublished Forthcoming
Forthcoming.
% NOTE(review): the editor field used to hold the tag "ecce2018", which is not a
% person's name and would be parsed as one. It is presumably the target venue
% (ECCE 2018?); confirm. The value is preserved in internal-note, a field styles ignore.
@unpublished{Rizzo2018,
  author = {Rizzo, Antonio and Caporali, Maurizio and Montefoschi, Francesco and Ermini, Sara and Oro, David and Hupont, Isabelle and Bettin, Nicola},
  title = {Prototyping Edge Computing Services for {IoT}},
  note = {Forthcoming},
  year = {2018},
  date = {2018-09-05},
  internal-note = {ecce2018 (value formerly stored in the editor field)},
  keywords = {},
  pubstate = {forthcoming},
  tppubtype = {unpublished}
}
Xu, Ying hao; Vidal, Miquel; Arejita, Beñat; Diaz, Javier; Alvarez, Carlos; Jiménez-González, Daniel; Martorell, Xavier; Mantovani, Filippo
Implementation of the K-Means Algorithm on Heterogeneous Devices: A Use Case Based on an Industrial Dataset Proceedings Article
In: Advances in Parallel Computing, pp. 642-651, IOS Press, 2018, ISSN: 0927-5452.
% The first author is written surname-first so that the lowercase "hao" is no
% longer parsed as a von-particle. The citation key is left unchanged.
% NOTE(review): 0927-5452 has ISSN form (8 digits), so it moved from isbn to issn.
% The isbn value is the 13-digit prefix embedded in the DOI; confirm with the publisher.
@inproceedings{haoXu2018,
  author = {Xu, Ying hao and Vidal, Miquel and Arejita, Be{\~n}at and Diaz, Javier and Alvarez, Carlos and Jim{\'e}nez-Gonz{\'a}lez, Daniel and Martorell, Xavier and Mantovani, Filippo},
  title = {Implementation of the {K-Means} Algorithm on Heterogeneous Devices: A Use Case Based on an Industrial Dataset},
  booktitle = {Advances in Parallel Computing},
  pages = {642--651},
  publisher = {IOS Press},
  year = {2018},
  date = {2018-01-01},
  doi = {10.3233/978-1-61499-843-3-642},
  isbn = {978-1-61499-843-3},
  issn = {0927-5452},
  url = {https://upcommons.upc.edu/handle/2117/114842},
  abstract = {This paper presents and analyzes a heterogeneous implementation of an industrial use case based on K-means that targets symmetric multiprocessing (SMP), GPUs and FPGAs. We present how the application can be optimized from an algorithmic point of view and how this optimization performs on two heterogeneous platforms. The presented implementation relies on the OmpSs programming model, which introduces a simplified pragma-based syntax for the communication between the main processor and the accelerators. Performance improvement can be achieved by the programmer explicitly specifying the data memory accesses or copies. As expected, the newer SMP+GPU system studied is more powerful than the older SMP+FPGA system. However the latter is enough to fulfill the requirements of our use case and we show that uses less energy when considering only the active power of the execution.},
  keywords = {},
  pubstate = {published},
  tppubtype = {inproceedings}
}
Amourgianos-Lorentzos, Vasileios
Efficient network interface design for low cost distributed systems Masters Thesis
2017.
% The author is written surname-first: "Amourgianos-Lorentzos Vasileios" was parsed
% with Vasileios as the surname. The citation key is left unchanged.
% NOTE(review): school is required for a master's thesis; the value below is
% inferred from the purl.tuc.gr repository URL. Confirm.
@mastersthesis{Vasileios2017,
  author = {Amourgianos-Lorentzos, Vasileios},
  title = {Efficient network interface design for low cost distributed systems},
  school = {Technical University of Crete},
  year = {2017},
  date = {2017-10-06},
  url = {http://purl.tuc.gr/dl/dias/88E308FB-EB66-4201-8559-C7CC96ED6895},
  abstract = {Cyber-Physical Systems (CPSs) are widely used in many applications that require physical inputs and outputs. A CPS usually combines a set of hardware-software components to achieve optimal application execution in terms of performance and energy consumption. An important ability and requirement of CPSs is to be scalable through modularity, making them cost effective and providing enough computational power for the assigned tasks.This thesis is the result of our efforts to design and implement a cost-effective yet efficient Network Interface (NI) for the AXIOM CPS board and system to achieve said modularity. The NI has to achieve high throughput, with remote access to the memory, and several different transfer types to accommodate the project's needs. It also has to be cost effective, with the minimum possible usage of the board's available resources and follow certain guidelines for easy connectivity.},
  keywords = {},
  pubstate = {published},
  tppubtype = {mastersthesis}
}
Rizzo, Antonio; Montefoschi, Francesco; Caporali, Maurizio; Gisondi, Antonio; Burresi, Giovanni; Giorgi, Roberto
Rapid prototyping IoT solutions based on Machine Learning Proceedings Article
In: Conference: the European Conference, 2017.
% NOTE(review): the booktitle looks like an export placeholder rather than the real
% proceedings title; replace it with the title registered for the DOI.
@inproceedings{Rizzo2017,
  author = {Rizzo, Antonio and Montefoschi, Francesco and Caporali, Maurizio and Gisondi, Antonio and Burresi, Giovanni and Giorgi, Roberto},
  title = {Rapid prototyping {IoT} solutions based on Machine Learning},
  booktitle = {Conference: the European Conference},
  year = {2017},
  date = {2017-09-01},
  doi = {10.1145/3121283.3121291},
  url = {https://www.researchgate.net/publication/320510026_Rapid_prototyping_IoT_solutions_based_on_Machine_Learning},
  abstract = {Nowadays Machine Learning (ML) has reached an all-time high, and this is evident by considering the increasing number of successful start-ups, applications and services in this domain. ML techniques are being developed and applied to an ever-growing range of fields, from on-demand delivery to smart home. Nevertheless, these solutions are failing at getting mainstream adoption among interaction designers due to high complexity. In this paper we present the integration of two Machine Learning algorithms into UAPPI, our open source extension of the prototyping environment MIT App Inventor. In UAPPI much of the complexity related to ML has been abstracted away, providing easy-to-use graphical blocks for rapid prototyping Internet of Things solutions. We report on limits and opportunities emerged from the first two scenario-based explorations of our design process.},
  keywords = {},
  pubstate = {published},
  tppubtype = {inproceedings}
}
Giorgi, Roberto
AXIOM: A 64-bit reconfigurable hardware/software platform for scalable embedded computing Proceedings Article
In: Embedded Computing (MECO), 2017 6th Mediterranean Conference on, IEEE, 2017, ISBN: 978-1-5090-6742-8.
% The booktitle was in the inverted IEEE Xplore form ("Embedded Computing (MECO),
% 2017 6th Mediterranean Conference on"); it is rewritten in natural word order to
% match the other MECO records in this file.
@inproceedings{Giorgi2017,
  author = {Giorgi, Roberto},
  title = {{AXIOM}: A 64-bit reconfigurable hardware/software platform for scalable embedded computing},
  booktitle = {6th Mediterranean Conference on Embedded Computing (MECO)},
  publisher = {IEEE},
  year = {2017},
  date = {2017-07-13},
  doi = {10.1109/MECO.2017.7977173},
  isbn = {978-1-5090-6742-8},
  url = {http://ieeexplore.ieee.org/abstract/document/7977173/},
  abstract = {The AXIOM platform is built with, in mind, the possibility of executing an application not only on a single board but also, in a distributed fashion, on multiple boards. While this is a classic problem with some solutions in the case of no constraints, it becomes interesting for embedded computing and cyber-physical systems where we aim to accelerate applications while maintaining energy efficiency and also easy programmability. Currently, the AXIOM platform consists of a custom board based on the Xilinx Zynq Ultrascale+ ZU9EG which incorporates the largest FPGA available on that System-on-Chip at the moment, four 64-bit ARM cores and two 32-bit ARM cores, up to 32GiB of main memory and several 12.5Gbit/s tranceivers. We relyed on this hardware to develop our novel concept, which exploits dataflow execution in multiple ways for programs that are written in an OpenMP extension, known as OmpSs. A key aspect relates to the adopted memory consistency model, which allows the programmer to focus on aspects other than taking care of the communication among nodes. The lower level of our communication stack relies on a fast interconnect based on inexpensive USB-C type connectors rather than on other proprietary interfaces. The reconfigurable logic provides a complete Network Interface Card (NIC) to allow fast routing of the data and code of the system. We envision many applications for this platform although we are currently focused on developing two basic scenarios based on the Smart-Home and on Smart-Video surveillance. Our initial results confirm good scalability of the platform and a speed-up compared to other programming models such as Cilk and OpenMPI.},
  keywords = {},
  pubstate = {published},
  tppubtype = {inproceedings}
}
Pons, Jaume Bosch
Asynchronous runtime for task-based dataflow programming models Masters Thesis
2017.
@mastersthesis{Pons2017,
  author = {Bosch Pons, Jaume},
  title = {Asynchronous runtime for task-based dataflow programming models},
  year = {2017},
  date = {2017-07-01},
  abstract = {The importance of parallel programming is increasing year after year since the power wall popularized multi-core processors, and with them, shared memory parallel programming models. In particular, task-based programming models, like the standard OpenMP 4.0, have become more and more important. They allow describing a set of data dependences per task that the runtime uses to order the execution of tasks. This order is calculated using shared graphs, which are updated by all threads but in exclusive access using synchronization mechanisms (locks) to ensure the dependences correctness. Although exclusive accesses are necessary to avoid data race conditions, those may imply contention that limits the application parallelism. This becomes critical in many-core systems because several threads may be wasting computation resources waiting to access the runtime structures. This master thesis introduces the concept of an asynchronous runtime management suitable for task-based programming model runtimes. The runtime proposal is based on the asynchronous management of the runtime structures like task dependence graphs. Therefore, the application threads request actions to the runtime instead of directly executing the needed modifications. The requests are then handled by a runtime manager which can be implemented in different ways. This master thesis presents an extension to a previously implemented centralized runtime manager and presents a novel implementation of a distributed runtime manager. On one hand, the runtime design based on a centralized manager [1] is extended to dynamically adapt the runtime behavior according to the manager load with the objective of being as fast as possible. On the other hand, a novel runtime design based on a distributed manager implementation is proposed to overcome the limitations observed in the centralized design.
The distributed runtime implementation allows any thread to become a runtime manager thread if it helps to exploit the application parallelism. That is achieved using a new runtime feature, also implemented in this master thesis, for runtime functionality dispatching through a callback system. The proposals are evaluated in different many-core architectures and their performance is compared against the baseline runtimes used to implement the asynchronous versions. Results show that the centralized manager extension can overcome the hard limitations of the initial basic implementation, that the distributed manager fixes the observed problems in previous implementation, and the proposed asynchronous organization significantly outperforms the speedup obtained by the original runtime for real benchmarks.},
  keywords = {},
  pubstate = {published},
  tppubtype = {mastersthesis}
}
Theodoropoulos, Dimitris; Mazumdar, Somnath; Ayguadé, Eduard; Bettin, Nicola; Bueno, Javier; Ermini, Sara; Filgueras, Antonio; Jiménez-González, Daniel; Álvarez Martínez, Carlos; Martorell, Xavier; Montefoschi, Francesco; Oro, David; Pnevmatikatos, Dionisis; Rizzo, Antonio; Gai, Paolo; Garzarella, Stefano; Morelli, Bruno; Pomella, Alberto; Giorgi, Roberto
The AXIOM Platform for Next-generation Cyber Physical Systems Journal Article
In: Microprocessors and Microsystems, Elsevier B.V., 2017.
@article{Theodoropoulos2017,
  author = {Theodoropoulos, Dimitris and Mazumdar, Somnath and Ayguad{\'e}, Eduard and Bettin, Nicola and Bueno, Javier and Ermini, Sara and Filgueras, Antonio and Jim{\'e}nez-Gonz{\'a}lez, Daniel and {\'A}lvarez Mart{\'\i}nez, Carlos and Martorell, Xavier and Montefoschi, Francesco and Oro, David and Pnevmatikatos, Dionisis and Rizzo, Antonio and Gai, Paolo and Garzarella, Stefano and Morelli, Bruno and Pomella, Alberto and Giorgi, Roberto},
  title = {The {AXIOM} Platform for Next-generation Cyber Physical Systems},
  journal = {Microprocessors and Microsystems},
  publisher = {Elsevier B.V.},
  year = {2017},
  date = {2017-06-03},
  doi = {10.1016/j.micpro.2017.05.018},
  url = {https://www.researchgate.net/publication/317332095_The_AXIOM_Platform_for_Next-generation_Cyber_Physical_Systems},
  abstract = {Cyber-Physical Systems (CPSs) are widely used in many applications that require interactions between humans and their physical environment. These systems usually integrate a set of hardware-software components for optimal application execution in terms of performance and energy consumption. The AXIOM project (Agile, eXtensible, fast I/O Module), presented in this paper, proposes a hardware-software platform for CPS coupled with an easy parallel programming model and sufficient connectivity so that the performance can scale-up by adding multiple boards. AXIOM supports a task-based programming model based on OmpSs and leverages a high-speed, inexpensive communication interface called AXIOM-Link. The board also tightly couples the CPU with reconfigurable resources to accelerate portions of the applications. As case studies, AXIOM uses smart video surveillance, and smart home living applications.},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}
Wagner, Michael; Llort, Germán; Filgueras, Antonio; Jiménez-González, Daniel; Servat, Harald; Teruel, Xavier; Mercadal, Estanislao; Álvarez, Carlos; Giménez, Judit; Martorell, Xavier; Ayguadé, Eduard; Labarta, Jesús
Monitoring Heterogeneous Applications with the OpenMP Tools Interface Proceedings Article
In: Tools for High Performance Computing 2016, pp. 41–57, Springer, 2017.
@inproceedings{Wagner2017,
  author = {Wagner, Michael and Llort, Germ{\'a}n and Filgueras, Antonio and Jim{\'e}nez-Gonz{\'a}lez, Daniel and Servat, Harald and Teruel, Xavier and Mercadal, Estanislao and {\'A}lvarez, Carlos and Gim{\'e}nez, Judit and Martorell, Xavier and Ayguad{\'e}, Eduard and Labarta, Jes{\'u}s},
  title = {Monitoring Heterogeneous Applications with the {OpenMP} Tools Interface},
  booktitle = {Tools for High Performance Computing 2016},
  pages = {41--57},
  publisher = {Springer},
  year = {2017},
  date = {2017-05-09},
  doi = {10.1007/978-3-319-56702-0_3},
  url = {https://link.springer.com/chapter/10.1007/978-3-319-56702-0_3},
  abstract = {Heterogeneous systems are gaining more importance in supercomputing, yet they are challenging to program and developers require support tools to understand how well their accelerated codes perform and how they can be improved. The OpenMP Tools Interface (OMPT) is a new performance monitoring interface that is being considered for integration into the OpenMP standard. OMPT allows monitoring the execution of heterogeneous OpenMP applications by revealing the activity of the runtime through a standardized API as well as facilitating the exchange of performance information between devices with accelerated codes, and the analysis tool. In this paper we describe our efforts implementing parts of the OMPT specification necessary to monitor accelerators. In particular, the integration of the OMPT features to our parallel runtime system and instrumentation framework helps to obtain detailed performance information about the execution of the accelerated tasks issued to the devices to allow an insightful analysis. As a result of this analysis, the parallel runtime of the programming model has been improved. We focus on the evaluation of monitoring FPGA devices studying the performance of a common kernel in scientific algorithms: matrix multiplication. Nonetheless, this development is as well applicable to monitor GPU accelerators and Intel\textregistered{} Xeon Phi\texttrademark{} co-processors operating under the OmpSs programming model.},
  keywords = {},
  pubstate = {published},
  tppubtype = {inproceedings}
}
Rizzo, Antonio; Burresi, Giovanni; Montefoschi, Francesco; Caporali, Maurizio; Giorgi, Roberto
Making IoT with UDOO Journal Article
In: Interaction Design and Architecture(s), vol. 1, no. 30, pp. 95-112, 2016, ISSN: 1826-9745.
@article{Rizzo2016,
  author = {Rizzo, Antonio and Burresi, Giovanni and Montefoschi, Francesco and Caporali, Maurizio and Giorgi, Roberto},
  title = {Making {IoT} with {UDOO}},
  editor = {{Scuola IaD}},
  journal = {Interaction Design and Architecture(s)},
  volume = {1},
  number = {30},
  pages = {95--112},
  year = {2016},
  date = {2016-12-01},
  issn = {1826-9745},
  url = {http://www.mifav.uniroma2.it/inevent/events/idea2010/doc/30_6.pdf},
  abstract = {The advent of massively interconnected objects, devices, and sensors raises equally substantial challenges regarding the resources that will allow makers to manage the complexity of such systems and to exploit the opportunities such technologies open up. Simplicity in management and a smooth, creative integration of everyday life objects empowered by digital technology in our own environment are two key factors for a successful penetration of Internet of Things (IoT). We present UDOO IoT, a combined set of open hardware (UDOO Quad, Blu and Bricks) and open software (UAPPI, an extension of MIT App Inventor) technologies that allow novices from their early steps in the makers world to create their own digital objects connected to the cloud, easily defining custom behavior logic for sensors and actuators. UDOO IoT is illustrated through one of the field studies carried out along its design process.},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}
Giorgi, Roberto; Mazumdar, Somnath; Viola, Stefano; Gai, Paolo; Garzarella, Stefano; Morelli, Bruno; Pnevmatikatos, Dionisios; Theodoropoulos, Dimitris; Alvarez, Carlos; Ayguade, Eduard; Bueno, Javier; Filgueras, Antonio; Jimenez-Gonzalez, Daniel; Martorell, Xavier
Modeling Multi-Board Communication in the AXIOM Cyber-Physical System Journal Article
In: Ada User Journal, vol. 37, no. 4, pp. 228-235, 2016, ISSN: 1381-6551.
@article{Giorgi2016,
  author = {Giorgi, Roberto and Mazumdar, Somnath and Viola, Stefano and Gai, Paolo and Garzarella, Stefano and Morelli, Bruno and Pnevmatikatos, Dionisios and Theodoropoulos, Dimitris and Alvarez, Carlos and Ayguade, Eduard and Bueno, Javier and Filgueras, Antonio and Jimenez-Gonzalez, Daniel and Martorell, Xavier},
  title = {Modeling Multi-Board Communication in the {AXIOM} Cyber-Physical System},
  journal = {Ada User Journal},
  volume = {37},
  number = {4},
  pages = {228--235},
  year = {2016},
  date = {2016-12-01},
  issn = {1381-6551},
  abstract = {The main goal of the AXIOM project is to design a small board that could be used
as a LEGO\textsuperscript{TM}-style module to build systems with more performance
while keeping the programming task simple by using a familiar shared-memory
programming model. The interconnection plays a crucial role both for the need of providing
fast and reliable communication (including lossless control flow
as, e.g., Infiniband, but with a simplified scope and cost). In this paper, we outline some of our initial choices and explore the performance of RDMA based mechanisms and interfaces, including the remote memory management behind the programming model. Our initial results show a potential for scaling the system as we use DF-Threads, good bandwidth for RDMA transfers, promising to scale once we use the OmpSs, programming model.},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}
as a LEGO™-style module to build systems with more performance
while keeping the programming task simple by using a familiar shared-memory
programming model. The interconnection plays a crucial role both for the need of providing
fast and reliable communication (including lossless control flow
as, e.g., Infiniband, but with a simplified scope and cost). In this paper, we outline some of our initial choices and explore the performance of RDMA based mechanisms and interfaces, including the remote memory management behind the programming model. Our initial results show a potential for scaling the system as we use DF-Threads, good bandwidth for RDMA transfers, promising to scale once we use the OmpSs, programming model.
2019
Filgueras, A; Vidal, M; Mateu, M; Jiménez-González, D; Álvarez, C; Martorell, X; Ayguadé, E; Theodoropoulos, D; Pnevmatikatos, D; Gai, P; Garzarella, S; Oro, D; Hernando, J; Bettin, N; Pomella, A; Procaccini, M; Giorgi, R
The AXIOM Project: IoT on Heterogeneous Embedded Platforms Journal Article
In: IEEE Design and Test, vol. pre-print, pp. 1-6, 2019, ISSN: 2168-2356.
@article{Filgueras19-ieee_dnt,
  author = {Filgueras, A and Vidal, M and Mateu, M and Jim{\'e}nez-Gonz{\'a}lez, D and {\'A}lvarez, C and Martorell, X and Ayguad{\'e}, E and Theodoropoulos, D and Pnevmatikatos, D and Gai, P and Garzarella, S and Oro, D and Hernando, J and Bettin, N and Pomella, A and Procaccini, M and Giorgi, R},
  title = {The {AXIOM} Project: {IoT} on Heterogeneous Embedded Platforms},
  journal = {IEEE Design and Test},
  pages = {1--6},
  year = {2019},
  date = {2019-11-01},
  note = {Pre-print},
  doi = {10.1109/MDAT.2019.2952335},
  issn = {2168-2356},
  abstract = {The AXIOM project aims at providing an environment for Cyber-Physical Systems. Smart Video Surveillance targets public environments, involving real-time face detection in crowds. Smart Home Living targets home environments and access control. These applications are used as experimental usecases for the AXIOM platform, currently based on the Xilinx Zynq-7000 SoCs. We have integrated the Xilinx Vivado HLS tool for the FPGA support within the OmpSs programming model, to enable OpenMP-like programming in the FPGA. This paper presents the programming environment, and the evaluation of the most computationally expensive parts of the target applications.},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}
Giorgi, Roberto; Khalili, Farnam; Procaccini, Marco
Translating Timing into an Architecture: the Synergy of COTSon and HLS (Domain Expertise: Designing a Computer Architecture via HLS) Journal Article
In: International Journal of Reconfigurable Computing, pp. 1–18, 2019, ISSN: 1687-7209.
@article{Giorgi19-ijrc,
  author = {Giorgi, Roberto and Khalili, Farnam and Procaccini, Marco},
  title = {Translating Timing into an Architecture: the Synergy of {COTSon} and {HLS} (Domain Expertise: Designing a Computer Architecture via {HLS})},
  journal = {International Journal of Reconfigurable Computing},
  pages = {1--18},
  address = {London, UK},
  year = {2019},
  date = {2019-09-01},
  doi = {10.1155/2019/2624938},
  issn = {1687-7209},
  abstract = {Translating a system requirement into a low-level representation (e.g., register transfer level or RTL) is the typical goal of the design of FPGA based systems. However, the Design Space Exploration (DSE) needed to identify the final architecture may be time consuming, even when using High Level Synthesis (HLS) tools.
In this paper, we illustrate our hybrid methodology, which uses a frontend to HLS so that the DSE is performed more rapidly by using a higher-level abstraction, but without losing accuracy, thanks to the HP-Labs COTSon simulation infrastructure in combination with our DSE tools (MYDSE tools). In particular, this proposed methodology proved useful to achieve appropriate design of a whole system in a shorter time than trying to design everything directly in HLS.
Our motivating problem was to deploy a novel execution model called Data-Flow Threads (DF-Threads) running on yet to be designed hardware. For that goal, directly using the HLS was too premature in the design cycle. Therefore, a key point of our methodology consists in defining the first prototype in our simulation framework and gradually migrating the design into the Xilinx HLS after validating the key performance metrics of our novel system in the simulator.
To explain this workflow, we first use a simple driving example consisting in the modelling of a two-way associative cache. Then, we explain how we generalized this methodology and describe the types of results that we were able to analyze in the AXIOM project, that helped us reduce the development time from months/weeks to days/hours.},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}
In this paper, we illustrate our hybrid methodology, which uses a frontend to HLS so that the DSE is performed more rapidly by using a higher-level abstraction, but without losing accuracy, thanks to the HP-Labs COTSon simulation infrastructure in combination with our DSE tools (MYDSE tools). In particular, this proposed methodology proved useful to achieve appropriate design of a whole system in a shorter time than trying to design everything directly in HLS.
Our motivating problem was to deploy a novel execution model called Data-Flow Threads (DF-Threads) running on yet to be designed hardware. For that goal, directly using the HLS was too premature in the design cycle. Therefore, a key point of our methodology consists in defining the first prototype in our simulation framework and gradually migrating the design into the Xilinx HLS after validating the key performance metrics of our novel system in the simulator.
To explain this workflow, we first use a simple driving example consisting in the modelling of a two-way associative cache. Then, we explain how we generalized this methodology and describe the types of results that we were able to analyze in the AXIOM project, that helped us reduce the development time from months/weeks to days/hours.
Giorgi, Roberto; Procaccini, Marco
Bridging a Data-Flow Execution Model to a Simple Programming Model Proceedings Article
In: IEEE Proc. of the International Conference on High Performance Computing and Simulation (HPCS), pp. 165-168, Dublin, Ireland, 2019, ISBN: 978-1-7281-4484-9.
@inproceedings{Giorgi19-hpcsb,
  author = {Giorgi, Roberto and Procaccini, Marco},
  title = {Bridging a Data-Flow Execution Model to a Simple Programming Model},
  booktitle = {IEEE Proc. of the International Conference on High Performance Computing and Simulation (HPCS)},
  pages = {165--168},
  address = {Dublin, Ireland},
  year = {2019},
  date = {2019-07-01},
  doi = {10.1109/HPCS48598.2019.9188183},
  isbn = {978-1-7281-4484-9},
  abstract = {Starting from a Data-Flow execution model called ``DF-Threads'',
we defined a minimalistic API to enable an efficient implementation in the hardware
of the distribution of the threads across the cores of a single multi-core system
and across the remote cores of a cluster.
We aim at proposing this API as a simple programming model in C language
that can potentially permit an easy interface between DF-Threads and generic programming models.
Clusters are typically programmed with MPI, therefore we evaluated
our approach against OpenMPI. If we consider the delivered GFLOPS per core,
DF-Threads are also competitive in respect to CUDA. In the basic examples, that we used
in this initial investigation, DF-Threads achieve better performance-per-core
compared to OpenMPI and CUDA. In particular, OpenMPI has a large portion of OS-kernel
activity, which is slowing down its performance.},
  keywords = {},
  pubstate = {published},
  tppubtype = {inproceedings}
}
we defined a minimalistic API to enable an efficient implementation in the hardware
of the distribution of the threads across the cores of a single multi-core system
and across the remote cores of a cluster.
We aim at proposing this API as a simple programming model in C language
that can potentially permit an easy interface between DF-Threads and generic programming models.
Clusters are typically programmed with MPI, therefore we evaluated
our approach against OpenMPI. If we consider the delivered GFLOPS per core,
DF-Threads are also competitive in respect to CUDA. In the basic examples, that we used
in this initial investigation, DF-Threads achieve better performance-per-core
compared to OpenMPI and CUDA. In particular, OpenMPI has a large portion of OS-kernel
activity, which is slowing down its performance.
Giorgi, Roberto; Procaccini, Marco
Bridging a Data-Flow Execution Model to a Simple Programming Model Proceedings Article
In: IEEE Proc. of the International Conference on High Performance Computing and Simulation (HPCS), pp. 165-168, Dublin, Ireland, 2019, ISBN: 978-1-7281-4484-9.
@inproceedings{Giorgi19-hpcs,
  author = {Giorgi, Roberto and Procaccini, Marco},
  title = {Bridging a Data-Flow Execution Model to a Simple Programming Model},
  booktitle = {IEEE Proc. of the International Conference on High Performance Computing and Simulation (HPCS)},
  pages = {165--168},
  address = {Dublin, Ireland},
  year = {2019},
  date = {2019-07-01},
  doi = {10.1109/HPCS48598.2019.9188183},
  isbn = {978-1-7281-4484-9},
  abstract = {Starting from a Data-Flow execution model called ``DF-Threads'',
we defined a minimalistic API to enable an efficient implementation in the hardware
of the distribution of the threads across the cores of a single multi-core system
and across the remote cores of a cluster.
We aim at proposing this API as a simple programming model in C language
that can potentially permit an easy interface between DF-Threads and generic programming models.
Clusters are typically programmed with MPI, therefore we evaluated
our approach against OpenMPI. If we consider the delivered GFLOPS per core,
DF-Threads are also competitive in respect to CUDA. In the basic examples, that we used
in this initial investigation, DF-Threads achieve better performance-per-core
compared to OpenMPI and CUDA. In particular, OpenMPI has a large portion of OS-kernel
activity, which is slowing down its performance.},
  keywords = {},
  pubstate = {published},
  tppubtype = {inproceedings}
}
we defined a minimalistic API to enable an efficient implementation in the hardware
of the distribution of the threads across the cores of a single multi-core system
and across the remote cores of a cluster.
We aim at proposing this API as a simple programming model in C language
that can potentially permit an easy interface between DF-Threads and generic programming models.
Clusters are typically programmed with MPI, therefore we evaluated
our approach against OpenMPI. If we consider the delivered GFLOPS per core,
DF-Threads are also competitive in respect to CUDA. In the basic examples, that we used
in this initial investigation, DF-Threads achieve better performance-per-core
compared to OpenMPI and CUDA. In particular, OpenMPI has a large portion of OS-kernel
activity, which is slowing down its performance.
Giorgi, Roberto; Bettin, Nicola; Ermini, Sara; Montefoschi, Francesco; Rizzo, Antonio
An Iris+Voice Recognition System for a Smart Doorbell Proceedings Article
In: IEEE 8th Mediterranean Conference on Embedded Computing (MECO), pp. 419-422, 2019, ISSN: 2377-5475.
@inproceedings{Giorgi19-mecoiris,
  author = {Giorgi, Roberto and Bettin, Nicola and Ermini, Sara and Montefoschi, Francesco and Rizzo, Antonio},
  title = {An Iris+Voice Recognition System for a Smart Doorbell},
  booktitle = {IEEE 8th Mediterranean Conference on Embedded Computing (MECO)},
  pages = {419--422},
  year = {2019},
  date = {2019-06-01},
  doi = {10.1109/MECO.2019.8760187},
  issn = {2377-5475},
  abstract = {In this paper, we describe our methodology for designing a smart doorbell system for the homes.
While the recent trend of big companies is to offer a home voice assistant, which can integrate
all possible services, including the recognition of the owner (or authorized people) at the house door,
privacy concerns and independence from a single service provider are requiring more freedom in the
choice of the ``smart objects'' that surround us.
The doorbell system is using both iris and voice recognition to verify the identity of the user
who rings at the door. Since there is the involvement of biometric data, this information has to be properly handled.
In particular, we designed our system in such a way that it can avoid to send or store any biometric data
to the cloud. Machine-learning algorithms are used to perform local computations, thus implementing
Edge-Computing analytics to determine the identity of the user, by combining both voice and iris biometrics.
The system is implemented on reconfigurable hardware in order to accelerate some of the most intensive
tasks and achieve enough performance at a reasonable power consumption.
Our tests confirm that, by using our architecture, the performance is about 5x the sequential case
and, at the same time, we reach about 7x less energy consumption.},
  keywords = {},
  pubstate = {published},
  tppubtype = {inproceedings}
}
While the recent trend of big companies is to offer a home voice assistant, which can integrate
all possible services, including the recognition of the owner (or authorized people) at the house door,
privacy concerns and independence from a single service provider are requiring more freedom in the
choice of the ``smart objects'' that surround us.
The doorbell system is using both iris and voice recognition to verify the identity of the user
who rings at the door. Since there is the involvement of biometric data, this information has to be properly handled.
In particular, we designed our system in such a way that it can avoid to send or store any biometric data
to the cloud. Machine-learning algorithms are used to perform local computations, thus implementing
Edge-Computing analytics to determine the identity of the user, by combining both voice and iris biometrics.
The system is implemented on reconfigurable hardware in order to accelerate some of the most intensive
tasks and achieve enough performance at a reasonable power consumption.
Our tests confirm that, by using our architecture, the performance is about 5x the sequential case
and, at the same time, we reach about 7x less energy consumption.
Giorgi, Roberto; Oro, David; Ermini, Sara; Montefoschi, Francesco; Rizzo, Antonio
Embedded Face Analysis for Smart Videosurveillance Proceedings Article
In: IEEE 8th Mediterranean Conference on Embedded Computing (MECO), pp. 403-407, 2019, ISSN: 2377-5475.
@inproceedings{Giorgi19-mecoherta,
  author = {Giorgi, Roberto and Oro, David and Ermini, Sara and Montefoschi, Francesco and Rizzo, Antonio},
  title = {Embedded Face Analysis for Smart Videosurveillance},
  booktitle = {IEEE 8th Mediterranean Conference on Embedded Computing (MECO)},
  pages = {403--407},
  year = {2019},
  date = {2019-06-01},
  doi = {10.1109/MECO.2019.8760200},
  issn = {2377-5475},
  internal-note = {NOTE(review): the abstract below describes an FFT optimization and does not appear to match this title -- verify against the published paper},
  abstract = {The native implementation of the N-point digital Fourier Transform involves calculating the scalar product of the sample buffer (treated as an N-dimensional vector) with N separate basis vectors. Since each scalar product involves N multiplications and N additions, the total time is proportional to $N^2$, in other words, it's an $O(N^2)$ algorithm. However, it turns out that by cleverly re-arranging these operations, one can optimize the algorithm down to $O(N \log_2 N)$, which for large N makes a huge difference. The optimized version of the algorithm is called the Fast Fourier Transform, or the FFT.
In this paper, we discuss about an efficient way to obtain Fast Fourier Transform algorithm (FFT).
According to our study, we can eliminate some operations in calculating the FFT algorithm thanks to property of complex numbers and we can achieve the FFT in a better execution time due to a significant reduction of $N/8$ of the needed twiddle factors and to additional factorizations.},
  keywords = {},
  pubstate = {published},
  tppubtype = {inproceedings}
}
In this paper, we discuss about an efficient way to obtain Fast Fourier Transform algorithm (FFT).
According to our study, we can eliminate some operations in calculating the FFT algorithm thanks to property of complex numbers and we can achieve the FFT in a better execution time due to a significant reduction of $N/8$ of the needed twiddle factors and to additional factorizations.
Giorgi, Roberto; Khalili, Farnam; Procaccini, Marco
AXIOM: A Scalable, Efficient and Reconfigurable Embedded Platform Proceedings Article
In: IEEE Proceedings of Design, Automation and Test in Europe (DATE), pp. 1–6, Florence, Italy, 2019, ISBN: 978-3-9819263-3-0.
@inproceedings{Giorgi19-dateb,
  author = {Giorgi, Roberto and Khalili, Farnam and Procaccini, Marco},
  title = {{AXIOM}: A Scalable, Efficient and Reconfigurable Embedded Platform},
  booktitle = {IEEE Proceedings of Design, Automation and Test in Europe (DATE)},
  pages = {1--6},
  address = {Florence, Italy},
  year = {2019},
  date = {2019-03-01},
  doi = {10.23919/DATE.2019.8715168},
  isbn = {978-3-9819263-3-0},
  abstract = {Cyber-Physical Systems (CPSs) are becoming widely used in every application that requires interaction between humans and the physical environment. People expect this interaction to happen in real-time and this creates pressure onto system designs due to the ever-higher demand for data processing in the shortest possible and predictable time. Additionally, easy programmability, energy efficiency, and modular scalability are also important to ensure these systems to become widespread. All these requirements push new scientific and technological challenges towards the engineering community. The AXIOM project (Agile, eXtensible, fast I/O Module), presented in this paper, introduces a new hardware-software platform for CPS, which can provide an easy parallel programming model and fast connectivity, in order to scale-up performance by adding multiple boards. The AXIOM platform consists of a custom board based on a Xilinx Zynq Ultrascale+ ZU9EG SoC including
four 64-bit ARM cores, the Arduino socket and four high-speed (up to 18 Gbps) connectors on USB-C receptacles. By relying on this hardware, DF-Threads, a novel execution model based on
dataflow modality, has been developed and tested. In this paper, we highlight some major conclusions of the AXIOM project, such as the gain in performance compared to other parallel programming models such as OpenMPI and Cilk.},
  keywords = {},
  pubstate = {published},
  tppubtype = {inproceedings}
}
four 64-bit ARM cores, the Arduino socket and four high-speed (up to 18 Gbps) connectors on USB-C receptacles. By relying on this hardware, DF-Threads, a novel execution model based on
dataflow modality, has been developed and tested. In this paper, we highlight some major conclusions of the AXIOM project, such as the gain in performance compared to other parallel programming models such as OpenMPI and Cilk.
Giorgi, Roberto; Khalili, Farnam; Procaccini, Marco
Analyzing the Impact of Operating System Activity of different Linux Distributions in a Distributed Environment Proceedings Article
In: IEEE Euromicro International Conference on Parallel, Distributed, and Network-Based Processing, pp. 422-429, Pavia, Italy, 2019, ISBN: 978-1-7281-1644-0.
@inproceedings{Giorgi19-pdp,
  author = {Giorgi, Roberto and Khalili, Farnam and Procaccini, Marco},
  title = {Analyzing the Impact of Operating System Activity of different {Linux} Distributions in a Distributed Environment},
  booktitle = {IEEE Euromicro International Conference on Parallel, Distributed, and Network-Based Processing},
  pages = {422--429},
  address = {Pavia, Italy},
  year = {2019},
  date = {2019-02-01},
  doi = {10.1109/EMPDP.2019.8671562},
  isbn = {978-1-7281-1644-0},
  abstract = {A rise in the number of threads in large-scale applications running on multi-node architectures makes operating system activity increasingly more relevant. Therefore, evaluation methodologies need to account for these activities. We decided to build our evaluation environment through the COTSon simulator. Moreover, our environment permits flexible Design Space Exploration (DSE) by making easy the management of many experiments and the characterizations of Operating System (OS) activity. In this paper, we show the result analysis tool flow and the OS impact of different Linux distributions running on a distributed environment consisting of several nodes with a full OS. In order to quantify our results, we use matrix multiplication benchmark executed through a DataFlow model, named DataFlow Threads(DF-Threads). We analyze key metrics like L2 cache miss rate, execution cycles, data access latency, and kernel cycles showing up to 60\% performance variations among the different OS distributions.},
  keywords = {},
  pubstate = {published},
  tppubtype = {inproceedings}
}
Giorgi, Roberto; Khalili, Farnam; Procaccini, Marco
A Design Space Exploration Tool Set for Future 1K-core High-Performance Computers Proceedings Article
In: ACM Workshop on Rapid Simulation and Performance Evaluation: Methods and Tools (RAPIDO), pp. 1–6, Valencia, Spain, 2019, ISBN: 978-1-4503-6260-3.
@inproceedings{Giorgi19-rapido,
title = {A Design Space Exploration Tool Set for Future 1K-core High-Performance Computers},
author = {Roberto Giorgi and Farnam Khalili and Marco Procaccini},
doi = {10.1145/3300189.3300195},
isbn = {978-1-4503-6260-3},
year = {2019},
date = {2019-01-01},
booktitle = {ACM Workshop on Rapid Simulation and Performance Evaluation: Methods and Tools (RAPIDO)},
pages = {1--6},
address = {Valencia, Spain},
abstract = {Given the constantly growing complexity of multi-core architectures, Design Space Exploration (DSE) tools play an important role to evaluate different design options. In this paper, we present a DSE toolset targeting massively parallelized HW/SW architectures with a high degree of flexibility in order to successfully simulate multi-core-multi-node platforms. Our DSE tools provide a rapid and simple-to-use work-flow to easily retrieve and analyze the key metrics and eventually evaluate the design. We examine the DSE toolset and methodology while performing several simulations of a general purpose 1K-core architecture and evaluate not only standard metrics like the L2 cache miss rates, but also operating system activity and its impact. We leverage the knowledge gained in our methodology to develop and evaluate a novel dataflow execution model named ``DataFlow-Threads'' (DF-Threads). We validated the outcomes of the simulator against an equivalent FPGA-based design.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
general purpose 1K-core architecture and evaluate not only standard metrics like the L2 cache miss rates, but also operating system activity and its impact. We leverage the knowledge gained in our methodology to develop and evaluate a novel dataflow execution model named “DataFlow-Threads” (DF-Threads). We validated the outcomes of the simulator against an equivalent FPGA-based design.
2018
Giorgi, Roberto; Khalili, Farnam; Procaccini, Marco
Energy Efficiency Exploration on the ZYNQ Ultrascale+ Proceedings Article
In: IEEE Proceedings of the 30th International Conference on Microelectronics (ICM), pp. 52-55, Sousse, Tunisia, 2018, ISBN: 978-1-5386-8166-4.
@inproceedings{Giorgi18-icm,
title = {Energy Efficiency Exploration on the ZYNQ Ultrascale+},
author = {Roberto Giorgi and Farnam Khalili and Marco Procaccini},
isbn = {978-1-5386-8166-4},
year = {2018},
date = {2018-12-01},
booktitle = {IEEE Proceedings of the 30th International Conference on Microelectronics (ICM)},
pages = {52--55},
address = {Sousse, Tunisia},
abstract = {In the context of Cyber-Physical Systems (CPSs), Single Board Computers (SBCs) could provide adaptivity for various present and future applications, and permit scalability through clusters of SBCs while possibly save energy consumption. In this paper, we explore energy efficiency of a Zynq Ultrascale+ based board developed in the context of the AXIOM project. While an entire framework based on the Zynq Ultrascale+ is still in progress, the board is already available and capable of running a full Linux OS and it is possible to measure energy consumption. We demonstrate a possible architecture based on DataFlow-Threads (DF-Threads), a novel execution model, on the Zynq Ultrascale+ platform, in order to assess the energy efficiency of DF-Threads. We measured the power consumption, while the RAW and RDMA message types were transceived through board-to-board interconnects.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Rizzo, Antonio; Caporali, Maurizio; Montefoschi, Francesco; Ermini, Sara; Oro, David; Hupont, Isabelle; Bettin, Nicola
Prototyping Edge Computing Services for IoT Unpublished Forthcoming
Forthcoming.
@unpublished{Rizzo2018,
title = {Prototyping Edge Computing Services for IoT},
author = {Antonio Rizzo and Maurizio Caporali and Francesco Montefoschi and Sara Ermini and David Oro and Isabelle Hupont and Nicola Bettin},
editor = {ecce2018},
year = {2018},
date = {2018-09-05},
note = {Forthcoming},
keywords = {},
pubstate = {forthcoming},
tppubtype = {unpublished}
}
Xu, Ying Hao; Vidal, Miquel; Arejita, Beñat; Diaz, Javier; Alvarez, Carlos; Jiménez-González, Daniel; Martorell, Xavier; Mantovani, Filippo
Implementation of the K-Means Algorithm on Heterogeneous Devices: A Use Case Based on an Industrial Dataset Proceedings Article
In: Advances in Parallel Computing, pp. 642-651, IOS Press, 2018, ISBN: 0927-5452.
@inproceedings{haoXu2018,
title = {Implementation of the K-Means Algorithm on Heterogeneous Devices: A Use Case Based on an Industrial Dataset},
author = {Xu, Ying Hao and Miquel Vidal and Be\~{n}at Arejita and Javier Diaz and Carlos Alvarez and Daniel Jim\'{e}nez-Gonz\'{a}lez and Xavier Martorell and Filippo Mantovani},
url = {https://upcommons.upc.edu/handle/2117/114842},
doi = {10.3233/978-1-61499-843-3-642},
issn = {0927-5452},
year = {2018},
date = {2018-01-01},
booktitle = {Advances in Parallel Computing},
pages = {642--651},
publisher = {IOS Press},
abstract = {This paper presents and analyzes a heterogeneous implementation of an industrial use case based on K-means that targets symmetric multiprocessing (SMP), GPUs and FPGAs. We present how the application can be optimized from an algorithmic point of view and how this optimization performs on two heterogeneous platforms. The presented implementation relies on the OmpSs programming model, which introduces a simplified pragma-based syntax for the communication between the main processor and the accelerators. Performance improvement can be achieved by the programmer explicitly specifying the data memory accesses or copies. As expected, the newer SMP+GPU system studied is more powerful than the older SMP+FPGA system. However the latter is enough to fulfill the requirements of our use case and we show that uses less energy when considering only the active power of the execution.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2017
Amourgianos-Lorentzos, Vasileios
Efficient network interface design for low cost distributed systems Masters Thesis
2017.
@mastersthesis{Vasileios2017,
title = {Efficient network interface design for low cost distributed systems},
author = {Amourgianos-Lorentzos, Vasileios},
school = {Technical University of Crete},
url = {http://purl.tuc.gr/dl/dias/88E308FB-EB66-4201-8559-C7CC96ED6895},
year = {2017},
date = {2017-10-06},
abstract = {Cyber-Physical Systems (CPSs) are widely used in many applications that require physical inputs and outputs. A CPS usually combines a set of hardware-software components to achieve optimal application execution in terms of performance and energy consumption. An important ability and requirement of CPSs is to be scalable through modularity, making them cost effective and providing enough computational power for the assigned tasks. This thesis is the result of our efforts to design and implement a cost-effective yet efficient Network Interface (NI) for the AXIOM CPS board and system to achieve said modularity. The NI has to achieve high throughput, with remote access to the memory, and several different transfer types to accommodate the project's needs. It also has to be cost effective, with the minimum possible usage of the board's available resources and follow certain guidelines for easy connectivity.},
keywords = {},
pubstate = {published},
tppubtype = {mastersthesis}
}
Rizzo, Antonio; Montefoschi, Francesco; Caporali, Maurizio; Gisondi, Antonio; Burresi, Giovanni; Giorgi, Roberto
Rapid prototyping IoT solutions based on Machine Learning Proceedings Article
In: Conference: the European Conference, 2017.
@inproceedings{Rizzo2017,
title = {Rapid prototyping IoT solutions based on Machine Learning},
author = {Antonio Rizzo and Francesco Montefoschi and Maurizio Caporali and Antonio Gisondi and Giovanni Burresi and Roberto Giorgi},
url = {https://www.researchgate.net/publication/320510026_Rapid_prototyping_IoT_solutions_based_on_Machine_Learning},
doi = {10.1145/3121283.3121291},
year = {2017},
date = {2017-09-01},
booktitle = {Conference: the European Conference},
abstract = {Nowadays Machine Learning (ML) has reached an all-time high, and this is evident by considering the increasing number of successful start-ups, applications and services in this domain. ML techniques are being developed and applied to an ever-growing range of fields, from on-demand delivery to smart home. Nevertheless, these solutions are failing at getting mainstream adoption among interaction designers due to high complexity. In this paper we present the integration of two Machine Learning algorithms into UAPPI, our open source extension of the prototyping environment MIT App Inventor. In UAPPI much of the complexity related to ML has been abstracted away, providing easy-to-use graphical blocks for rapid prototyping Internet of Things solutions. We report on limits and opportunities emerged from the first two scenario-based explorations of our design process.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Giorgi, Roberto
AXIOM: A 64-bit reconfigurable hardware/software platform for scalable embedded computing Proceedings Article
In: Embedded Computing (MECO), 2017 6th Mediterranean Conference on, IEEE, 2017, ISBN: 978-1-5090-6742-8.
@inproceedings{Giorgi2017,
title = {AXIOM: A 64-bit reconfigurable hardware/software platform for scalable embedded computing},
author = {Roberto Giorgi},
url = {http://ieeexplore.ieee.org/abstract/document/7977173/},
doi = {10.1109/MECO.2017.7977173},
isbn = {978-1-5090-6742-8},
year = {2017},
date = {2017-07-13},
booktitle = {2017 6th Mediterranean Conference on Embedded Computing (MECO)},
publisher = {IEEE},
abstract = {The AXIOM platform is built with, in mind, the possibility of executing an application not only on a single board but also, in a distributed fashion, on multiple boards. While this is a classic problem with some solutions in the case of no constraints, it becomes interesting for embedded computing and cyber-physical systems where we aim to accelerate applications while maintaining energy efficiency and also easy programmability. Currently, the AXIOM platform consists of a custom board based on the Xilinx Zynq Ultrascale+ ZU9EG which incorporates the largest FPGA available on that System-on-Chip at the moment, four 64-bit ARM cores and two 32-bit ARM cores, up to 32GiB of main memory and several 12.5Gbit/s transceivers. We relied on this hardware to develop our novel concept, which exploits dataflow execution in multiple ways for programs that are written in an OpenMP extension, known as OmpSs. A key aspect relates to the adopted memory consistency model, which allows the programmer to focus on aspects other than taking care of the communication among nodes. The lower level of our communication stack relies on a fast interconnect based on inexpensive USB-C type connectors rather than on other proprietary interfaces. The reconfigurable logic provides a complete Network Interface Card (NIC) to allow fast routing of the data and code of the system. We envision many applications for this platform although we are currently focused on developing two basic scenarios based on the Smart-Home and on Smart-Video surveillance. Our initial results confirm good scalability of the platform and a speed-up compared to other programming models such as Cilk and OpenMPI.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Bosch Pons, Jaume
Asynchronous runtime for task-based dataflow programming models Masters Thesis
2017.
@mastersthesis{Pons2017,
title = {Asynchronous runtime for task-based dataflow programming models},
author = {Bosch Pons, Jaume},
year = {2017},
date = {2017-07-01},
abstract = {The importance of parallel programming is increasing year after year since the power wall popularized multi-core processors, and with them, shared memory parallel programming models. In particular, task-based programming models, like the standard OpenMP 4.0, have become more and more important. They allow describing a set of data dependences per task that the runtime uses to order the execution of tasks. This order is calculated using shared graphs, which are updated by all threads but in exclusive access using synchronization mechanisms (locks) to ensure the dependences correctness. Although exclusive accesses are necessary to avoid data race conditions, those may imply contention that limits the application parallelism. This becomes critical in many-core systems because several threads may be wasting computation resources waiting to access the runtime structures. This master thesis introduces the concept of an asynchronous runtime management suitable for task-based programming model runtimes. The runtime proposal is based on the asynchronous management of the runtime structures like task dependence graphs. Therefore, the application threads request actions to the runtime instead of directly executing the needed modifications. The requests are then handled by a runtime manager which can be implemented in different ways. This master thesis presents an extension to a previously implemented centralized runtime manager and presents a novel implementation of a distributed runtime manager. On one hand, the runtime design based on a centralized manager [1] is extended to dynamically adapt the runtime behavior according to the manager load with the objective of being as fast as possible. On the other hand, a novel runtime design based on a distributed manager implementation is proposed to overcome the limitations observed in the centralized design. The distributed runtime implementation allows any thread to become a runtime manager thread if it helps to exploit the application parallelism. That is achieved using a new runtime feature, also implemented in this master thesis, for runtime functionality dispatching through a callback system. The proposals are evaluated in different many-core architectures and their performance is compared against the baseline runtimes used to implement the asynchronous versions. Results show that the centralized manager extension can overcome the hard limitations of the initial basic implementation, that the distributed manager fixes the observed problems in previous implementation, and the proposed asynchronous organization significantly outperforms the speedup obtained by the original runtime for real benchmarks.},
keywords = {},
pubstate = {published},
tppubtype = {mastersthesis}
}
Theodoropoulos, Dimitris; Mazumdar, Somnath; Ayguade, Eduard; Bettin, Nicola; Bueno, Javier; Ermini, Sara; Filgueras, Antonio; Jiménez-González, Daniel; Álvarez Martínez, Carlos; Martorell, Xavier; Montefoschi, Francesco; Oro, David; Pnevmatikatos, Dionisis; Rizzo, Antonio; Gai, Paolo; Garzarella, Stefano; Morelli, Bruno; Pomella, Alberto; Giorgi, Roberto
The AXIOM Platform for Next-generation Cyber Physical Systems Book Section
In: Microprocessors and Microsystems, Elsevier B.V., 2017.
@article{Theodoropoulos2017,
title = {The AXIOM Platform for Next-generation Cyber Physical Systems},
author = {Dimitris Theodoropoulos and Somnath Mazumdar and Eduard Ayguade and Nicola Bettin and Javier Bueno and Sara Ermini and Antonio Filgueras and Daniel Jim\'{e}nez-Gonz\'{a}lez and \'{A}lvarez Mart\'{i}nez, Carlos and Xavier Martorell and Francesco Montefoschi and David Oro and Dionisis Pnevmatikatos and Antonio Rizzo and Paolo Gai and Stefano Garzarella and Bruno Morelli and Alberto Pomella and Roberto Giorgi},
url = {https://www.researchgate.net/publication/317332095_The_AXIOM_Platform_for_Next-generation_Cyber_Physical_Systems},
doi = {10.1016/j.micpro.2017.05.018},
year = {2017},
date = {2017-06-03},
journal = {Microprocessors and Microsystems},
publisher = {Elsevier B.V.},
abstract = {Cyber-Physical Systems (CPSs) are widely used in many applications that require interactions between humans and their physical environment. These systems usually integrate a set of hardware-software components for optimal application execution in terms of performance and energy consumption. The AXIOM project (Agile, eXtensible, fast I/O Module), presented in this paper, proposes a hardware-software platform for CPS coupled with an easy parallel programming model and sufficient connectivity so that the performance can scale-up by adding multiple boards. AXIOM supports a task-based programming model based on OmpSs and leverages a high-speed, inexpensive communication interface called AXIOM-Link. The board also tightly couples the CPU with reconfigurable resources to accelerate portions of the applications. As case studies, AXIOM uses smart video surveillance, and smart home living applications.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Wagner, Michael; Llort, Germán; Filgueras, Antonio; Jiménez-González, Daniel; Servat, Harald; Teruel, Xavier; Mercadal, Estanislao; Álvarez, Carlos; Giménez, Judit; Martorell, Xavier; Ayguadé, Eduard; Labarta, Jesús
Monitoring Heterogeneous Applications with the OpenMP Tools Interface Proceedings Article
In: Tools for High Performance Computing 2016, pp. 41-57, Springer, 2017.
@inproceedings{Wagner2017,
title = {Monitoring Heterogeneous Applications with the OpenMP Tools Interface},
author = {Michael Wagner and Germ\'{a}n Llort and Antonio Filgueras and Daniel Jim\'{e}nez-Gonz\'{a}lez and Harald Servat and Xavier Teruel and Estanislao Mercadal and Carlos \'{A}lvarez and Judit Gim\'{e}nez and Xavier Martorell and Eduard Ayguad\'{e} and Jes\'{u}s Labarta},
url = {https://link.springer.com/chapter/10.1007/978-3-319-56702-0_3},
doi = {10.1007/978-3-319-56702-0_3},
year = {2017},
date = {2017-05-09},
booktitle = {Tools for High Performance Computing 2016},
pages = {41--57},
publisher = {Springer},
abstract = {Heterogeneous systems are gaining more importance in supercomputing, yet they are challenging to program and developers require support tools to understand how well their accelerated codes perform and how they can be improved. The OpenMP Tools Interface (OMPT) is a new performance monitoring interface that is being considered for integration into the OpenMP standard. OMPT allows monitoring the execution of heterogeneous OpenMP applications by revealing the activity of the runtime through a standardized API as well as facilitating the exchange of performance information between devices with accelerated codes, and the analysis tool. In this paper we describe our efforts implementing parts of the OMPT specification necessary to monitor accelerators. In particular, the integration of the OMPT features to our parallel runtime system and instrumentation framework helps to obtain detailed performance information about the execution of the accelerated tasks issued to the devices to allow an insightful analysis. As a result of this analysis, the parallel runtime of the programming model has been improved. We focus on the evaluation of monitoring FPGA devices studying the performance of a common kernel in scientific algorithms: matrix multiplication. Nonetheless, this development is as well applicable to monitor GPU accelerators and Intel\textregistered{} Xeon Phi\texttrademark{} co-processors operating under the OmpSs programming model.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2016
Rizzo, Antonio; Burresi, Giovanni; Montefoschi, Francesco; Caporali, Maurizio; Giorgi, Roberto
Making IoT with UDOO Journal Article
In: Interaction Design and Architecture(s), vol. 1, no. 30, pp. 95-112, 2016, ISSN: 1826-9745.
@article{Rizzo2016,
title = {Making IoT with UDOO},
author = {Antonio Rizzo and Giovanni Burresi and Francesco Montefoschi and Maurizio Caporali and Roberto Giorgi},
editor = {{Scuola IaD}},
url = {http://www.mifav.uniroma2.it/inevent/events/idea2010/doc/30_6.pdf},
issn = {1826-9745},
year = {2016},
date = {2016-12-01},
journal = {Interaction Design and Architecture(s)},
volume = {1},
number = {30},
pages = {95--112},
abstract = {The advent of massively interconnected objects, devices, and sensors raises equally substantial challenges regarding the resources that will allow makers to manage the complexity of such systems and to exploit the opportunities such technologies open up. Simplicity in management and a smooth, creative integration of everyday life objects empowered by digital technology in our own environment are two key factors for a successful penetration of Internet of Things (IoT). We present UDOO IoT, a combined set of open hardware (UDOO Quad, Blu and Bricks) and open software (UAPPI, an extension of MIT App Inventor) technologies that allow novices from their early steps in the makers world to create their own digital objects connected to the cloud, easily defining custom behavior logic for sensors and actuators. UDOO IoT is illustrated through one of the field studies carried out along its design process.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Giorgi, Roberto; Mazumdar, Somnath; Viola, Stefano; Gai, Paolo; Garzarella, Stefano; Morelli, Bruno; Pnevmatikatos, Dionisios; Theodoropoulos, Dimitris; Alvarez, Carlos; Ayguade, Eduard; Bueno, Javier; Filgueras, Antonio; Jimenez-Gonzalez, Daniel; Martorell, Xavier
Modeling Multi-Board Communication in the AXIOM Cyber-Physical System Journal Article
In: Ada User Journal, vol. 37, no. 4, pp. 228-235, 2016, ISSN: 1381-6551.
@article{Giorgi2016,
title = {Modeling Multi-Board Communication in the AXIOM Cyber-Physical System},
author = {Giorgi, Roberto and Mazumdar, Somnath and Viola, Stefano and Gai, Paolo and Garzarella, Stefano and Morelli, Bruno and Pnevmatikatos, Dionisios and Theodoropoulos, Dimitris and Alvarez, Carlos and Ayguade, Eduard and Bueno, Javier and Filgueras, Antonio and Jimenez-Gonzalez, Daniel and Martorell, Xavier},
issn = {1381-6551},
year = {2016},
date = {2016-12-01},
journal = {Ada User Journal},
volume = {37},
number = {4},
pages = {228--235},
abstract = {The main goal of the AXIOM project is to design a small board that could be used as a LEGO\textsuperscript{TM}-style module to build systems with more performance while keeping the programming task simple by using a familiar shared-memory programming model. The interconnection plays a crucial role both for the need of providing fast and reliable communication (including lossless control flow as, e.g., Infiniband, but with a simplified scope and cost). In this paper, we outline some of our initial choices and explore the performance of RDMA based mechanisms and interfaces, including the remote memory management behind the programming model. Our initial results show a potential for scaling the system as we use DF-Threads, good bandwidth for RDMA transfers, promising to scale once we use the OmpSs, programming model.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
as a LEGO™-style module to build systems with more performance
while keeping the programming task simple by using a familiar shared-memory
programming model. The interconnection plays a crucial role both for the need of providing
fast and reliable communication (including lossless control flow
as, e.g., Infiniband, but with a simplified scope and cost). In this paper, we outline some of our initial choices and explore the performance of RDMA based mechanisms and interfaces, including the remote memory management behind the programming model. Our initial results show a potential for scaling the system as we use DF-Threads, good bandwidth for RDMA transfers, promising to scale once we use the OmpSs, programming model.
Giorgi, Roberto
Exploring Future Many-Core Architectures: The TERAFLUX Evaluation Framework Book Chapter
In: vol. Advances in Computers (ADV COMPUT), Elsevier, 2016, ISSN: 0065-2458.
@incollection{Giorgi2016c,
title = {Exploring Future Many-Core Architectures: The TERAFLUX Evaluation Framework},
author = {Roberto Giorgi},
doi = {10.1016/bs.adcom.2016.09.002},
issn = {0065-2458},
year = {2016},
date = {2016-10-01},
booktitle = {Advances in Computers},
publisher = {Elsevier},
abstract = {The design of new computer systems always requires a strong simulation effort in order to evaluate different design options. This is especially true if the system is to be produced at a date far in the future, such as in the case of TERAFLUX, a system aimed at containing something like $10^{12}$ (1 TERA) transistors in a single package or a (multilayer) chip by 2020. At the basis of a TERAFLUX system, a dataflow execution model supports the execution of threads. In order to explore the design space, TERAFLUX provides an appropriate evaluation framework, at the scale of at least 1000 general purpose cores on a single chip. Predicting the performance of such a next-generation platform is not a trivial task. Today, no software-based tool exists that can provide cycle-level full-system simulation and faithfully predict the behavior of 1000 general-purpose cores, in an acceptable amount of time and with reasonable accuracy, while providing the flexibility of changing the execution model at the architectural level. A solid evaluation framework represents an important base for exploring future many cores. In this chapter, different options for simulating a 1000 general-purpose-core system are explored. Finally, we show the setup that successfully allowed us to evaluate our 1000 core target while running a full-system Linux operating system.},
keywords = {},
pubstate = {published},
tppubtype = {incollection}
}
Giorgi, Roberto; Bettin, Nicola; Gai, Paolo; Martorell, Xavier; Rizzo, Antonio
AXIOM: A Flexible Platform for the Smart Home Book Chapter
In: Keramidas, Georgios; Voros, Nikolaos; Hübner, Michael (Ed.): Components and Services for IoT Platforms: Paving the Way for IoT Standards, pp. 57-74, Springer International Publishing, Cham, 2016, ISBN: 978-3-319-42304-3.
@incollection{Giorgi2016b,
title = {AXIOM: A Flexible Platform for the Smart Home},
author = {Giorgi, Roberto and Bettin, Nicola and Gai, Paolo and Martorell, Xavier and Rizzo, Antonio},
editor = {Keramidas, Georgios and Voros, Nikolaos and H\"{u}bner, Michael},
url = {http://dx.doi.org/10.1007/978-3-319-42304-3_3},
doi = {10.1007/978-3-319-42304-3_3},
isbn = {978-3-319-42304-3},
year = {2016},
date = {2016-09-24},
booktitle = {Components and Services for IoT Platforms: Paving the Way for IoT Standards},
pages = {57--74},
publisher = {Springer International Publishing},
address = {Cham},
abstract = {The AXIOM hardware/software platform aims at bringing easy programmability on top of a cluster of processors by using a fast interconnect and FPGA as a basis for building a scalable embedded system. The Smart Home is one of the key scenarios in which AXIOM could be useful for the Internet-of-Things (IoT). In Smart Homes, everything is linked to the flow of information that from the on the field devices needs to arrive to the cloud servers. The information sensed in the environment will not be transmitted as is to the higher layers, but is somehow interpreted to provide a synthetic light-weight representation of the environment. In such a scenario, it is then clear that there is a need for peripheral nodes as well as intermediate gateways which needs to be able to perform high-performance computational loads. AXIOM provides the possibility of designing a cluster of low-power/low-budget boards, which could be packed inside a high-performance embedded low-cost product. The AXIOM boards are heterogeneous, thus allowing for even greater diversity which is needed in those kind of IoT scenarios. The cluster itself can then be integrated inside the IoT architectures as computational-power node, which could be the center of a distributed intelligence near the edges of the IoT network.},
keywords = {},
pubstate = {published},
tppubtype = {incollection}
}
Llort, Germán; Filgueras, Antonio; Jiménez-González, Daniel; Servat, Harald; Teruel, Xavier; Mercadal, Estanislao; Álvarez, Carlos; Giménez, Judit; Martorell, Xavier; Ayguadé, Eduard; Labarta, Jesús
The Secrets of the Accelerators Unveiled: Tracing Heterogeneous Executions Through OMPT Proceedings
Springer International Publishing, vol. OpenMP: Memory, Devices and Tasks, 2016.
@inproceedings{Llort2016,
title = {The Secrets of the Accelerators Unveiled: Tracing Heterogeneous Executions Through OMPT},
author = {Germ\'{a}n Llort and Antonio Filgueras and Daniel Jim\'{e}nez-Gonz\'{a}lez and Harald Servat and Xavier Teruel and Estanislao Mercadal and Carlos \'{A}lvarez and Judit Gim\'{e}nez and Xavier Martorell and Eduard Ayguad\'{e} and Jes\'{u}s Labarta},
url = {https://link.springer.com/chapter/10.1007/978-3-319-45550-1_16},
doi = {10.1007/978-3-319-45550-1_16},
year = {2016},
date = {2016-09-21},
booktitle = {OpenMP: Memory, Devices and Tasks},
publisher = {Springer International Publishing},
abstract = {Heterogeneous systems are an important trend in the future of supercomputers, yet they can be hard to program and developers still lack powerful tools to gain understanding about how well their accelerated codes perform and how to improve them. Having different types of hardware accelerators available, each with their own specific low-level APIs to program them, there is not yet a clear consensus on a standard way to retrieve information about the accelerator's performance. To improve this scenario, OMPT is a novel performance monitoring interface that is being considered for integration into the OpenMP standard. OMPT allows analysis tools to monitor the execution of parallel OpenMP applications by providing detailed information about the activity of the runtime through a standard API. For accelerated devices, OMPT also facilitates the exchange of performance information between the runtime and the analysis tool. We implement part of the OMPT specification that refers to the use of accelerators both in the Nanos++ parallel runtime system and the Extrae tracing framework, obtaining detailed performance information about the execution of the tasks issued to the accelerated devices to later conduct insightful analysis. Our work extends previous efforts in the field to expose detailed information from the OpenMP and OmpSs runtimes, regarding the activity and performance of task-based parallel applications. In this paper, we focus on the evaluation of FPGA devices studying the performance of two common kernels in scientific algorithms: matrix multiplication and Cholesky decomposition. Furthermore, this development is seamlessly applicable for the analysis of GPGPU accelerators and Intel\textregistered{} Xeon Phi\texttrademark{} co-processors operating under the OmpSs programming model.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Having different types of hardware accelerators available, each with their own specific low-level APIs to program them, there is not yet a clear consensus on a standard way to retrieve information about the accelerator’s performance. To improve this scenario, OMPT is a novel performance monitoring interface that is being considered for integration into the OpenMP standard. OMPT allows analysis tools to monitor the execution of parallel OpenMP applications by providing detailed information about the activity of the runtime through a standard API. For accelerated devices, OMPT also facilitates the exchange of performance information between the runtime and the analysis tool. We implement part of the OMPT specification that refers to the use of accelerators both in the Nanos++ parallel runtime system and the Extrae tracing framework, obtaining detailed performance information about the execution of the tasks issued to the accelerated devices to later conduct insightful analysis.
Our work extends previous efforts in the field to expose detailed information from the OpenMP and OmpSs runtimes, regarding the activity and performance of task-based parallel applications. In this paper, we focus on the evaluation of FPGA devices studying the performance of two common kernels in scientific algorithms: matrix multiplication and Cholesky decomposition. Furthermore, this development is seamlessly applicable for the analysis of GPGPU accelerators and Intel® Xeon PhiTM co-processors operating under the OmpSs programming model.
Mazumdar, Somnath; Ayguade, Eduard; Bettin, Nicola; Bueno, Javier; Ermini, Sara; Filgueras, Antonio; Jimenez-Gonzalez, Daniel; Martinez, Alvarez; Martorell, Xavier; Montefoschi, Francesco; Oro, David; Pnevmatikatos, Dionisis; Rizzo, Antonio; Theodoropoulos, Dimitris; Giorgi, Roberto
AXIOM: A Hardware-Software Platform for Cyber Physical Systems Journal Article
In: pp. 539–546, 2016, ISBN: 978-1-5090-2817-7.
@inproceedings{Mazumdar2016,
title = {AXIOM: A Hardware-Software Platform for Cyber Physical Systems},
author = {Mazumdar, Somnath and Ayguade, Eduard and Bettin, Nicola and
Bueno, Javier and Ermini, Sara and Filgueras, Antonio and
Jimenez-Gonzalez, Daniel and Martinez, Alvarez and Martorell, Xavier and
Montefoschi, Francesco and Oro, David and Pnevmatikatos, Dionisis and
Rizzo, Antonio and Theodoropoulos, Dimitris and Giorgi, Roberto},
doi = {10.1109/DSD.2016.80},
isbn = {978-1-5090-2817-7},
year = {2016},
date = {2016-09-07},
booktitle = {Euromicro Conference on Digital System Design (DSD 2016)},
pages = {539--546},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Theodoropoulos, Dimitris; Pnevmatikatos, Dionisis; Garzarella, Stefano; Gai, Paolo; Rizzo, Antonio; Giorgi, Roberto
AXIOM: enabling parallel processing in cyber-physical systems. Proceedings Article
In: International Conference on Field-Programmable Logic and Applications, 2016.
@inproceedings{Theodoropoulos2016,
title = {AXIOM: enabling parallel processing in cyber-physical systems},
author = {Dimitris Theodoropoulos and Dionisis Pnevmatikatos and Stefano Garzarella and Paolo Gai and Antonio Rizzo and Roberto Giorgi},
url = {http://fplwrc2016.cit-ec.uni-bielefeld.de/files/Dionisios_Pnevmatikatos.pdf},
year = {2016},
date = {2016-09-01},
booktitle = {International Conference on Field-Programmable Logic and Applications},
abstract = {The AXIOM project focuses on developing an affordable CPS node that features general purpose capability coupled with reconfigurable resources. The nodes will be interconnected and a programming layer will turn them into a parallel processing system. The programming layer also makes easier the use of the reconfigurable resources for accelerators.
Harnessing the combined CPS resources enables a new level of “edge” processing. We will focus on the interconnection and modularity aspects of the project, and present the current status and the challenges we are facing mainly in performance and efficiency.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Harnessing the combined CPS resources enables a new level of “edge” processing. We will focus on the interconnection and modularity aspects of the project, and present the current status and the challenges we are facing mainly in performance and efficiency.
Alvarez, Carlos; Ayguade, Eduard; Bosch, Jaume; Bueno, Javier; Cherkashin, Artem; Filgueras, Antonio; Jimenez-Gonzalez, Daniel; Martorell, Xavier; Navarro, Nacho; Vidal, Miquel; Theodoropoulos, Dimitris; Pnevmatikatos, Dionisios N.; Catani, Davide; Oro, David; Fernandez, Carles; Segura, Carlos; Rodriguez, Javier; Hernando, Javier; Scordino, Claudio; Gai, Paolo; Passera, Pierluigi; Pomella, Alberto; Bettin, Nicola; Rizzo, Antonio; Giorgi, Roberto
The AXIOM Software Layers Journal Article
In: ELSEVIER Microprocessors and Microsystems, 2016, ISSN: 0141-9331.
@article{Alvarez2016,
title = {The AXIOM Software Layers},
author = {Carlos Alvarez and Eduard Ayguade and Jaume Bosch and Javier Bueno and Artem Cherkashin and Antonio Filgueras and Daniel Jimenez-Gonzalez and Xavier Martorell and Nacho Navarro and Miquel Vidal and Dimitris Theodoropoulos and Dionisios N. Pnevmatikatos and Davide Catani and David Oro and Carles Fernandez and Carlos Segura and Javier Rodriguez and Javier Hernando and Claudio Scordino and Paolo Gai and Pierluigi Passera and Alberto Pomella and Nicola Bettin and Antonio Rizzo and Roberto Giorgi},
url = {http://www.sciencedirect.com/science/article/pii/S0141933116300850},
doi = {10.1016/j.micpro.2016.07.002},
issn = {0141-9331},
year = {2016},
date = {2016-07-09},
journal = {Microprocessors and Microsystems},
publisher = {Elsevier},
abstract = {People and objects will soon share the same digital network for information exchange in a world named as the age of the cyber-physical systems. The general expectation is that people and systems will interact in real-time. This poses pressure onto systems design to support increasing demands on computational power, while keeping a low power envelop. Additionally, modular scaling and easy programmability are also important to ensure these systems to become widespread. The whole set of expectations impose scientific and technological challenges that need to be properly addressed. The AXIOM project (Agile, eXtensible, fast I/O Module) will research new hardware/software architectures for cyber-physical systems to meet such expectations. The technical approach aims at solving fundamental problems to enable easy programmability of heterogeneous multi-core multi-board systems. AXIOM proposes the use of the task-based OmpSs programming model, leveraging low-level communication interfaces provided by the hardware. Modular scalability will be possible thanks to a fast interconnect embedded into each module. To this aim, an innovative ARM and FPGA-based board will be designed},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Giorgi, Roberto
Exploring Dataflow-based Thread Level Parallelism in Cyber-physical Systems Proceedings Article
In: pp. 295-300, ACM, New York, NY, USA, 2016, ISBN: 978-1-4503-4128-8.
@inproceedings{Giorgi16c.bib,
title = {Exploring Dataflow-based Thread Level Parallelism in Cyber-physical Systems},
author = {Giorgi, Roberto},
url = {http://doi.acm.org/10.1145/2903150.2906829},
doi = {10.1145/2903150.2906829},
isbn = {978-1-4503-4128-8},
year = {2016},
date = {2016-05-16},
booktitle = {Proceedings of the ACM International Conference on Computing Frontiers},
pages = {295--300},
publisher = {ACM},
address = {New York, NY, USA},
series = {CF '16},
abstract = {Smart Cyber-Physical Systems (SCPS) aim not only at integrating computational platforms and physical processes, but also at creating larger "systems of systems" capable of satisfying multiple critical constraints such as energy efficiency, high-performance, safety, security, size and cost.
The AXIOM project aims at designing such systems by focusing on low-cost Single Board Computers (SBC), based on current System-on-Chips (SoC) that include both programmable logic (FPGA), multi-core CPUs, accelerators and peripherals. A dataflow execution model, partially developed in the TERAFLUX project, brings a more predictable and reliable execution.
The goals of AXIOM include: i) the possibility to easily program the system with a shared-memory model based on OmpSs; ii) the possibility of scaling up the system through a custom but inexpensive interconnect; iii) the possibility of accelerating a specific function on a single or multiple FPGAs of the system.
The dataflow execution model operates at thread-level granularity. In this paper the AXIOM execution model and the related memory model is further detailed. The memory model is key for the execution of threads while reducing the need of data transfers. The preliminary results confirm the scalability of this model.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
The AXIOM project aims at designing such systems by focusing on low-cost Single Board Computers (SBC), based on current System-on-Chips (SoC) that include both programmable logic (FPGA), multi-core CPUs, accelerators and peripherals. A dataflow execution model, partially developed in the TERAFLUX project, brings a more predictable and reliable execution.
The goals of AXIOM include: i) the possibility to easily program the system with a shared-memory model based on OmpSs; ii) the possibility of scaling up the system through a custom but inexpensive interconnect; iii) the possibility of accelerating a specific function on a single or multiple FPGAs of the system.
The dataflow execution model operates at thread-level granularity. In this paper the AXIOM execution model and the related memory model is further detailed. The memory model is key for the execution of threads while reducing the need of data transfers. The preliminary results confirm the scalability of this model.
Scordino, Claudio; Morelli, Bruno
Sharing memory in modern distributed applications Proceedings
2016, ISBN: 978-1-4503-3739-7.
@inproceedings{Scordino2016,
title = {Sharing memory in modern distributed applications},
author = {Claudio Scordino and Bruno Morelli},
doi = {10.1145/2851613.2851950},
isbn = {978-1-4503-3739-7},
year = {2016},
date = {2016-04-04},
booktitle = {31st Annual ACM Symposium on Applied Computing},
pages = {1918--1921},
abstract = {Traditionally, research on software distributed shared memory has been focused on optimizing the memory consistency models and the coherence protocols rather than the underlying run-time mechanisms. Thus, these systems have been often implemented on top of the paging functionalities offered by the operating system. This approach, however, introduces performance issues due to false sharing of data.
We propose an object-based approach that leverages the features of modern object-oriented programming to intercept single operations on data, hiding the underlying run-time mechanism. A possible implementation using the standard C++ programming language is shown and discussed.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
We propose an object-based approach that leverages the features of modern object-oriented programming to intercept single operations on data, hiding the underlying run-time mechanism. A possible implementation using the standard C++ programming language is shown and discussed.
Verdoscia, Lorenzo; Giorgi, Roberto
A Data-Flow Soft-Core Processor for Accelerating Scientific Calculation on FPGAs Journal Article
In: Mathematical Problems in Engineering, vol. 2016, no. 1, pp. 1-21, 2016, ISSN: 1563-5147.
@article{Verdoscia2016,
title = {A Data-Flow Soft-Core Processor for Accelerating Scientific Calculation on FPGAs},
author = {Verdoscia, Lorenzo and Giorgi, Roberto},
url = {http://www.hindawi.com/journals/mpe/2016/3190234/cta/},
doi = {10.1155/2016/3190234},
issn = {1563-5147},
year = {2016},
date = {2016-04-01},
journal = {Mathematical Problems in Engineering},
volume = {2016},
number = {1},
pages = {1--21},
abstract = {We present a new type of soft-core processor called the “Data-Flow Soft-Core” that can be implemented through FPGA technology with adequate interconnect resources. This processor provides data processing based on data-flow instructions rather than control flow instructions. As a result, during an execution on the accelerator of the Data-Flow Soft-Core, both partial data and instructions are eliminated as traffic for load and store activities. Data-flow instructions serve to describe a program and to dynamically change the context of a data-flow program graph inside the accelerator, on-the-fly. Our proposed design aims at combining the performance of a fine-grained data-flow architecture with the flexibility of reconfiguration, without requiring a partial reconfiguration or new bit-stream for reprogramming it. The potential of the data-flow implementation of a function or functional program can be exploited simply by relying on its description through the data-flow instructions that reprogram the Data-Flow Soft-Core. Moreover, the data streaming process will mirror those present in other FPGA applications. Finally, we show the advantages of this approach by presenting two test cases and providing the quantitative and numerical results of our evaluations.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Burgio, Paolo; Alvarez, Carlos; Ayguadé, Eduard; Filgueras, Antonio; Jiménez-González, Daniel; Martorell, Xavier; Navarro, Nacho; Giorgi, Roberto
Simulating next-generation Cyber-physical computing platforms Journal Article
In: Ada User Journal, vol. 37, no. 1, pp. 59-63, 2016, ISSN: 1381-6551, (TO APPEAR).
@article{Burgio2016,
title = {Simulating next-generation Cyber-physical computing platforms},
author = {Paolo Burgio and Carlos Alvarez and Eduard Ayguad\'{e} and Antonio Filgueras and Daniel Jim\'{e}nez-Gonz\'{a}lez and Xavier Martorell and Nacho Navarro and Roberto Giorgi},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-84974555745\&partnerID=40\&md5=934acb4936e9317c382f4ce6fdce40f1},
issn = {1381-6551},
year = {2016},
date = {2016-03-01},
journal = {Ada User Journal},
volume = {37},
number = {1},
pages = {59--63},
abstract = {In specific domains, such as cyber-physical systems, platforms are quickly evolving to include multiple (many-) cores and programmable logic in a single system-on-chip, while including interfaces to commodity sensors/actuators. Programmable Logic (e.g., FPGA) allows for greater flexibility and dependability. However, the task of extracting the performance/watt potential of heterogeneous many-cores is often demanded at the application level, and this has strong implication on the HW/SW co-design process. Enabling fast prototyping of a board being designed is paramount to enable low time-to-market for applications running on it, and ultimately, for the whole platform: programmers must be provided with accurate hardware models, to support the software development cycle at the very early stages of the design process. Virtual platforms fulfill this need, providing that they can be in turn efficiently developed and tested in a few months timespan. In this position paper we will share our experience in the sphere of the AXIOM project, identifying key properties that virtual platforms modeling next-generation cyber-physical systems should have to quickly enable simulation-based software development for these platforms.},
note = {TO APPEAR},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Logic (e.g., FPGA) allows for greater flexibility and dependability. However, the task of extracting the performance/watt potential of heterogeneous many-cores is often demanded at the application level, and this has strong implication on the HW/SW co-design process. Enabling fast prototyping of a board being designed is paramount to enable low time-to-market for applications running on it, and ultimately, for the whole platform: programmers must be provided with accurate hardware models, to support the software development cycle at the very early stages of the design process. Virtual platforms fulfill this need, providing that they can be in turn efficiently developed and tested in a few months timespan. In this position paper we will share our experience in the sphere of the AXIOM project, identifying key properties that virtual platforms modeling next-generation cyber-physical systems should have to quickly enable simulation-based software development for these platforms.
Mazumdar, Somnath; Giorgi, Roberto
A Survey on Hardware and Software Support for Thread Level Parallelism Journal Article
In: 2016.
@article{Mazumdar2016b,
title = {A Survey on Hardware and Software Support for Thread Level Parallelism},
author = {Somnath Mazumdar and Roberto Giorgi},
url = {https://arxiv.org/abs/1603.09274},
eprint = {1603.09274},
eprinttype = {arXiv},
year = {2016},
date = {2016-03-01},
abstract = {To support growing massive parallelism, functional components and also the capabilities of current processors are changing and continue to do so. Today's computers are built upon multiple processing cores and run applications consisting of a large number of threads, making runtime thread management a complex process. Further, each core can support multiple, concurrent thread execution. Hence, hardware and software support for threads is more and more needed to improve peak-performance capacity, overall system throughput, and has therefore been the subject of much research. This paper surveys, many of the proposed or currently available solutions for executing, distributing and managing threads both in hardware and software. The nature of current applications is diverse. To increase the system performance, all programming models may not be suitable to harness the built-in massive parallelism of multicore processors. Due to the heterogeneity in hardware, hybrid programming model (which combines the features of shared and distributed model) currently has become very promising. In this paper, first, we have given an overview of threads, threading mechanisms and its management issues during execution. Next, we discuss about different parallel programming models considering to their explicit thread support. We also review the programming models with respect to their support to shared-memory, distributed-memory and heterogeneity. Hardware support at execution time is very crucial to the performance of the system, thus different types of hardware support for threads also exist or have been proposed, primarily based on widely used programming models. We also further discuss on software support for threads, to mainly increase the deterministic behavior during runtime. Finally, we conclude the paper by discussing some common issues related to the thread management.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
A Survey on Hardware and Software Support for Thread Level Parallelism | Request PDF. Available from: https://www.researchgate.net/publication/301879025_A_Survey_on_Hardware_and_Software_Support_for_Thread_Level_Parallelism [accessed Feb 19 2018].
2015
Giorgi, R.; Scionti, A.
A scalable thread scheduling co-processor based on data-flow principles Journal Article
In: vol. 53, pp. 100–108, 2015, ISSN: 0167-739X.
@article{Giorgi15a.bib,
title = {A scalable thread scheduling co-processor based on data-flow principles},
author = {Giorgi, Roberto and Scionti, Alberto},
url = {http://www.sciencedirect.com/science/article/pii/S0167739X1400274X},
doi = {10.1016/j.future.2014.12.014},
issn = {0167-739X},
year = {2015},
date = {2015-12-01},
journal = {Future Generation Computer Systems},
volume = {53},
pages = {100--108},
abstract = {Large synchronization and communication overhead will become a major concern in future extreme-scale machines (e.g., {HPC} systems, supercomputers). These systems will push upwards performance limits by adopting chips equipped with one order of magnitude more cores than today. Alternative execution models can be explored in order to exploit the high parallelism offered by future massive many-core chips. This paper proposes the integration of standard cores with dedicated co-processing units that enable the system to support a fine-grain data-flow execution model developed within the {TERAFLUX} project. An instruction set architecture extension for supporting fine-grain thread scheduling and execution is proposed. This instruction set extension is supported by the co-processor that provides hardware units for accelerating thread scheduling and distribution among the available cores. Two fundamental aspects are at the base of the proposed system: the programmers can adopt their preferred programming model, and the compilation tools can produce a large set of threads mainly communicating in a producer--consumer fashion, hence enabling data-flow execution. Experimental results demonstrate the feasibility of the proposed approach and its capability of scaling with the increasing number of cores.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Giorgi, Roberto
Scalable Embedded Systems: Towards the Convergence of High-Performance and Embedded Computing Proceedings Article
In: Proceedings of the 13th IEEE/IFIP International Conference on Embedded and Ubiquitous Computing (EUC 2015), 2015.
@inproceedings{Giorgi15d,
title = {Scalable Embedded Systems: Towards the Convergence of High-Performance and Embedded Computing},
author = {Roberto Giorgi},
url = {http://www.axiom-project.eu/wp-content/uploads/2016/03/EUC15.pdf},
year = {2015},
date = {2015-10-20},
booktitle = {Proceedings of the 13th IEEE/IFIP International Conference on Embedded and Ubiquitous Computing (EUC 2015)},
abstract = {Embedded System toolchains are highly customized for a specific System-on-Chip (SoC). When the application needs more performance, the designer is typically forced to adopt a new SoC and possibly another toolchain. The rationale for not scaling performance by using, e.g., two SoCs, is that maintaining most of the operations on-chip may allow for higher energy efficiency. We are exploring the feasibility and trade-offs of designing and manufacturing a new Single Board Computer (SBC) that could serve flexibly for a number of current and future applications, by allowing scalability through clusters of SBCs while keeping the same programming model for the SBC. This board is based on FPGAs and embedded processors, and its key points are: i) a fast custom interconnect for board-to-board communication and ii) an easily programmable environment which would allow both the off-loading of code into accelerators (either soft-IP blocks or hard-IP blocks) and, at the same time, the distribution of computation across boards. A key challenge to successfully deploying this paradigm is to properly distribute the threads across several boards without the explicit intervention of the programmer. In this paper we describe how to dynamically and efficiently distribute the computational threads in symbiosis with an appropriate memory model to allow the system scalability, so that we can double the performance by simply connecting two boards without i) changing the basic hardware components (e.g., to a different System-On-Chip) and ii) changing the programming model to follow the vendor specific toolchain. Our approach is to reduce data movement across boards. Our initial experiments have confirmed the feasibility of our approach.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Jimenez-Gonzalez, Daniel; Alvarez-Martinez, Carlos; Filgueras, Antonio; Martorell, Xavier; Langer, Jan; Noguera, Juanjo; Vissers, Kees
Coarse-Grain Performance Estimator for Heterogeneous Parallel Computing Architectures like Zynq All-Programmable SoC Journal Article
In: Second International Workshop on FPGAs for Software Programmers FSP 2015, vol. CoRR, 2015.
@inproceedings{Jimenez-Gonzalez2015,
title = {Coarse-Grain Performance Estimator for Heterogeneous Parallel Computing Architectures like Zynq All-Programmable SoC},
author = {Daniel Jimenez-Gonzalez and Carlos Alvarez-Martinez and Antonio Filgueras and Xavier Martorell and Jan Langer and Juanjo Noguera and Kees Vissers},
url = {http://www.axiom-project.eu/wp-content/uploads/2016/03/Coarse-Grain-Performance-Estimator-for-2.pdf},
year = {2015},
date = {2015-08-27},
booktitle = {Second International Workshop on FPGAs for Software Programmers (FSP 2015)},
note = {CoRR},
abstract = {Heterogeneous computing is emerging as a mandatory requirement for power-efficient system design. With this aim, modern heterogeneous platforms like Zynq All-Programmable SoC, that integrates ARM-based SMP and programmable logic, have been designed. However, those platforms introduce large design cycles consisting on hardware/software partitioning, decisions on granularity and number of hardware accelerators, hardware/software integration, bitstream generation, etc.
This paper presents a performance parallel heterogeneous estimation for systems where hardware/software co-design and run-time heterogeneous task scheduling are key. The results show that the programmer can quickly decide, based only on her/his OmpSs (OpenMP + extensions) application, which is the co-design that achieves nearly optimal heterogeneous parallel performance, based on the methodology presented and considering only synthesis estimation results. The methodology presented reduces the programmer co-design decision from hours to minutes and shows high potential on hardware/software heterogeneous parallel performance estimation on the Zynq All-Programmable SoC.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
This paper presents a performance parallel heterogeneous estimation for systems where hardware/software co-design and run-time heterogeneous task scheduling are key. The results show that the programmer can quickly decide, based only on her/his OmpSs (OpenMP + extensions) application, which is the co-design that achieves nearly optimal heterogeneous parallel performance, based on the methodology presented and considering only synthesis estimation results. The methodology presented reduces the programmer co-design decision from hours to minutes and shows high potential on hardware/software heterogeneous parallel performance estimation on the Zynq All-Programmable SoC.
Alvarez, Carlos; Ayguade, Eduard; Bueno, Javier; Filgueras, Antonio; Jimenez-Gonzalez, Daniel; Martorell, Xavier; Navarro, Nacho; Theodoropoulos, Dimitris; Pnevmatikatos, Dionisios; Catani, Davide; Scordino, Claudio; Gai, Paolo; Segura, Carlos; Fernandez, Carles; Oro, David; Rodriguez-Saeta, Javier; Passera, Pierluigi; Pomella, Alberto; Rizzo, Antonio; Giorgi, Roberto
The AXIOM Software Layers Journal Article
In: DSD 2015, 18th Euromicro Conference on Digital Systems Design (DSD), 2015.
@inproceedings{Alvarez2015,
title = {The AXIOM Software Layers},
author = {Carlos Alvarez and Eduard Ayguade and Javier Bueno and Antonio Filgueras and Daniel Jimenez-Gonzalez and Xavier Martorell and Nacho Navarro and Dimitris Theodoropoulos and Dionisios Pnevmatikatos and Davide Catani and Claudio Scordino and Paolo Gai and Carlos Segura and Carles Fernandez and David Oro and Javier Rodriguez-Saeta and Pierluigi Passera and Alberto Pomella and Antonio Rizzo and Roberto Giorgi},
doi = {10.1109/DSD.2015.52},
year = {2015},
date = {2015-08-26},
booktitle = {DSD 2015, 18th Euromicro Conference on Digital Systems Design (DSD)},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Mondelli, Andrea; Ho, Nam; Scionti, Alberto; Solinas, Marco; Portero, Antoni; Giorgi, Roberto
Dataflow Support in x86_64 Multicore Architectures through Small Hardware Extensions Conference
2015.
@conference{DBLP:conf/dsd/MondelliHSSPG15,
title = {Dataflow Support in x86_64 Multicore Architectures through Small Hardware Extensions},
author = {Andrea Mondelli and Nam Ho and Alberto Scionti and Marco Solinas and Antoni Portero and Roberto Giorgi},
url = {https://pdfs.semanticscholar.org/5ead/bbc3f37eb79e0251d1f99a0a4c9c1bb169c0.pdf},
doi = {10.1109/DSD.2015.62},
year = {2015},
date = {2015-08-26},
booktitle = {DSD 2015, 18th Euromicro Conference on Digital Systems Design (DSD)},
abstract = {The path towards future high performance computers requires architectures able to efficiently run multi-threaded applications. In this context, dataflow-based execution models can improve the performance by limiting the synchronization overhead, thanks to a simple producer-consumer approach. This paper advocates the ISE of standard cores with a small hardware extension for efficiently scheduling the execution of threads on the basis of dataflow principles. A set of dedicated instructions allow the code to interact with the scheduler. Experimental results demonstrate that, the combination of dedicated scheduling units and a dataflow execution model improve the performance when compared with other techniques for code parallelization (e.g. OpenMP, Cilk).},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Theodoropoulos, Dimitris; Pnevmatikatos, Dionisis; Alvarez, Carlos; Ayguade, Eduard; Bueno, Javier; Filgueras, Antonio; Jimenez-Gonzalez, Daniel; Martorell, Xavier; Navarro, Nacho; Segura, Carlos; Fernandez, Carles; Oro, David; Saeta, Javier Rodriguez; Gai, Paolo; Rizzo, Antonio; Giorgi, Roberto
The AXIOM project (Agile, eXtensible, fast I/O Module) Journal Article
In: International Conference on Embedded Computer Systems: Architectures, Modeling and Simulation - SAMOS XV 2015, 2015.
@inproceedings{Theodoropoulos2015,
title = {The AXIOM project (Agile, eXtensible, fast I/O Module)},
author = {Dimitris Theodoropoulos and Dionisis Pnevmatikatos and Carlos Alvarez and Eduard Ayguade and Javier Bueno and Antonio Filgueras and Daniel Jimenez-Gonzalez and Xavier Martorell and Nacho Navarro and Carlos Segura and Carles Fernandez and David Oro and Javier Rodriguez Saeta and Paolo Gai and Antonio Rizzo and Roberto Giorgi},
url = {http://samos-conference.com/Resources_Samos_Websites/Proceedings_Repository_SAMOS/2015/Files/SS0_03.pdf},
year = {2015},
date = {2015-07-21},
booktitle = {International Conference on Embedded Computer Systems: Architectures, Modeling and Simulation - SAMOS XV 2015},
abstract = {The AXIOM project (Agile, eXtensible, fast I/O Module) aims at researching new software/hardware architectures for the future Cyber-Physical Systems (CPSs). These systems are expected to react in real-time, provide enough computational power for the assigned tasks, consume the least possible energy for such task (energy efficiency), scale up through modularity, allow for an easy programmability across performance scaling, and exploit at best existing standards at minimal costs. Current solutions for providing enough computational power are mainly based on multi- or many-core architectures. For example, some current research projects (such as ADEPT or PSOCRATES) are already investigating how to join efforts from the High-Performance Computing (HPC) and the Embedded Computing domains, which are both focused on high power efficiency, while GPUs and new Dataflow platforms such as Maxeler, or in general FPGAs, are claimed as the most energy efficient. We present the project’s initial approach, ideas and key concepts, and describe the AXIOM preliminary architecture. Our starting point uses power efficient multi-core nodes, such as ARM cores and FPGA accelerators on the same die, as in the Xilinx Zynq. We will work to provide an integrated environment that supports programmability of the parallel, interconnected nodes that form a CPS system, and evaluate our ideas using demanding test application scenarios.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Burresi, Giovanni; Giorgi, Roberto
A Field Experience for a Vehicle Recognition System using Magnetic Sensors Proceedings Article
In: IEEE MECO 2015, pp. 178-181, 2015, ISBN: 978-1-4799-8999-7.
@inproceedings{Burresi15a.bib,
title = {A Field Experience for a Vehicle Recognition System using Magnetic Sensors},
author = {Burresi, Giovanni and Giorgi, Roberto},
url = {http://www.axiom-project.eu/wp-content/uploads/2016/03/A-Field-Experience-for-a-Vehicle-Recognition-System-1.pdf},
doi = {10.1109/MECO.2015.7181897},
isbn = {978-1-4799-8999-7},
year = {2015},
date = {2015-06-14},
booktitle = {IEEE MECO 2015},
pages = {178--181},
abstract = {This paper describes the development and testing of a vehicle recognition prototype based on magnetic sensors. The aim of this research is to design a low cost, low power consumption and simple hardware platform for vehicle recognition. The goal is to recognize four types of vehicles (car, bus, mini-bus or camper) as they run over a set of magnetic sensors. We describe all steps for correct vehicle presence detection, pattern pre-processing, speed and length detection using a combination of an empirical and an analytical method for signal alignment. We collected a set of data regarding these types of vehicles and explain how to differentiate them. Our classification tests reach a confidence factor greater than 91\%.},
keywords = {Hardware;Magnetic fields;Magnetic flux;Magnetic hysteresis;Magnetic sensors;Vehicles;classification;cyber-physical systems;magnetic sensors;traffic monitoring;vehicle recognition},
pubstate = {published},
tppubtype = {inproceedings}
}
Keywords: Hardware; Magnetic fields; Magnetic flux; Magnetic hysteresis; Magnetic sensors; Vehicles; classification; cyber-physical systems; magnetic sensors; traffic monitoring; vehicle recognition
Verdoscia, Lorenzo; Vaccaro, Roberto; Giorgi, Roberto
A matrix multiplier case study for an evaluation of a configurable Dataflow-Machine Proceedings Article
In: ACM CF'15 - LP-EMS, pp. 1-6, 2015, ISBN: 978-1-4503-3358-0.
@inproceedings{Verdoscia15a.bib,
  author    = {Lorenzo Verdoscia and Roberto Vaccaro and Roberto Giorgi},
  title     = {A matrix multiplier case study for an evaluation of a configurable Dataflow-Machine},
  booktitle = {ACM CF'15 - LP-EMS},
  pages     = {1--6},
  year      = {2015},
  date      = {2015-05-18},
  doi       = {10.1145/2742854.2747287},
  isbn      = {978-1-4503-3358-0},
  url       = {http://www.axiom-project.eu/wp-content/uploads/2016/03/A-matrix-multiplier-case-study-for-an-evaluation-of-a-configurable-Dataflow-machine.pdf},
  abstract  = {Configurable computing has become a subject of a great deal of research given its potential to greatly accelerate a wide variety of applications that require high throughput. In this context, the dataflow approach is still promising to accelerate the kernel of applications in the field of HPC. That is thanks to a computational dataflow engine able to execute dataflow program graphs directly in a custom hardware. On the other hand, evaluating radically different models of computation remains yet an open issue. In this paper we present as case study the matrix multiplication that constitutes the fundamental kernel of the linear algebra. The evaluation takes into account the execution of the matrix product both in non-pipelined and pipelined modes. Results obtained running the execution of the two modes on an FPGA-based demonstrator show the validity of the configurable Dataflow-Machine. Moreover, at the same throughput, the power consumption is expected to be lower than in clock-based systems.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Mondelli, Andrea; Ho, Nam; Scionti, Alberto; Solinas, Marco; Portero, Antoni; Giorgi, Roberto
Enhancing an x86_64 Multi-Core Architecture with Data-Flow Execution Support Proceedings Article
In: ACM CF'15, 2015, ISBN: 978-1-4503-3358-0.
@comment{NOTE(review): booktitle is inferred from the ISBN 978-1-4503-3358-0 and the DOI prefix 10.1145/2742854, both shared with entry Verdoscia15a.bib (ACM CF'15); confirm against the publisher record. The former editor value (ACM 2015 Article) was not a person name and was dropped.}
@inproceedings{10.1145/2742854.2742896,
  author    = {Andrea Mondelli and Nam Ho and Alberto Scionti and Marco Solinas and Antoni Portero and Roberto Giorgi},
  title     = {Enhancing an x86\_64 Multi-Core Architecture with Data-Flow Execution Support},
  booktitle = {ACM CF'15},
  year      = {2015},
  date      = {2015-05-06},
  doi       = {10.1145/2742854.2742896},
  isbn      = {978-1-4503-3358-0},
  url       = {http://www.axiom-project.eu/wp-content/uploads/2016/03/Enhancing-an-x86_64-Multi-Core-Architecture-with-1.pdf},
  abstract  = {Future exascale machines will require multi/many-core architectures able to energy-efficiently run multi-threaded applications.
Data-flow execution models have demonstrated to be capable of improving execution performance by limiting the synchronization overhead. This paper proposes to augment cores with a minimalistic set of hardware units and dedicated instructions that allow energy-efficiently scheduling the execution of threads on the basis of data-flow principles. Experimental results show performance improvements of the system when compared with other techniques (e.g., OpenMP, Cilk).},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Data-flow execution models have demonstrated to be capable of improving execution performance by limiting the synchronization overhead. This paper proposes to augment cores with a minimalistic set of hardware units and dedicated instructions that allow energy-efficiently scheduling the execution of threads on the basis of data-flow principles. Experimental results show performance improvements of the system when compared with other techniques (e.g., OpenMP, Cilk).
Giorgi, Roberto
Transactional Memory on a Dataflow Architecture for Accelerating Haskell Journal Article
In: WSEAS Transactions on Computers, vol. 14, pp. 546-558, 2015, ISSN: 1109-2750.
@article{Giorgi15c.bib,
  author    = {Roberto Giorgi},
  title     = {Transactional Memory on a Dataflow Architecture for Accelerating Haskell},
  journal   = {WSEAS Transactions on Computers},
  volume    = {14},
  pages     = {546--558},
  year      = {2015},
  date      = {2015-02-22},
  issn      = {1109-2750},
  url       = {http://www.wseas.org/multimedia/journals/computers/2015/b085805-099.pdf},
  abstract  = {Dataflow Architectures have been explored extensively in the past and are now re-evaluated from a different
perspective as they can provide a viable solution to efficiently exploit multi/many core chips. Indeed, the
dataflow paradigm provides an elegant solution to distribute the computations on the available cores by starting
computations based on the availability of their input data.
In this paper, we refer to the DTA (Decoupled Threaded Architecture) \textendash which relies on a dataflow execution model
\textendash to show how Haskell could benefit from an architecture that matches the functional nature of that language. A
compilation toolchain based on the so called External Core \textendash an intermediate representation used by Haskell \textendash has
been implemented for most common data types and operations and in particular to support concurrent paradigms
(e.g. MVars, ForkIO) and Transactional Memory (TM).
We performed initial experiments to understand the efficiency of our code both against hand-coded DTA programs
and against GHC generated code for the x86 architecture. Moreover we analyzed the performance of a simple
shared-counter benchmark that is using TM in Haskell in both DTA and x86. The results of these experiments
clearly show a great potential for accelerating Haskell: for example the number of dynamically executed instructions
can be more than one order of magnitude lower in case of Haskell+DTA compared to x86. Also the number
of memory accesses is drastically reduced in DTA.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
perspective as they can provide a viable solution to efficiently exploit multi/many core chips. Indeed, the
dataflow paradigm provides an elegant solution to distribute the computations on the available cores by starting
computations based on the availability of their input data.
In this paper, we refer to the DTA (Decoupled Threaded Architecture) – which relies on a dataflow execution model
– to show how Haskell could benefit from an architecture that matches the functional nature of that language. A
compilation toolchain based on the so called External Core – an intermediate representation used by Haskell – has
been implemented for most common data types and operations and in particular to support concurrent paradigms
(e.g. MVars, ForkIO) and Transactional Memory (TM).
We performed initial experiments to understand the efficiency of our code both against hand-coded DTA programs
and against GHC generated code for the x86 architecture. Moreover we analyzed the performance of a simple
shared-counter benchmark that is using TM in Haskell in both DTA and x86. The results of these experiments
clearly show a great potential for accelerating Haskell: for example the number of dynamically executed instructions
can be more than one order of magnitude lower in case of Haskell+DTA compared to x86. Also the number
of memory accesses is drastically reduced in DTA.
Giorgi, Roberto
Accelerating Haskell on a Dataflow Architecture: a case study including Transactional Memory Proceedings Article
In: Proc. Int.l Conf. on Computer Engineering and Applications (CEA), pp. 91–100, Dubai, UAE, 2015, ISBN: 978-1-61804-276-7.
@inproceedings{Giorgi15b.bib,
  author    = {Roberto Giorgi},
  title     = {Accelerating Haskell on a Dataflow Architecture: a case study including Transactional Memory},
  booktitle = {Proc. Int.l Conf. on Computer Engineering and Applications (CEA)},
  pages     = {91--100},
  address   = {Dubai, UAE},
  year      = {2015},
  date      = {2015-02-22},
  isbn      = {978-1-61804-276-7},
  url       = {http://www.wseas.us/e-library/conferences/2015/Dubai/CEA/CEA-12.pdf},
  abstract  = {A possible direction for exploiting the computational power of multi/many core chips is to rely on a massive usage of Thread Level Parallelism (TLP). We focus on the Decoupled Threaded Architecture, a hybrid dataflow architecture which efficiently uses TLP by decoupling and scheduling threads on chip processing elements in order to provide on-chip scalable performance.
The DTA architecture currently lacks a specific mapping to high level languages. Our idea is to use a functional language to match this execution paradigm because we think it is very fit for this environment. We choose Haskell as our language and in particular one of the features we want to implement is the concurrency control based on Transactional Memory, which is fully supported in Haskell.
The main goal of this research is twofold. First, the study of a method to unite the functional paradigm of the Haskell programming language with the DTA execution paradigm. Second, the development of a Transactional Memory model for DTA architecture based on the STM (Software Transactional Memory) API.
To achieve this goals, we have implemented a tool chain, which translates simple programs from Haskell to DTA and created a first version of the Transactional Memory mechanism in DTA.
Our results show promising speedup of the Haskell based front-end for the DTA architecture.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
The DTA architecture currently lacks a specific mapping to high level languages. Our idea is to use a functional language to match this execution paradigm because we think it is very fit for this environment. We choose Haskell as our language and in particular one of the features we want to implement is the concurrency control based on Transactional Memory, which is fully supported in Haskell.
The main goal of this research is twofold. First, the study of a method to unite the functional paradigm of the Haskell programming language with the DTA execution paradigm. Second, the development of a Transactional Memory model for DTA architecture based on the STM (Software Transactional Memory) API.
To achieve this goals, we have implemented a tool chain, which translates simple programs from Haskell to DTA and created a first version of the Transactional Memory mechanism in DTA.
Our results show promising speedup of the Haskell based front-end for the DTA architecture.