@inproceedings {INPROC-2021-11,
   author = {Christoph Stach and Julia Br{\"a}cker and Rebecca Eichler and Corinna Giebler and Bernhard Mitschang},
   title = {{Demand-Driven Data Provisioning in Data Lakes: BARENTS - A Tailorable Data Preparation Zone}},
   booktitle = {Proceedings of the 23rd International Conference on Information Integration and Web-based Applications \& Services (iiWAS2021); Linz, Austria, November 29-December 1, 2021},
   editor = {Maria Indrawan-Santiago and Eric Pardede and Ivan Luiz Salvadori and Matthias Steinbauer and Ismail Khalil and Gabriele Kotsis},
   address = {New York, NY, United States},
   publisher = {Association for Computing Machinery (ACM)},
   institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
   pages = {1--12},
   type = {Conference Paper},
   month = {November},
   year = {2021},
   isbn = {978-1-4503-9556-4},
   doi = {10.1145/3487664.3487784},
   keywords = {data pre-processing; data transformation; knowledge modeling; ontology; data management; Data Lakes; zone model; food analysis},
   language = {English},
   cr-category = {H.2.7 Database Administration,
                   E.2 Data Storage Representations,
                   H.3.3 Information Search and Retrieval,
                   H.2.8 Database Applications},
   contact = {Send an e-mail to christoph.stach@ipvs.uni-stuttgart.de.},
   department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
   abstract = {Data has never been as significant as it is today. It can be acquired virtually
      at will on any subject. Yet, this poses new challenges for data management,
      especially in terms of storage (data is not consumed during processing, i.e.,
      the data volume keeps growing), flexibility (new applications emerge), and
      operability (analysts are not IT experts). The goal has to be demand-driven
      data provisioning, i.e., the right data must be available in the right form at
      the right time. Therefore, we introduce a tailorable data preparation zone for
      Data Lakes called BARENTS. It enables users to model in an ontology how to
      derive information from data and assign the information to use cases. The data
      is automatically processed based on this model and the refined data is made
      available to the appropriate use cases. Here, we focus on a resource-efficient
      data management strategy. BARENTS can be embedded seamlessly into established
      Big Data infrastructures, e.g., Data Lakes.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-11&engl=1}
}

@inproceedings {INPROC-2021-10,
   author = {Alejandro Villanueva Zacarias and Christian Weber and Peter Reimann and Bernhard Mitschang},
   title = {{AssistML: A Concept to Recommend ML Solutions for Predictive Use Cases}},
   booktitle = {Proceedings of the 8th IEEE International Conference on Data Science and Advanced Analytics (DSAA 2021)},
   address = {Porto, Portugal},
   publisher = {IEEE},
   institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
   type = {Conference Paper},
   month = {October},
   year = {2021},
   keywords = {Recommender Systems; Machine Learning; Meta Learning},
   language = {English},
   cr-category = {H.2.8 Database Applications},
   department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
   abstract = {The adoption of machine learning (ML) in organizations is characterized by the
      use of multiple ML software components. Citizen data scientists face practical
      requirements when building ML systems, which go beyond the known challenges of
      ML, e.g., data engineering or parameter optimization. They are expected to
      quickly identify ML system options that strike a suitable trade-off across
      multiple performance criteria. These options also need to be understandable for
      non-technical users. Addressing these practical requirements represents a
      problem for citizen data scientists with limited ML experience. This calls for
      a method to help them identify suitable ML software combinations. Related work,
      e.g., AutoML systems, is not responsive enough or cannot balance different
      performance criteria. In this paper, we introduce AssistML, a novel concept to
      recommend ML solutions, i.e., software systems with ML models, for predictive
      use cases. AssistML uses metadata of existing ML solutions to quickly identify
      and explain options for a new use case. We implement the approach and evaluate
      it with two exemplary use cases. Results show that AssistML proposes ML
      solutions that are in line with users' performance preferences in seconds.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-10&engl=1}
}

@inproceedings {INPROC-2021-09,
   author = {Eduard Wagner and Bernd Keller and Peter Reimann and Christoph Gr{\"o}ger and Dieter Spath},
   title = {{Advanced Analytics for Evaluating Critical Joining Technologies in Automotive Body Structures and Body Shops}},
   booktitle = {Proceedings of the 15th CIRP Conference on Intelligent Computation in Manufacturing Engineering (CIRP ICME)},
   publisher = {Elsevier},
   institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
   type = {Conference Paper},
   month = {July},
   year = {2021},
   keywords = {Body Shop; Data Analytics; Data Mining; Advanced Analytics; Machine Learning},
   language = {English},
   cr-category = {H.2.8 Database Applications},
   department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
   abstract = {The product development process within the automotive industry is subject to
      changing demands due to internal and external influences. These influences and
      adjustments especially affect the car body and its inherent joining technology,
      as critical stages of variant creation. However, current literature does not
      offer a suitable analytical method to identify and assess these critical
      influences. We propose an advanced analytics approach that combines data mining
      and machine learning techniques within the car body substructure. The
      evaluation at Mercedes-Benz AG shows that our approach facilitates a
      quantitative assessment of unknown interdependencies between car body modules
      and corresponding joining techniques.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-09&engl=1}
}

@inproceedings {INPROC-2021-08,
   author = {Alexander Birk and Yannick Wilhelm and Simon Dreher and Christian Flack and Peter Reimann and Christoph Gr{\"o}ger},
   title = {{A Real-World Application of Process Mining for Data-Driven Analysis of Multi-Level Interlinked Manufacturing Processes}},
   booktitle = {Procedia CIRP: Proceedings of the 54th CIRP Conference on Manufacturing Systems (CIRP CMS 2021)},
   publisher = {Elsevier},
   institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
   type = {Conference Paper},
   month = {September},
   year = {2021},
   keywords = {Process Mining; Multi-level Interlinked Manufacturing Process; Heterogeneous Data Sources; Data Integration},
   language = {English},
   cr-category = {H.2.8 Database Applications},
   department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
   abstract = {Process Mining (PM) has huge potential for manufacturing process analysis.
      However, there is little research on practical applications. We investigate a
      real-world manufacturing process of pneumatic valves. The manufacturing process
      comprises interlinked events at the superordinate business process level and at
      the subordinate machine level, making its analysis based on PM challenging. We
      show how to integrate heterogeneous data sources and give examples of how PM
      enables a deeper understanding of the manufacturing process, thereby helping to
      uncover optimization potentials. Furthermore, we discuss challenges in data
      integration and point out limitations of current PM techniques in
      manufacturing.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-08&engl=1}
}

@inproceedings {INPROC-2021-07,
   author = {Julian Ziegler and Peter Reimann and Florian Keller and Bernhard Mitschang},
   title = {{A Metadata Model to Connect Isolated Data Silos and Activities of the CAE Domain}},
   booktitle = {Proceedings of the 33rd International Conference on Advanced Information Systems Engineering (CAiSE)},
   publisher = {Springer International Publishing},
   institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
   pages = {213--228},
   type = {Conference Paper},
   month = {June},
   year = {2021},
   keywords = {Metadata Models; Graphs; Computer-aided Engineering},
   language = {English},
   cr-category = {H.2.8 Database Applications},
   department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
   abstract = {Computer-aided engineering (CAE) applications support the digital
      transformation of the manufacturing industry. They facilitate virtual product
      development and product testing via computer simulations. CAE applications
      generate vast quantities of heterogeneous data. Domain experts struggle to
      access and analyze them, because such engineering data are not sufficiently
      described with metadata. In this paper, we characterize the CAE domain and
      identify unsolved challenges for a tailored data and metadata management. For
      instance, work activities in product development projects and their
      relationships to data are not represented explicitly in current metadata
      models. We propose a metadata model that addresses all challenges and provides
      a connected view on all CAE data, metadata, and work activities of development
      projects. We validate the feasibility of our metadata model through a
      prototypical implementation and its application to a real-world use case. This
      verifies that our metadata model addresses the CAE-specific challenges and thus
      makes it easier for domain experts to exploit relevant data.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-07&engl=1}
}

@inproceedings {INPROC-2021-06,
   author = {Rebecca Eichler and Corinna Giebler and Christoph Gr{\"o}ger and Eva Hoos and Holger Schwarz and Bernhard Mitschang},
   title = {{Enterprise-Wide Metadata Management - An Industry Case on the Current State and Challenges}},
   booktitle = {24th International Conference on Business Information Systems},
   editor = {Witold Abramowicz and S{\"o}ren Auer and El{\.z}bieta Lewa{\'n}ska},
   publisher = {TIB Open Publishing},
   institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
   pages = {269--279},
   type = {Conference Paper},
   month = {July},
   year = {2021},
   doi = {10.52825/bis.v1i.47},
   language = {English},
   cr-category = {A.0 General Literature, General},
   department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
   abstract = {Metadata management is a crucial success factor for companies today, as it
      enables, for example, fully exploiting the value of data or ensuring legal
      compliance. With the emergence of new concepts, such as the data lake, and new
      objectives, such as the enterprise-wide sharing of data, metadata management
      has evolved and now poses a renewed challenge for companies. In this context,
      we interviewed a globally active manufacturer to reveal how metadata management
      is implemented in practice today, what challenges companies face, and whether
      these constitute research gaps. As an outcome, we present the
      company{\^a}s metadata management goals and their corresponding solution
      approaches and challenges. An evaluation of the challenges through a literature
      and tool review yields three research gaps, which are concerned with the
      topics: (1) metadata management for data lakes, (2) categorizations and
      compositions of metadata management tools for comprehensive metadata
      management, and (3) the use of data marketplaces as metadata-driven exchange
      platforms within an enterprise. The gaps lay the groundwork for further
      research activities in the field of metadata management and the industry case
      represents a starting point for research to realign with real-world industry
      needs.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-06&engl=1}
}

@inproceedings {INPROC-2021-05,
   author = {Corinna Giebler and Christoph Gr{\"o}ger and Eva Hoos and Rebecca Eichler and Holger Schwarz and Bernhard Mitschang},
   title = {{The Data Lake Architecture Framework}},
   booktitle = {Datenbanksysteme f{\"u}r Business, Technologie und Web (BTW 2021), 19. Fachtagung des GI-Fachbereichs Datenbanken und Informationssysteme (DBIS), 13.-17. September 2021, Dresden, Germany},
   publisher = {Gesellschaft f{\"u}r Informatik},
   institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
   pages = {351--370},
   type = {Conference Paper},
   month = {September},
   year = {2021},
   doi = {10.18420/btw2021-19},
   language = {English},
   cr-category = {H.4 Information Systems Applications},
   department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
   abstract = {During recent years, data lakes emerged as a way to manage large amounts of
      heterogeneous data for modern data analytics. Although various work on
      individual aspects of data lakes exists, there is no comprehensive data lake
      architecture yet. Concepts that describe themselves as a {\^a}data lake
      architecture{\^a} are only partial. In this work, we introduce the data lake
      architecture framework. It supports the definition of data lake architectures
      by defining nine architectural aspects, i.e., perspectives on a data lake, such
      as data storage or data modeling, and by exploring the interdependencies
      between these aspects. The included methodology helps to choose appropriate
      concepts to instantiate each aspect. To evaluate the framework, we use it to
      configure an exemplary data lake architecture for a real-world data lake
      implementation. This final assessment shows that our framework provides
      comprehensive guidance in the configuration of a data lake architecture.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-05&engl=1}
}

@inproceedings {INPROC-2021-04,
   author = {Manuel Fritz and Gang Shao and Holger Schwarz},
   title = {{Automatic Selection of Analytic Platforms with ASAP-DM}},
   booktitle = {Proceedings of the 33rd International Conference on Scientific and Statistical Database Management},
   publisher = {ACM},
   institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
   pages = {220--225},
   type = {Conference Paper},
   month = {July},
   year = {2021},
   isbn = {9781450384131},
   doi = {10.1145/3468791.3468802},
   language = {English},
   cr-category = {H.2.8 Database Applications},
   department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
   abstract = {The plethora of available analytic platforms escalates the difficulty of
      selecting the most appropriate platform for a certain data mining task and
      datasets with varying characteristics. Novice analysts in particular struggle to
      keep up with the latest technical developments. In this demo,
      we present the ASAP-DM framework. ASAP-DM is able to automatically select a
      well-performing analytic platform for a given data mining task via an intuitive
      web interface, thus especially supporting novice analysts. The take-aways for
      demo attendees are: (1) a good understanding of the challenges of various data
      mining workloads, dataset characteristics, and the effects on the selection of
      analytic platforms, (2) useful insights on how ASAP-DM internally works, and
      (3) how to benefit from ASAP-DM for exploratory data analysis.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-04&engl=1}
}

@inproceedings {INPROC-2021-03,
   author = {Dennis Tschechlov and Manuel Fritz and Holger Schwarz},
   title = {{AutoML4Clust: Efficient AutoML for Clustering Analyses}},
   booktitle = {Proceedings of the 24th International Conference on Extending Database Technology (EDBT)},
   publisher = {Online},
   institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
   pages = {1--6},
   type = {Conference Paper},
   month = {March},
   year = {2021},
   doi = {10.5441/002/EDBT.2021.32},
   language = {English},
   cr-category = {H.2.8 Database Applications},
   department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
   abstract = {Data analysis is a highly iterative process. In order to achieve valuable
      analysis results, analysts typically execute many configurations, i.e.,
      algorithms and their hyperparameter settings, based on their domain knowledge.
      While experienced analysts may be able to define small search spaces for
      promising configurations, novice analysts in particular define large search spaces
      due to their lack of domain knowledge. In the worst case, they perform an
      exhaustive search throughout the whole search space, resulting in infeasible
      runtimes. Recent advances in the research area of AutoML address this challenge
      by supporting novice analysts in the combined algorithm selection and
      hyperparameter optimization (CASH) problem for supervised learning tasks.
      However, no such systems exist for unsupervised learning tasks, such as the
      prevalent task of clustering analysis. In this work, we present our novel
      AutoML4Clust approach, which efficiently supports novice analysts regarding
      CASH for clustering analyses. To the best of our knowledge, this is the first
      thoroughly elaborated approach in this area. Our comprehensive evaluation
      unveils that AutoML4Clust significantly outperforms several existing
      approaches, as it achieves considerable speedups for the CASH problem, while
      still achieving very valuable clustering results.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-03&engl=1}
}

@inproceedings {INPROC-2021-02,
   author = {Manuel Fritz and Dennis Tschechlov and Holger Schwarz},
   title = {{Efficient Exploratory Clustering Analyses with Qualitative Approximations}},
   booktitle = {Proceedings of the 24th International Conference on Extending Database Technology (EDBT)},
   publisher = {Online},
   institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
   pages = {1--6},
   type = {Conference Paper},
   month = {March},
   year = {2021},
   doi = {10.5441/002/EDBT.2021.31},
   language = {English},
   cr-category = {H.2.8 Database Applications},
   department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
   abstract = {Clustering is a fundamental primitive for exploratory data analyses. Yet,
      finding valuable clustering results for previously unseen datasets is a pivotal
      challenge. Analysts as well as automated exploration methods often perform an
      exploratory clustering analysis, i.e., they repeatedly execute a clustering
      algorithm with varying parameters until valuable results can be found. k-center
      clustering algorithms, such as k-Means, are commonly used in such exploratory
      processes. However, in the worst case, each single execution of k-Means
      requires a super-polynomial runtime, making the overall exploratory process on
      voluminous datasets infeasible in a reasonable time frame. We propose a novel
      and efficient approach for approximating results of k-center clustering
      algorithms, thus supporting analysts in an ad-hoc exploratory process for
      valuable clustering results. Our evaluation on an Apache Spark cluster unveils
      that our approach significantly outperforms the regular execution of a k-center
      clustering algorithm by several orders of magnitude in runtime with a
      predefinable qualitative demand. Hence, our approach is a strong fit for
      clustering voluminous datasets in exploratory settings.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-02&engl=1}
}

@article {ART-2021-05,
   author = {Manuel Fritz and Michael Behringer and Dennis Tschechlov and Holger Schwarz},
   title = {{Efficient exploratory clustering analyses in large-scale exploration processes}},
   journal = {The VLDB Journal},
   editor = {Georgia Koutrika and Ren{\'e}e J. Miller and Kyuseok Shim},
   address = {Berlin, Heidelberg},
   publisher = {Springer Berlin Heidelberg},
   pages = {1--22},
   type = {Article in Journal},
   month = {November},
   year = {2021},
   doi = {10.1007/s00778-021-00716-y},
   issn = {1066-8888},
   keywords = {Exploratory clustering analysis; Exploration; Clustering; Centroid-based clustering},
   language = {English},
   cr-category = {H.3.3 Information Search and Retrieval},
   contact = {Send an e-mail to manuel.fritz@ipvs.uni-stuttgart.de.},
   department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
   abstract = {Clustering is a fundamental primitive in manifold applications. In order to
      achieve valuable results in exploratory clustering analyses, parameters of the
      clustering algorithm have to be set appropriately, which is a tremendous
      pitfall. We observe multiple challenges for large-scale exploration processes.
      On the one hand, they require specific methods to efficiently explore large
      parameter search spaces. On the other hand, they often exhibit large runtimes,
      in particular when large datasets are analyzed using clustering algorithms with
      super-polynomial runtimes, which repeatedly need to be executed within
      exploratory clustering analyses. We address these challenges as follows: First,
      we present LOG-Means and show that it provides estimates for the number of
      clusters in sublinear time regarding the defined search space, i.e., provably
      requiring fewer executions of a clustering algorithm than existing methods.
      Second, we demonstrate how to exploit fundamental characteristics of
      exploratory clustering analyses in order to significantly accelerate the
      (repetitive) execution of clustering algorithms on large datasets. Third, we
      show how these challenges can be tackled at the same time. To the best of our
      knowledge, this is the first work which simultaneously addresses the
      above-mentioned challenges. In our comprehensive evaluation, we unveil that our
      proposed methods significantly outperform state-of-the-art methods, thus
      especially supporting novice analysts for exploratory clustering analyses in
      large-scale exploration processes.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=ART-2021-05&engl=1}
}

@article {ART-2021-04,
   author = {Dennis Przytarski and Christoph Stach and Cl{\'e}mentine Gritti and Bernhard Mitschang},
   title = {{Query Processing in Blockchain Systems: Current State and Future Challenges}},
   journal = {Future Internet},
   editor = {Dino Giuli and Andrew Hudson-Smith and Luis Javier Garcia Villalba},
   publisher = {MDPI},
   volume = {14},
   number = {1},
   pages = {1--31},
   type = {Article in Journal},
   month = {December},
   year = {2021},
   issn = {1999-5903},
   doi = {10.3390/fi14010001},
   keywords = {blockchain systems; query processing; data models; data structures; block structures},
   language = {English},
   cr-category = {H.3.0 Information Storage and Retrieval General,
                   H.3.3 Information Search and Retrieval},
   contact = {Send an e-mail to Dennis.Przytarski@ipvs.uni-stuttgart.de.},
   department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
   abstract = {When, in 2008, Satoshi Nakamoto envisioned the first distributed database
      management system that relied on a cryptographically secured chain of blocks to
      store data in an immutable and tamper-resistant manner, his primary use case
      was the introduction of a digital currency. Owing to this use case, the
      blockchain system was geared towards efficient storage of data, whereas the
      processing of complex queries, such as provenance analyses of data history, was
      out of focus. The increasing use of Internet of Things technologies and the
      resulting digitization in many domains, however, have led to a plethora of
      novel use cases for a secure digital ledger. For instance, in the healthcare
      sector, blockchain systems are used for the secure storage and sharing of
      electronic health records, while the food industry applies such systems to
      enable a reliable food-chain traceability, e.g., to prove compliance with cold
      chains. In these application domains, however, querying the current state is
      not sufficient - comprehensive history queries are required instead. Due to
      these altered usage modes involving more complex query types, it is
      questionable whether today's blockchain systems are prepared for this type of
      usage and whether such queries can be processed efficiently by them. In our
      paper, we therefore investigate novel use cases for blockchain systems and
      elicit their requirements towards a data store in terms of query capabilities.
      We review the state of the art in terms of query support in blockchain systems
      and assess whether it is capable of meeting the requirements of these more
      sophisticated use cases. As a result, we identify future research challenges
      with regard to query processing in blockchain systems.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=ART-2021-04&engl=1}
}

@article {ART-2021-03,
   author = {Rebecca Eichler and Corinna Giebler and Christoph Gr{\"o}ger and Holger Schwarz and Bernhard Mitschang},
   title = {{Modeling Metadata in Data Lakes - A Generic Model}},
   journal = {Data \& Knowledge Engineering},
   publisher = {Elsevier},
   volume = {136},
   pages = {1--17},
   type = {Article in Journal},
   month = {November},
   year = {2021},
   issn = {0169-023X},
   doi = {10.1016/j.datak.2021.101931},
   keywords = {Metadata management; Metadata model; Data lake; Data management; Data lake zones; Metadata classification},
   language = {English},
   cr-category = {H.2 Database Management},
   department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
   abstract = {Data contains important knowledge and has the potential to provide new
      insights. Due to new technological developments such as the Internet of Things,
      data is generated in increasing volumes. In order to deal with these data
      volumes and extract the data's value, new concepts such as the data lake
      were created. The data lake is a data management platform designed to handle
      data at scale for analytical purposes. To prevent a data lake from becoming
      inoperable and turning into a data swamp, metadata management is needed. To
      store and handle metadata, a generic metadata model is required that can
      reflect metadata of any potential metadata management use case, e.g., data
      versioning or data lineage. However, an evaluation of existing metadata models
      shows that none of them is sufficiently generic, as their design basis is not
      suited to this purpose. In this work, we use a different design approach to
      build HANDLE, a generic metadata model for data lakes. The new metadata model
      supports the acquisition of metadata at varying levels of granularity and any
      metadata categorization, including the acquisition of both metadata that
      belongs to a specific data element and metadata that applies to a broader range
      of data. HANDLE supports the flexible integration of metadata and can reflect the
      same metadata in various ways according to the intended utilization.
      Furthermore, it is created for data lakes and therefore also supports data lake
      characteristics like data lake zones. With these capabilities HANDLE enables
      comprehensive metadata management in data lakes. HANDLE's feasibility is
      shown through the application to an exemplary access-use-case and a
      prototypical implementation. By comparing HANDLE with existing models we
      demonstrate that it can provide the same information as the other models while
      adding further capabilities needed for metadata management in data
      lakes.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=ART-2021-03&engl=1}
}

@inbook {INBOOK-2021-01,
   author = {Dimitri Petrik and Mathias Mormul and Peter Reimann and Christoph Gr{\"o}ger},
   title = {{Anforderungen f{\"u}r Zeitreihendatenbanken im industriellen IoT}},
   series = {IoT - Best Practices},
   publisher = {Springer-Verlag},
   pages = {339--377},
   type = {Article in Book},
   month = {May},
   year = {2021},
   keywords = {Zeitreihendaten; Zeitreihendatenbanken; Industrial IoT; Edge Computing; Data Lake; InfluxDB},
   language = {German},
   cr-category = {H.2.8 Database Applications},
   department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
   abstract = {The Industrial Internet of Things (IIoT) integrates information and
      communication technologies into industrial processes and extends them with
      real-time data analytics. In this context, sensor-based time series are an
      essential type of data generated in industrial manufacturing. Sensor-based
      time series data are generated at regular intervals and contain a timestamp
      in addition to the sensor value. Dedicated time series databases (TSDBs) are
      designed to store time series data efficiently. When TSDBs are deployed close
      to the machines, i.e., in the industrial edge, machine data for monitoring
      time-critical processes are available quickly due to the low latency, which
      reduces the time required for data processing. On the other hand, TSDBs can
      also be used in data lakes, i.e., scalable data platforms for storing and
      analyzing raw data, in order to enable the long-term retention of time series
      data. So far, there are no studies on TSDBs that address their selection for
      use in the industrial edge and in the data lake. Most available TSDB
      benchmarks are performance-oriented and do not consider the constraints of an
      industrial edge or a data lake. We address this gap and identify functional
      criteria for the use of TSDBs in these two environments, thus forming a
      qualitative criteria catalog. Furthermore, using InfluxDB as an example, we
      show how this catalog can be used, with the goal of supporting the systematic
      selection of a suitable TSDB for use in the edge and in the data lake.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INBOOK-2021-01&engl=1}
}

