% Conference contribution to the 2024 "Kooperationstagung" (joint meeting) of
% GMDS, DGSMP, DGEpi, DGMS and DGPH, published by GMS.
% - The meeting name is the proceedings title, so it is stored in booktitle.
% - Accented letters in the printed fields use LaTeX escapes so the entry
%   works with classic 8-bit BibTeX as well as with Biber.
% - Braces protect the proper noun and the society acronyms from recasing.
% - The abstract is reproduced verbatim from the export.
@inproceedings{daehlmann2024towards,
  author    = {D{\"a}hlmann, Klaas and Wolters, Timo and Meisegeier, Stefan and L{\"u}pkes, Christian and Hein, Andreas},
  title     = {Towards uniform analyses of clinical cancer data in {Germany}},
  booktitle = {Gesundheit -- gemeinsam. Kooperationstagung der {GMDS}, {DGSMP}, {DGEpi}, {DGMS} und der {DGPH}},
  publisher = {German Medical Science GMS Publishing House},
  year      = {2024},
  month     = sep,
  doi       = {10.3205/24GMDS055},
  abstract  = {Introduction: Cancer registration in Germany is split into two perspectives: An epidemiological perspective reporting on the sociodemographic and geographic characteristics of cancer incidences and prevalences and, established in 2013 [1], a clinical perspective focusing on the diagnoses and treatments of cancer. Any event regarding a patient‘s vital status, diagnosis and progression, or treatment, is reported to the clinical cancer registry (CCR) of the federal state the event occurred in. Each CCR processes and consolidates these events differently, resulting in data sets that are not as uniform as one would like to believe. Nevertheless, the German Centre for Cancer Registry Data (ZfKD) at the Robert Koch Institute is tasked with nationwide analyses [1], having to integrate data sets of all CCRs. The oBDS-RKI format is the standard for data exchange between CCRs and the ZfKD [2]. Its hierarchical structure is great for data exchange but not made for analyses using the common data warehouse-based approach. This contribution therefore proposes the toolchain to create a uniform analysis model for nationwide in-depth analyses at the ZfKD. State of the art: Apart from proprietary in-house solutions of individual CCRs, three commercial systems are currently available for processing and storage of clinical cancer data in Germany: The Gießen Tumor Documentation System (GTDS) [3], a custom solution by IT-Choice Software AG [4], and the privacy-preserving record linkage tool CARELIS [5]. 
All of the above were originally designed to process individual cancer data events but not to integrate complete data sets of different CCRs. Concept: The general toolchain is based on the extract, transform, load process using two separate relational databases. During extraction, data sets from the CCRs are read and written into the staging database, which closely follows the structure of the oBDS-RKI format. During transformation, data from the staging database is processed for quality assurance, additional values are imputed, and then stored in the analysis database, which is modeled according the star schema, describing facts and dimensions. During loading, the analysis database may then be deployed to a data warehouse or used for generation of cancer reports. Implementation: The toolchain was implemented in C# and tailored towards the Microsoft software products in use at the ZfKD. The staging database was mostly generated from the XML schema defining the oBDS-RKI format. The analysis database was derived from well-established epidemiological analysis models in conjunction with clinical domain requirements from the oBDS-RKI format. To access the data warehouse, the analysis tool MUSTANG [5] is used. Lessons learned: While data sets were successfully integrated and processed, it is worth mentioning is the realization that the dimension tables of the analysis database must be regularly adapted and extended to meet the needs of the analysts, more so than the fact tables which are already well maintained due to the well documented domain model embedded in the oBDS-RKI format. In recent years, most work has been put into the data content represented by the fact tables but less into how dimensions tables must be curated and grouped for efficient analyses.},
}

@COMMENT{Bibtex file generated on }