% Meeting abstract on imputing missing TNM/UICC values in clinical cancer registry data.
% Typed as inproceedings so that the venue (booktitle) is actually printed by standard styles.
@inproceedings{wolters2024data,
  author    = {Wolters, Timo and Dählmann, Klaas and Hübner, Joachim and Groeneveld, Anja and Lüpkes, Christian and Hein, Andreas},
  title     = {Data Quality in Clinical Cancer Registries: Imputation of Missing Values through Related Information},
  booktitle = {Gesundheit – gemeinsam. Kooperationstagung der {GMDS}, {DGSMP}, {DGEpi}, {DGMS} und der {DGPH}},
  publisher = {Deutsche Gesellschaft für Medizinische Informatik, Biometrie und Epidemiologie},
  year      = {2024},
  month     = sep,
  doi       = {10.3205/24GMDS041},
  abstract  = {Introduction: Germany’s clinical cancer registries must ensure high quality data to fulfill their duties, e.g. to provide standardized documentation of all cancer cases, including treatment and course of disease [1], [2]. An essential part of the documentation is the TNM system which classifies the severity of a tumor using three components: the tumor size (T), lymph node metastasis (N) and distant metastasis (M). The UICC system combines those values into a single one. Both the TNM and UICC stage have a clinical (cTNM/cUICC) and a pathological (pTNM/pUICC) classification [3]. The dataset of the Agency for Clinical Cancer Data of Lower Saxony contains many tumors with erroneous or incomplete TNM and with UICC missing or merely provided as free-text. To improve data quality, we will apply five rules to correct or impute TNM and UICC at diagnosis, utilizing related information in the cancer registry data. Methods: The five rules are based on established relationships between data fields and are validated and well documented in the literature [2], [4], [5]. As additional input to the clinical/pathological TNM/UICC, we use the ICD code, topography, morphology, and metastases [3]. The rules are applied to the data available at time of diagnosis in the following order: 1. Mark TNM/UICC as 'not permitted' and abort if morphology or topography do not warrant a TNM [2]. 2. In case of benign tumors set TNM to T0N0M0 [4], else consolidate cTNM/pTNM into a single TNM, prioritizing pathological over clinical TNM components where possible [2]. 3. Correct common encoding mistakes within the calculated TNM. 4. Change the M-component from M0 to M1 if there is a synchronous metastasis [2]. 5. Calculate the UICC stage from the ICD code of the diagnosis and the calculated TNM, if applicable [5]. Results: To assess the performance of the rules, we first compared the calculated TNM/UICC with a subset of 126,823 tumors containing a complete cTNM/pTNM. The calculated TNM deviates from the cTNM/pTNM in about 5% of the cases. Moreover, 21 tumors of the subset have at least a cUICC/pUICC, which deviate in about 20% from the calculated UICC. Afterwards, we applied the rules to the full dataset of 324,231 tumors to calculate TNM and UICC. We were able to increase the number of tumors with complete TNM from 39.12% to 45.41%. For UICC, the number of tumors was increased from 0% to 45.40%. Conclusion: The calculated TNM is mostly in accordance with the subset of tumors with complete cTNM/pTNM. Regarding the UICC, the deviation between the calculated UICC and cUICC/pUICC in the subset is more prominent. However, due to the low examined number of only 21 tumors, this result should not be overemphasized. Considering the full dataset, we were able to slightly improve the number of tumors with complete TNM. Moreover, after a complete TNM was available, we were almost always able to calculate a UICC. For the future, we consider looking into the so-called additional classifications of the dataset, in which the UICC is often erroneously reported. However, these additional classifications are free-text and are thus more challenging to process.},
}

@COMMENT{Bibtex file generated on }