# dvc.yaml

stages:
  # One stage is generated per `foreach` entry. Each runs `lyscripts data lyproxify`
  # on that folder's `raw.csv` with the folder's own `mapping.py`, writing `data.csv`.
  lyproxify:
    foreach:
      # `args` holds the dataset-specific flags appended at the end of the command.
      - folder: 2021-clb-oropharynx
        args: '--num-header-rows=1'
      - folder: 2023-clb-multisite
        args: '--num-header-rows=3 --drop-rows=439'
      - folder: 2023-isb-multisite
        args: '--num-header-rows=1'
    do:
      desc: >-
        Parse the `raw.csv` data using the `lyscripts` Python package
        and the `mapping.py` module to bring the original data into a
        format that the online interface LyProX understands.
      deps:
        - ${item.folder}/raw.csv
        - ${item.folder}/mapping.py
      cmd: >
        lyscripts data lyproxify
        --input-file=${item.folder}/raw.csv
        --output-file=${item.folder}/data.csv
        --mapping-file=${item.folder}/mapping.py
        ${item.args}
      outs:
        # `cache: false`: the generated file is not stored in the DVC cache.
        - ${item.folder}/data.csv:
            cache: false

  # Clean up the Vall d'Hebron raw export with a single `sed` process. Every fix is
  # its own `-e` expression, applied to each line in the order listed, so the output
  # matches what a chain of piped `sed` calls would produce. Unlike a pipeline, whose
  # exit status is only that of its last command, a failing `sed` (e.g. unreadable
  # input) now fails the stage.
  # NOTE: `\s` and `\L` are GNU sed extensions, so GNU sed is required.
  # NOTE(review): the N-stage expression is not restricted to one column; it rewrites
  # any comma-delimited field made of a digit 0-3 followed by letters A-C. Confirm
  # that no other column contains such values.
  format-dates-insert-institution-and-remove-spaces:
    desc: |
      In the Vall d'Hebron dataset, dates are formatted as `dd/mm/YYYY`, which I will
      change to `YYYY-mm-dd`.
      Also, it contains the column `dataset`, which I will convert to `institution`
      and replace all entries with the name of the hospital.
      Next, the `hpv_status` column contains spaces instead of nothing when the
      information is missing. This causes pandas to interpret all entries of that
      column as strings instead of booleans.
      Then, the A,B,C behind the N-stage is removed, as this cannot be parsed by LyProX
      at the moment.
      Furthermore, the 111th patient has `N VI` in the (MRI, ipsi, VII) column. I
      assumed this should be FALSE.
      Also, there are some occurrences of `"C05,1"` instead of `C05.1`, which I will
      replace.
      Finally, it lowercases "(FE)MALE" and "OROPHARYNX".
      All this is fixed with a single sed invocation using one expression per fix.
    deps:
      - 2025-hvh-oropharynx/raw.csv
    # Expression order matters: `FEMALE` has to be lowercased before `MALE`, because
    # `MALE` would otherwise match inside `FEMALE`. All lines of the folded scalar
    # share one indentation level so that they fold into a single shell command.
    cmd: >-
      sed
      -e 's/\([0-9]\{2\}\)\/\([0-9]\{2\}\)\/\([0-9]\{4\}\)/\3-\2-\1/g'
      -e 's/,dataset,/,institution,/g'
      -e "s/,2023_HVH_OROPHARYNX,/,Vall d'Hebron Barcelona Hospital,/g"
      -e 's/,\s,/,,/g'
      -e 's/,\([0-3]\)[A-C]*,/,\1,/g'
      -e 's/,N\sVI,/,FALSE,/g'
      -e 's/"C05,1"/C05.1/g'
      -e 's/FEMALE/\L&/g'
      -e 's/MALE/\L&/g'
      -e 's/OROPHARYNX/\L&/g'
      2025-hvh-oropharynx/raw.csv > 2025-hvh-oropharynx/data.csv
    outs:
      # `cache: false`: the generated file is not stored in the DVC cache.
      - 2025-hvh-oropharynx/data.csv:
          cache: false

  # Concatenate the three lyproxified datasets into `joined.csv` via
  # `lyscripts data join`.
  join:
    desc: >-
      Join together those datasets for which we want to create the showcase
      plot.
    # The folded scalar turns each line break into a single space, so the JSON list
    # passed to `--inputs` reaches the shell as one single-quoted argument.
    cmd: >
      lyscripts data join
      --inputs='[{"source": "2021-clb-oropharynx/data.csv"},
      {"source": "2023-isb-multisite/data.csv"},
      {"source": "2023-clb-multisite/data.csv"}]'
      --output=joined.csv
    deps:
      - 2021-clb-oropharynx/data.csv
      - 2023-isb-multisite/data.csv
      - 2023-clb-multisite/data.csv
    outs:
      - joined.csv

  # Run `lyscripts data enhance` on `joined.csv`, configured by `modalities.ly.yaml`,
  # and write `enhanced.csv`.
  enhance:
    desc: >-
      This stage fixes the super- and sub-level reporting for the LNLs and
      also creates new 'modalities' from combining existing ones, e.g. using
      the logical AND or estimating the most likely involvement based on the
      observations only.
    deps:
      - joined.csv
    params:
      # Only these two keys of the config file are tracked as parameters.
      - modalities.ly.yaml: [modalities, version]
    cmd: >
      lyscripts data enhance
      --configs=modalities.ly.yaml
      --method=max_llh
      --input.source=joined.csv
      --output-file=enhanced.csv
    outs:
      - enhanced.csv

  # Run `scripts/scenario_showcase.py` on `enhanced.csv` with the shared matplotlib
  # style sheet; the declared output is `scenario_showcase.png`.
  scenario-showcase:
    desc: >-
      Plot some involvement prevalences in the data for different scenarios,
      showcasing what is possible when data is reported in as much detail as
      we provide it.
    cmd: >
      python scripts/scenario_showcase.py
      --data enhanced.csv
      --mplstyle scripts/.mplstyle
    deps:
      - enhanced.csv
      - scripts/scenario_showcase.py
      - scripts/.mplstyle
    outs:
      - scenario_showcase.png

  # One stage per listed dataset folder: render that folder's `README.md` from its
  # `README.template`, `mapping.py` and `data.csv` with `scripts/render.py`.
  description-and-docs:
    foreach:
      - 2021-clb-oropharynx
      - 2023-clb-multisite
      - 2023-isb-multisite
    do:
      desc: >-
        Generate the description of the `data.csv` columns from the
        `COLUMN_MAP` inside the `mapping.py` file, as well as the
        documentation of all the functions used in the `mapping.py` module.
        Insert these documentations into the `README.template` and create
        the `README.md` from it.
      cmd: >
        python scripts/render.py
        -m ${item}/mapping.py
        -d ${item}/data.csv
        -t ${item}/README.template
        -o ${item}/README.md
      deps:
        - scripts/render.py
        - ${item}/README.template
        - ${item}/mapping.py
        - ${item}/data.csv
      outs:
        # `cache: false`: the rendered README is not stored in the DVC cache.
        - ${item}/README.md:
            cache: false

  # Run `scripts/t_category.py` once per listed dataset folder on its `data.csv`.
  # No outputs are declared for this stage.
  # NOTE(review): presumably plots the T-category distribution, going by the stage
  # name; confirm against the script.
  plot-t-category:
    foreach:
      - 2021-usz-oropharynx
      - 2021-clb-oropharynx
      - 2023-clb-multisite
      - 2023-isb-multisite
      - 2025-hvh-oropharynx
    do:
      cmd: >
        python scripts/t_category.py
        ${item}/data.csv
      deps:
        - scripts/t_category.py
        - ${item}/data.csv

  # Run `scripts/age_and_sex.py` once per listed dataset folder on its `data.csv`.
  # No outputs are declared for this stage.
  # NOTE(review): presumably plots age and sex distributions, going by the stage
  # name; confirm against the script.
  plot-age-and-sex:
    foreach:
      - 2021-usz-oropharynx
      - 2021-clb-oropharynx
      - 2023-clb-multisite
      - 2023-isb-multisite
      - 2025-hvh-oropharynx
    do:
      cmd: >
        python scripts/age_and_sex.py
        ${item}/data.csv
      deps:
        - scripts/age_and_sex.py
        - ${item}/data.csv

  # Run `scripts/subsite.py` once per listed dataset folder, passing its `data.csv`
  # through the `--data` option. No outputs are declared for this stage.
  # NOTE(review): presumably plots the tumor subsite distribution, going by the
  # stage name; confirm against the script.
  plot-subsite:
    foreach:
      - 2021-usz-oropharynx
      - 2021-clb-oropharynx
      - 2023-clb-multisite
      - 2023-isb-multisite
      - 2025-hvh-oropharynx
    do:
      cmd: >
        python scripts/subsite.py
        --data ${item}/data.csv
      deps:
        - scripts/subsite.py
        - ${item}/data.csv

  # Run `scripts/bar_plot.py` once per listed dataset folder on its `data.csv`.
  # No outputs are declared for this stage.
  bar-plot:
    foreach:
      - 2021-clb-oropharynx
      - 2023-clb-multisite
      - 2023-isb-multisite
    do:
      cmd: >-
        python scripts/bar_plot.py
        ${item}/data.csv
      deps:
        - scripts/bar_plot.py
        - ${item}/data.csv

  # Run `scripts/factors_diff.py` for every (first, second) combination of the
  # matrix (currently a single pair), passing a pinned commit of the
  # `rmnldwg/lydata.private` repository. No outputs are declared for this stage.
  # NOTE(review): what the script compares is not visible here; see the script.
  factors-diff:
    matrix:
      first:
        - 2021-usz-oropharynx
      second:
        - 2025-hvh-oropharynx
    deps:
      - ${item.first}/data.csv
      - ${item.second}/data.csv
      - scripts/factors_diff.py
    cmd: >
      python scripts/factors_diff.py
      --first-dataset ${item.first}
      --second-dataset ${item.second}
      --commit ee15c4cabdd160a10783fcf85ffe1cbfeb5c4826
      --repo rmnldwg/lydata.private