.. _tech-config:

Config file options
^^^^^^^^^^^^^^^^^^^

.. admonition:: Under construction

    This page is still under construction and will contain more details in the future.


The config file has the following options: see comments for explanations.
These defaults will always be used if not changed in the config file used during command invocation.

.. code-block:: yaml

    # Multiple arrays can be defined here, but arrays defined in global config saved in the cache are also available.
    #   This file will take precedence over the global config, unless the file names here can not be used.
    # Each array needs all required entries, but the `stemcnv-check make-staticdata` command will generate files 
    #   marked as auto-generatable. By default both the files and an update to a global array definition file will be
    #   written into the cache directory (unless --no-cache is used). By default this file is at 
    #   ~/.cache/stemcnv-check/global_array_definitions.yaml
    # Once the array definitions are in the global file, you need to either delete the 'array_definition' block here 
    #   or also update it with the information written out by `stemcnv-check make-staticdata` (which is the same as the 
    #   entry written into the global array definition config), since this config takes precedence over the global file.
    # If no global config was used during the `make-staticdata` run, e.g. due to the --no-cache flag, the array definitions
    #   will instead be written to a local file, e.g. 'ExampleArray_config.yaml' in the current working directory.
    #   In this case you will need to copy the contents of that file into this one, or alternatively into a global array 
    #   definition file, that can still be created.
    array_definition:
      # This 'ExampleArray' *should* be renamed to the actual array name
      ExampleArray:  
        genome_version: hg38               #REQUIRED, options: hg38/GRCh38, hg19/GRCh37
        # beadpool manifest file (.bpm) from Illumina, needs to match both the SNP array used and 
        # the desired genome version (usually filenames end with 'A1.bpm' for hg19 and 'A2.bpm' for hg38)
        bpm_manifest_file: ''              #REQUIRED
        # cluster file (.egt) from Illumina, matching the SNP array used, independent of genome version
        egt_cluster_file: ''               #REQUIRED
        # manifest file (.csv) from Illumina, matching the SNP array used and the genome version as .bpm
        csv_manifest_file: ''              #RECOMMENDED (can be left empty, but this will make most InDel probes unusable)
        # PennCNV pfb file, describing the SNPs (derived from vcf/manifest files)
        # defaults to: '{{cache}}/array_definitions/{{array_name}}/PennCNV-PFB_{{genome}}.pfb'
        penncnv_pfb_file: '__cache-default__'      #STATIC, Auto-generatable
        # PennCNV GC model file, containing GC content values, calculated by PennCNV
        # defaults to: '{{cache}}/array_definitions/{{array_name}}/PennCNV-GCmodel_{{genome}}.gcmodel'
        penncnv_GCmodel_file: '__cache-default__'  #STATIC, Auto-generatable
        # bed file with windows of very high array density, calculated by stemcnv-check
        # defaults to: '{{cache}}/array_definitions/{{array_name}}/density_{{genome}}.bed'
        array_density_file: '__cache-default__'    #STATIC, Auto-generatable
        # bed file with windows of probe gaps on the array
        # defaults to: '{{cache}}/array_definitions/{{array_name}}/gaps_{{genome}}.bed'
        array_gaps_file: '__cache-default__'       #STATIC, Auto-generatable
        
    # Folder in which raw data files (.idat) can be found
    # Important! idat files should be grouped in a subfolder per array-chip (sentrix_name)
    raw_data_folder: '' #REQUIRED, Note: gencall has a hard time following links
    # Output folder, where stemcnv-check will write results
    data_path: data     #REQUIRED
    # Output folder, where stemcnv-check will write log files
    log_path: logs      #REQUIRED

    evaluation_settings:
      # All CNV calls are given a label based on their check score, filters and reference match.
      # The labels described here are always available, but can be changed or new labels can be added
      # If no other category fits (which should not occur with default settings),
      #  then the last defined "Excluded call" label will always be assigned
      # Possible values for the "not_allowed_vcf_filters" list are: {vcf_filters}
      CNV_call_labels:
        Critical de-novo:
          minimum_check_score: 55
          not_allowed_vcf_filters: ['high_probe_dens', 'probe_gap', 'min_size', 'min_probes', 'min_density']
          reference_match: FALSE
        Reportable de-novo:
          minimum_check_score: 55
          not_allowed_vcf_filters: ['min_size', 'min_probes', 'min_density']
          reference_match: FALSE
        de-novo call:
          minimum_check_score: 0
          not_allowed_vcf_filters: ['min_size', 'min_probes', 'min_density']
          reference_match: FALSE
        Reference genotype:
          minimum_check_score: 0
          not_allowed_vcf_filters: []
          reference_match: TRUE
        Excluded call:
          minimum_check_score: 0
          not_allowed_vcf_filters: []
          reference_match: FALSE
      
      # Each sample QC measure defined in StemCNV-check is categorised as one of: {sample_labels_names}
      # The last two categories are mutually exclusive, and the last one is only used for specific measures (defined by the 'use_last_level' list).
      # In the report the color codes for the categories are: {sample_labels_values}
      # For each of the sample QC measures, two thresholds for maximum values are defined:
      # These determine the transition from the 1st to 2nd, or the 2nd to 3rd/last category.
      # If both thresholds are the same value, the 2nd category is skipped and the 1st and 3rd/last are directly adjacent.
      summary_stat_warning_levels:   
        call_rate: [0.99, 0.99] #Note: callrate uses *minimum* thresholds, not maximum
        # SNP_pairwise_distance_to_reference is the absolute GT distance between a sample and its reference
        # Note that the expected baseline difference strongly depends on the array platform
        # and may need to be adjusted. These values are based on the GSA array (~700k probes)
        SNP_pairwise_distance_to_reference: [500, 5000]
        loss_gain_log2ratio: [2, 4]
        total_calls_CNV: [10, 50]
        total_calls_LOH: [30, 75]
        reportable_calls_CNV: [5, 10]
        reportable_calls_LOH: [5, 10]
        critical_calls_CNV: [1, 1]
        critical_calls_LOH: [1, 1]
        reportable_SNVs: [5, 10]
        critical_SNVs: [1, 1]
        # CNVs/LOHs given one of these labels are not counted for QC measures
        # Possible labels include the (default) CNV_call_labels defined above, as well as additional labels
        # Default labels: {CNV_labels}
        call_count_excl_labels: ['Excluded call'] # Fully ignore calls with any of these labels 
        # These measures use the last QC category and are also bolded in the html summary table
        use_last_level:
          - call_rate
          - computed_gender
          - SNP_pairwise_distance_to_reference
          - critical_SNVs
          - critical_calls_CNV
          - critical_calls_LOH
          
      collate_output:
        # xlsx or tsv output files can be generated
        file_format: xlsx
        # These columns from the sampletable will be included in the collated summary overview table
        summary_extra_sampletable_cols: 
          - Reference_Sample
        # Selection of CNVs for the summary table based on call labels
        cnv_collate_call_selection:
          # If defined, only CNVs with one of the "whitelist" call labels will be included
          # If defined, no CNVs with one of the "blacklist" call labels will be included
          # Possible labels include the (default) CNV_call_labels defined above, as well as additional labels
          # Default labels: {CNV_labels}
          whitelist_call_label: []
          blacklist_call_label:
              - Excluded call
    
    
    global_settings:
      # By default all conda environments and apptainer images are stored to a common cache
      # This default location can also be overwritten by the '--cache-path' cmd-line flag or disabled by '--no-cache'
      cache_dir: '~/.cache/stemcnv-check'
      # Mehari transcript database file, either '__cache-default__' or a path for the bin.zst database file
      # defaults to "{{cache_dir}}/mehari-db/mehari-data-txs-{{genome}}-ensembl-{mehari_db_version}.bin.zst"
      hg19_mehari_transcript_db: '__cache-default__'
      hg38_mehari_transcript_db: '__cache-default__' 
      # Dosage sensitivity predictions, as described in Collins et al. 2022 (doi:10.1016/j.cell.2022.06.036)
      # Either '__cache-default__' or a path to the dosage sensitivity data file
      # defaults to "{{cache_dir}}/Collins_rCNV_2022.dosage_sensitivity_scores.tsv.gz"
      dosage_sensitivity_scores: '__cache-default__'
      # Fasta file for the genome sequence, either '__default-ensemble__' or a path to the genome fasta file
      # '__default-ensemble__' will download the genome fasta file from ensembl ftp servers
      # Note: fasta files can be compressed, but *only* with bgzip!
      # defaults to "{{cache_dir}}/fasta/homo_sapiens/{ensembl_release}_{{genome}}/Homo_sapiens.{{genome}}.dna.primary_assembly.fa.gz"
      hg19_genome_fasta: '__default-ensemble__'
      hg38_genome_fasta: '__default-ensemble__'
      # Gene annotation of the genome in gtf format, either '__default-gencode__' (Gencode v45 files) or a path to the gtf file
      # defaults to "{{cache_dir}}/static-data/gencode.{{genome}}.v45.gtf.gz"
      hg19_gtf_file: '__default-gencode__'
      hg38_gtf_file: '__default-gencode__'
      # tabular files with chromosome and gband details, derived from UCSC information via make-staticdata
      # defaults to "{{cache_dir}}/static-data/UCSC_{{genome}}_chromosome-info.tsv"
      hg19_genomeInfo_file: '__default-UCSC__'
      hg38_genomeInfo_file: '__default-UCSC__'
    
    
    settings:
      # Select tools to use
      # Currently implemented tools (=valid options): PennCNV, CBS
      CNV.calling.tools:
        - PennCNV
        - CBS
      probe_filter_sets:
        # Each section here defines a set of SNP probe filters
        # each set can be applied to individual or all steps of the pipeline, but using only one set is recommended
        # SNP probes filters are applied as (soft) filters to the SNP vcf file.
        # - GenTrainScore: Illumina score on clustering on probe intensities, usually stable between samples (& partially chips)
        # - GenCallScore: Illumina score on Genotype call reliability, usually somewhat stable between samples
        # - Position.duplicates: many SNP arrays have some genomic positions covered with multiple probes. Multiple data 
        #                        points at the same position are problematic for CNV calling due to signal/noise issues.
        #                        These probes can all be kept, all removed, or a single probe per position with highest GC|GT can be kept
        # - Pseudoautosomal: Handling probes in the pseudo-autosomal (PAR1, PAR2) and X-translocated (XTR) regions on the X and Y chromosomes
        #                    These regions are identical or very similar between X and Y and always behave as if diploid,
        #                    which can cause issues on haploid male samples. They can also be generally more problematic to interpret.
        # Additionally:
        # - SNPs on the Y chromosome are always (soft)filtered for female samples
        # - SNPs without properly defined REF & ALT alleles are hard-filtered (i.e. removed from the vcf). 
        #   The latter mainly occurs if the manifest csv is omitted, which causes Indel-probes to be improperly defined.
        #
        # We recommend to use these filter settings:
        standard:
          GenTrainScore: 0.15
          GenCallScore:  0.15
          Position.duplicates: highest-GenCall # keep|remove|highest-GenCall|highest-GenTrain
          Pseudoautosomal: remove-male # keep|remove|remove-male
    
      # Default filter set to use for all tools
      default_probe_filter_set: standard
    
      PennCNV:
        # Specific probe filter set for PennCNV, '_default_' uses `default_probe_filter_set`
        probe_filter_settings: '_default_'
        enable_LOH_calls: True
        # Neighbouring CNVs of the same state are merged if
        # a) the gap between them is <= 'merge.gap.absolute' [bp] or <= 'merge.gap.snps' [SNPs] or if 
        # b) they would touch/overlap after increasing their size each by 'call.extension.percent' [%]
        # Any chain of neighbouring CNVs meeting these conditions becomes a single call
        call.merging:
          merge.gap.absolute: 500
          merge.gap.snps: 10
          call.extension.percent: 60
          maximum.gap.allowed: 500000
        # vcf filters / CNV call filters are applied to calls (after merging of nearby calls) as follows:
        # [snps] >= min.snp & [length] >= min.length & [density, snps/Mb] >= min.snp.density
        filter.minprobes: 5
        filter.minlength: 1000
        filter.mindensity.Mb: 10 #snps per Mb
    
      CBS:
        # Specific probe filter set for CBS, '_default_' uses `default_probe_filter_set`
        probe_filter_settings: '_default_'
        # undo.SD split value for CBS
        undo.SD.val: 1
        # Neighbouring CNVs of the same state are merged if
        # a) the gap between them is <= 'merge.gap.absolute' [bp] or <= 'merge.gap.snps' [SNPs] or if
        # b) they would touch/overlap after increasing their size each by 'call.extension.percent' [%]
        # Any chain of neighbouring CNVs meeting these conditions becomes a single call
        call.merging:
          merge.gap.absolute: 500
          merge.gap.snps: 10
          call.extension.percent: 60
          maximum.gap.allowed: 500000
        # vcf filters / CNV call filters are applied to calls (after merging of nearby calls) as follows:
        # [snps] >= min.snp & [length] >= min.length & [density, snps/Mb] >= min.snp.density
        filter.minprobes: 5
        filter.minlength: 1000
        filter.mindensity.Mb: 10 #snps per Mb
            
        # LRR thresholds for identifying CBS segments as gain/loss on autosomes
        LRR.loss: -0.25      #CN1
        LRR.loss.large: -1.1 #CN0
        LRR.gain: 0.2        #CN3
        LRR.gain.large: 0.75 #CN4+
        # LRR thresholds for sex chromosomes
        LRR.male.XorY.loss:      -0.5   #CN0
        LRR.male.XorY.gain:       0.28  #CN2
        LRR.male.XorY.gain.large: 0.75  #CN3+
        LRR.female.X.loss:       -0.05  #CN1
        LRR.female.XX.loss:      -0.9   #CN0
        LRR.female.X.gain:        0.5   #CN3
        LRR.female.X.gain.large:  1.05  #CN4+
    
      # Values used by `stemcnv-check make-staticdata` to generate density and gap bed files
      array_attribute_summary:
        density.windows: 100000    #window size for probe density calculation (100kb)
        min.gap.size: 'auto-array' #minimum distance between 2 probes to be considered a gap. Number or 'auto-array'
    
      CNV_processing:
        call_processing:
          # SNP probe counts may change with merging of calls from different tools
          # therefore a single probe_filter_settings needs to be used as reference here
          probe_filter_settings: '_default_'
          # Prefiltering of calls is done (after merging of nearby calls) as follows:
          # vcf filters / CNV call filters are applied to calls (after merging of nearby calls) as follows:
          filter.minprobes: 5
          filter.minlength: 1000
          filter.mindensity.Mb: 10 #snps per Mb
          
          ## Calls from multiple tools are combined if they match
          # This is the minimum coverage the largest single call in a combined group needs to have.
          # keep this >=50 to prevent formation/acceptance of chains of overlapping calls
          tool.overlap.greatest.call.min.perc: 50
          # This is the minimum for the median of coverage percentages from all tools in any merged group
          tool.overlap.min.cov.sum.perc: 60
          ## Reference comparison
          min.reciprocal.coverage.with.ref: 50
          ## Probe gap flagging of calls
          # Values to determine 'call_has_probe_gap' based on coverage percentage with gap areas (from array attribute
          # summary) and log2 number of unique probe positions. The two values represent slope and intercept of
          # slope * percent_gap_area + gap_intercept ~ log2(uniq_snp_positions), calls above that line "have gaps"
          # These defaults mean that calls with larger % gap area need fewer unique probes to be flagged as "having a gap"
          # Specifically, calls with 33% gap need >=373 probes, 50% >=91 probes, 75% >= 12 probes, 85% >= 5 probes to be flagged
          gap_area.uniq_probes.rel: [-12, 12.5] # slope, intercept
          min.perc.gap_area: 0.33
          ## HighDensity flagging of calls
          # Calls that have a probe density which is higher than the top {{density.quantile.cutoff}} [%] of the array windows
          # (calculated from array attribute summary) are flagged as having "high SNP density"
          density.quantile.cutoff: 0.99

        gene_overlap:
          # These options determine which genes are read from the gtf file
          exclude_gene_type_regex: []
          # Example: ['artifact', 'IG_.*', 'TR_.*', '(un|_)processed_pseudogene']
          include_only_these_gene_types: ['lncRNA', 'miRNA', 'protein_coding']
          whitelist_hotspot_genes: True
          # These genelists are used to mark genes with high impact
          # Gene lists files are tabular (tsv) and need the following columns:
          # list_name, hotspot, mapping, call_type, check_score, description, description_doi
          # list_name, hotspot, mapping, call_type & check_score need to be filled out
          # description & description_doi will be used to display extra info in the report
          # mapping can be 'gene_name', 'gband', and 'position' and should describe the hotspot
          # call_type can be 'any', 'gain', 'loss' or 'LOH'
          stemcell_hotspot_list: '__inbuilt__/supplemental-files/genelist-stemcell-hotspots.tsv'
          cancer_gene_list: '__inbuilt__/supplemental-files/genelist-cancer-drivers.tsv'
          # also available: '__inbuilt__/supplemental-files/genelist-cancer-hotspots.tsv'
          # File path for dosage sensitivity score file is defined in global_settings
          dosage_sensitive_gene_name_fixes: '__inbuilt__/supplemental-files/gene-names-mapping-dosage-sensitivity.tsv'
    
        # Scoring for CNV and LOH calls
        # scoring combines a Size based contribution with scores for overlapping annotated regions
        Check_score_values:
          # stemcell_hotspot & cancer_gene scores need to be defined in the respective tables
          # CNVs/LOHs get the summed score of each overlapping annotated gene or region (gband/position)
          # genes are only scored _once_ per call, i.e. a gene with both stemcell_hotspot and cancer_gene match will only 
          # contribute the higher of the two annotated scores. 
          
          # Dosage sensitivity prediction is based on Collins et al. 2022 (doi:10.1016/j.cell.2022.06.036)
          # CNV loss calls overlapping a gene with pHaplo score >= threshold are scored with the 'dosage_sensitive_gene' score
          # CNV gain calls are respectively scored for the pTriplo score
          pHaplo_threshold: 0.86
          pTriplo_threshold: 0.94
          dosage_sensitive_gene: 5
          # Genes without any score from the hotspot lists or dosage sensitivity are scored as 'any_other_gene'
          any_other_gene: 0.2
          # These values determine how the base Check-Score is calculated from size & CN.
          # The formula used is: copy_factor * log(size) * log(size) - flat_decrease
          # The copy_factor changes based on the CN of the call (number of lost/gained copies)
          # copy_factor for CN 1 and 3
          single_copy_factor: 0.333
          # copy_factor for CN 0 and 4
          double_copy_factor: 0.5
          # copy_factor for CN 2 (LOH)
          neutral_copy_factor: 0.275
          flat_decrease: 15      
          # Note: male sex chromosomes have baseline CN=1, and generally use the 1-copy factor unless CN>2
        # This file contains precision estimations based on benchmarking data
        precision_estimation_file: '__inbuilt__/supplemental-files/precision_estimates.tsv'
    
      SNV_analysis:
        probe_filter_settings: "_default_" # "_default_", Filterset name, or "none". Note: None will even include chrY on female samples
        snv_hotspot_table: '__inbuilt__/supplemental-files/SNV-stemcell-hotspots.tsv'
        flag_GenCall_minimum: 0.2
        # Only variants matching at least one of the following criteria are included in SNV analysis 
        # This means that e.g. intron or synonymous variants in SNV hotspot genes are NOT included in the output files
        # Underlying annotations are derived from mehari, specifically from the terms defined by http://www.sequenceontology.org 
        variant_selection:
          Impact: [HIGH, MODERATE]      
          Annotation_regex: ~
          # This will include ALL variants in any ROI region, regardless of annotation
          include_all_ROI_overlaps: TRUE
        # List of SNV categories that are considered critical or reportable.
        # Allowed values: {SNV_category_labels}
        critical_SNV:
          - 'hotspot-match'
        reportable_SNV:
          - 'hotspot-gene'
          - 'protein-ablation'
        # SNVs that can fully remove protein function are summarised in the "protein-ablation" category (generally HIGH impact)
        protein_ablation_annotations:
          # The 'HIGH' impact category generally contains these variant annotations/groups:      
          # - stop_gained
          # - start_lost
          # - stop_lost
          # - frameshift_variant
          # - splice_acceptor_variant
          # - splice_donor_variant  
          Impact: ['HIGH']
          Annotation_regex: ~
        # SNVs impacting protein sequence, but not generally removing protein function are summarised as "protein-changing"
        protein_change_annotations:
          Impact: []
          # The missense_variant and (conservative|disruptive)_inframe_(deletion|insertion) annotations are in the 'MODERATE' impact category
          Annotation_regex: 'missense_variant|inframe'
        
        # These settings determine which samples are used for the SNP clustering & dendrogram
        SNP_clustering:
            # Sample-IDs from the sample table, these will be added to the clustering of every sample
            sample_ids: []
            # Column names of the sample table, these are assumed to contain (comma separated) Sample-IDs
            id_columns: []
            # Column names of the sample table, Samples are used for clustering if they have the same value in any of these columns
            match_columns: ['Chip_Name', 'Sample_Group']
            # Maximum number of samples to include in the dendrogram. Note: calculation of clustering (done per sample)
            # takes more time for each additional sample included
            max_number_samples: 20
    
      vcf_output:
        # Which chromosome style to use in the vcf file ("1" vs "chr1")  
        chrom_style: 'UCSC' # "keep-original", UCSC, or NCBI / Ensembl
    

    reports:
      # Any number of reports can be defined, the default is 'StemCNV-check-report'
      # All reports inherit from the default settings, but can overwrite specific parts
      StemCNV-check-report:
        file_type: 'html'         #REQUIRED

    #   Any number of reports can be defined, the default is 'StemCNV-check-report'.
    #   file_type (html or pdf) needs to be defined for each one.
    #   Note: report generation is optimised for html format, and pdf reports may have issues, especially with larger tables
    #
    #  StemCNV-check-full-report:
    #    file_type: 'html'         #REQUIRED
    #    call.data.and.plots:
    #      _default_:
    #        # How many plots to show at least, this strongly influences the filesize of the report
    #        min_number_plots: 100
    #        include.gene.table.details: 'All'
    
    #  # These reduced settings work reasonably well for pdf
    #  StemCNV-check-report-pdf:
    #    file_type: 'pdf'          #REQUIRED
    #    exclude_sections: [ QC.settings, QC.PennCNV, QC.CBS, QC.GenCall ]
    #    call.data.and.plots:
    #      include.call.table: FALSE
    
      #These are the default settings from which all reports inherit
      _default_:
    
        # individual sections can be included (whitelist) or excluded (blacklist) from report.
        # Default is special '__all__' for include, but a list of specific sections can also be used
        include_sections: '__all__'
        exclude_sections: []
        # Available sections (Note that tool specific ones also depend on pipeline settings):
        # {report_sections}
    
        # (Additional) List of columns from the sample_table that are included in the "Sample Information" table
        sample.info.extra.cols: ['Chip_Name', 'Chip_Pos']
        
        # CNV calls can, based on the assigned call label, be:
        # - fully removed from the report, incl all tables and plots (this option)
        # - selected for the de-novo & reference genotype CNV tables (following section)
        # - selected for the genome_overview plots (last section)
        # Possible labels include the (default) CNV_call_labels defined above, as well as additional labels
        # Default labels: {CNV_labels}
        # Call labels of CNV calls that are fully removed from the report
        CNV_call_labels_removed: 
          - 'Excluded call'
          
        call.data.and.plots:
          # Default and specific settings for each section of plots (denovo, reference_gt, regions_of_interest)
          # The specific sections inherit from the default, but can overwrite all or individual values
          _default_: &default_plot_settings
            # How many plots to fully incorporate into the report at minimum
            # Note: Plots are still generated for all CNV calls, but any exceeding this number will only be saved
            # separate from the html report and linked from there. Increasing this number increases the report file size.
            min_number_plots: 20
            # Calls with one of these call labels will be included regardless of the minimum number
            always_include_CNVs: []
            # Include plots, table of individual calls and table of genes
            include.plot: True
            include.hotspot.table: True
            include.gene.table.details: 'Call' # Choice of: None|Call|All
            # Minimum relative size of (each) flanking region compared to call
            plot.flanking.region.relative: 2
            # Minimum size of total plot region
            plot.region.minsize: 2000000
          denovo:
            <<: *default_plot_settings
            # Call labels for the 'de-novo CNV calls' table
            call_labels_include:
              - 'Critical de-novo'
              - 'Reportable de-novo'
              - 'de-novo call'
            always_include_CNVs:
              - 'Critical de-novo'
              - 'Reportable de-novo' 
          reference_gt:
            <<: *default_plot_settings
            # Call labels for the 'Reference genotype CNV calls' table
            call_labels_include:
              - Reference genotype
          regions_of_interest:
            <<: *default_plot_settings
            plot.region.minsize: 100000
    
    #     # Report settings for the SNV analysis block
    #     SNV_analysis:
    #       # Which critical SNV reasons should use red (instead of orange) highlights  
    #       SNV_categories_with_red_highlight:
    #         - 'ROI-match'
    #         - 'hotspot-match'
    
        # Settings for the Sample comparison / SNP dendrogram sections
        SNP_comparison:
          # Selection of sample table columns, to determine shape and color of the samples in the dendrogram.
          # Note: You can also use any column from the sample table, incl optional ones you added yourself
          dendrogram.color.by: 'Chip_Name'
          dendrogram.shape.by: 'Sample_Group'
          
        genome_overview:
            # Call labels for the overview plots
            call_labels_overview:
              - 'Critical de-novo'
              - 'Reportable de-novo'
              - 'de-novo call'
              - 'Reference genotype'
            # Include the reference sample in the genome overview plots
            show_reference: True

    # These constraints define which sample_ids, sentrix_pos (Chip_Pos) and sentrix_name (Chip_Name) are valid
    # Edit at your own risk!: if sample_ids do not match this constraint, they will not be run and errors might not be intuitive
    wildcard_constraints:
      sample_id: "[a-zA-Z0-9-_]+"
      sentrix_pos: 'R[0-9]{2}C[0-9]{2}'
      sentrix_name: '[0-9]+'
    
    # These settings are used to define the resources snakemake allocates for each tool
    tools:
      _default_:
        threads: 1
        memory: 2000 # "2000MB"
        runtime: "1h"
        partition: 'medium'
      GenCall:
        threads: 4
        memory: 8000 # "8000MB"
        runtime: "4h"
    #   gtc2vcf:
    #     memory: 1000 # "2000MB"
    #     runtime: "1h"
    #   filter_snp_vcf:
    #     memory: 1000 # "2000MB"
    #     runtime: "1h"
    #   mehari:
    #     memory: 1000 # "2000MB"
    #     runtime: "1h"
      CBS:
        memory: 4000 # "4000MB"
        runtime: "30m"  
      CNV.process:
        memory: 4000 # "4000MB"
        runtime: "30m"
      PennCNV:
        memory: 1000 # "1000MB"
        runtime: "30m"
      SNV_analysis:
        threads: 2
        memory: 20000 # "20000MB"
        runtime: "4h"
      knitr:
        memory: 10000 # "10000MB"
        runtime: "1h"